From a02e30a36b1a2609e814e604c648ed4a6b7a7bf3 Mon Sep 17 00:00:00 2001 From: limengxuan <391013634@qq.com> Date: Tue, 18 Feb 2025 17:22:54 +0800 Subject: [PATCH] update Signed-off-by: limengxuan <391013634@qq.com> --- pkg/scheduler/api/devices/config/config.go | 49 ++++++++++--------- .../api/devices/nvidia/vgpu/device_info.go | 9 +++- .../api/devices/nvidia/vgpu/utils.go | 26 ++++++++++ pkg/scheduler/api/node_info.go | 2 - .../plugins/deviceshare/deviceshare.go | 18 +------ 5 files changed, 62 insertions(+), 42 deletions(-) diff --git a/pkg/scheduler/api/devices/config/config.go b/pkg/scheduler/api/devices/config/config.go index 2567c4fff4..bd5c2b3179 100644 --- a/pkg/scheduler/api/devices/config/config.go +++ b/pkg/scheduler/api/devices/config/config.go @@ -19,11 +19,13 @@ package config import ( "context" "errors" + "sync" "gopkg.in/yaml.v2" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" + "volcano.sh/volcano/pkg/scheduler/api/devices" ) type Config struct { @@ -32,6 +34,7 @@ type Config struct { var ( configs *Config + once sync.Once ) func GetConfig() *Config { @@ -58,27 +61,29 @@ func LoadConfigFromCM(kubeClient kubernetes.Interface, cmName string) (*Config, return &yamlData, nil } -func InitDevicesConfig(kubeClient kubernetes.Interface, cmName string) { - var err error - if len(cmName) == 0 { - cmName = "volcano-vgpu-device-config" - } - configs, err = LoadConfigFromCM(kubeClient, cmName) - if err != nil { - configs = &Config{ - NvidiaConfig: NvidiaConfig{ - ResourceCountName: "volcano.sh/vgpu-number", - ResourceCoreName: "volcano.sh/vgpu-cores", - ResourceMemoryName: "volcano.sh/vgpu-memory", - DefaultMemory: 0, - DefaultCores: 0, - DefaultGPUNum: 1, - DeviceSplitCount: 10, - DeviceMemoryScaling: 1, - DeviceCoreScaling: 1, - DisableCoreLimit: false, - }, +func InitDevicesConfig(cmName string) { + once.Do(func() { + var err error + if len(cmName) == 0 { + cmName = "volcano-vgpu-device-config" } - } - klog.V(3).InfoS("Initializing volcano vgpu config", "device-configs", configs) + configs, err = LoadConfigFromCM(devices.GetClient(), cmName) + if err != nil { + configs = &Config{ + NvidiaConfig: NvidiaConfig{ + ResourceCountName: "volcano.sh/vgpu-number", + ResourceCoreName: "volcano.sh/vgpu-cores", + ResourceMemoryName: "volcano.sh/vgpu-memory", + DefaultMemory: 0, + DefaultCores: 0, + DefaultGPUNum: 1, + DeviceSplitCount: 10, + DeviceMemoryScaling: 1, + DeviceCoreScaling: 1, + DisableCoreLimit: false, + }, + } + } + klog.V(3).InfoS("-=-=-=-=-=-=-=-=-=-=-=-Initializing volcano vgpu config", "device-configs", configs) + }) } diff --git a/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go b/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go index 015a53969d..a8efdaa3ca 100644 --- a/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go +++ b/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go @@ -18,6 +18,7 @@ package vgpu import ( "fmt" + "os" "strconv" "strings" "time" @@ -102,12 +103,18 @@ func NewGPUDevices(name string, node *v1.Node) *GPUDevices { if !ok { return nil } + devicecm := os.Getenv("VOLCANO_DEVICE_CM") + if len(devicecm) == 0 { + devicecm = "volcano-vgpu-device-config" + } + deviceconfig.InitDevicesConfig(devicecm) + nodedevices := decodeNodeDevices(name, annos) if (nodedevices == nil) || len(nodedevices.Device) == 0 { return nil } for _, val := range nodedevices.Device { - klog.V(4).InfoS("Nvidia Device registered name", "name", nodedevices.Name, "val", *val) + klog.V(3).InfoS("Nvidia Device registered name", "name", nodedevices.Name, "val", *val) ResetDeviceMetrics(val.UUID, node.Name, float64(val.Memory)) } diff --git a/pkg/scheduler/api/devices/nvidia/vgpu/utils.go b/pkg/scheduler/api/devices/nvidia/vgpu/utils.go index ed8a90e9e7..62056810ab 100644 --- a/pkg/scheduler/api/devices/nvidia/vgpu/utils.go +++ b/pkg/scheduler/api/devices/nvidia/vgpu/utils.go @@ -19,6 +19,7 @@ package vgpu import ( "context" "encoding/json" + "errors" "fmt" "strconv" "strings" @@ -57,6 +58,21 @@ func patchNodeAnnotations(node *v1.Node, annotations map[string]string) error { return err } +func extractGeoMetriyFromType(t string) ([]config.Geometry, error) { + for _, val := range config.GetConfig().NvidiaConfig.MigGeometriesList { + found := false + for _, migDevType := range val.Models { + if strings.Contains(t, migDevType) { + found = true + } + } + if found { + return val.Geometries, nil + } + } + return []config.Geometry{}, errors.New("mig type not found") +} + func decodeNodeDevices(name string, str string) *GPUDevices { if !strings.Contains(str, ":") { return nil @@ -86,6 +102,16 @@ func decodeNodeDevices(name string, str string) *GPUDevices { MigTemplate: []config.Geometry{}, MigUsage: config.MigInUse{}, } + if len(items) > 5 { + i.Mode = items[5] + if i.Mode == "mig" { + var err error + i.MigTemplate, err = extractGeoMetriyFromType(i.Type) + if err != nil { + i.Mode = "hami-core" + } + } + } retval.Device[index] = &i } } diff --git a/pkg/scheduler/api/node_info.go b/pkg/scheduler/api/node_info.go index f1fdbdfc32..bdaad2d9fb 100644 --- a/pkg/scheduler/api/node_info.go +++ b/pkg/scheduler/api/node_info.go @@ -26,7 +26,6 @@ import ( k8sframework "k8s.io/kubernetes/pkg/scheduler/framework" "volcano.sh/apis/pkg/apis/scheduling/v1beta1" - "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/gpushare" "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu" ) @@ -336,7 +335,6 @@ func (ni *NodeInfo) setNodeOthersResource(node *v1.Node) { klog.Warningf("received argument of nil node, no need to set other resources for %s", ni.Name) return } - ni.Others[GPUSharingDevice] = gpushare.NewGPUDevices(ni.Name, node) ni.Others[vgpu.DeviceName] = vgpu.NewGPUDevices(ni.Name, node) IgnoredDevicesList.Set( diff --git a/pkg/scheduler/plugins/deviceshare/deviceshare.go b/pkg/scheduler/plugins/deviceshare/deviceshare.go index 7a32a9a691..ba688c8c50 100644 --- a/pkg/scheduler/plugins/deviceshare/deviceshare.go +++ b/pkg/scheduler/plugins/deviceshare/deviceshare.go @@ -20,7 +20,6 @@ import ( "context" "math" "reflect" - "sync" v1 "k8s.io/api/core/v1" "k8s.io/klog/v2" @@ -28,7 +27,6 @@ import ( "volcano.sh/volcano/pkg/scheduler/api" "volcano.sh/volcano/pkg/scheduler/api/devices" - deviceconfig "volcano.sh/volcano/pkg/scheduler/api/devices/config" "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/gpushare" "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu" "volcano.sh/volcano/pkg/scheduler/framework" @@ -46,8 +44,6 @@ const ( SchedulePolicyArgument = "deviceshare.SchedulePolicy" ScheduleWeight = "deviceshare.ScheduleWeight" - - deviceConfigMapName = "deviceshare.configMapName" ) type deviceSharePlugin struct { @@ -55,13 +51,11 @@ type deviceSharePlugin struct { pluginArguments framework.Arguments schedulePolicy string scheduleWeight int - deviceCM string - once sync.Once } // New return priority plugin func New(arguments framework.Arguments) framework.Plugin { - dsp := &deviceSharePlugin{pluginArguments: arguments, schedulePolicy: "", scheduleWeight: 0, once: sync.Once{}} + dsp := &deviceSharePlugin{pluginArguments: arguments, schedulePolicy: "", scheduleWeight: 0} enablePredicate(dsp) return dsp } @@ -87,13 +81,6 @@ func enablePredicate(dsp *deviceSharePlugin) { dsp.schedulePolicy = args[SchedulePolicyArgument].(string) } - _, ok = args[deviceConfigMapName] - if ok { - dsp.deviceCM = args[deviceConfigMapName].(string) - } else { - dsp.deviceCM = "" - } - args.GetInt(&dsp.scheduleWeight, ScheduleWeight) if gpushare.GpuSharingEnable && gpushare.GpuNumberEnable { @@ -127,9 +114,6 @@ func getDeviceScore(ctx context.Context, pod *v1.Pod, node *api.NodeInfo, schedu func (dp *deviceSharePlugin) OnSessionOpen(ssn *framework.Session) { // Register event handlers to update task info in PodLister & nodeMap ssn.AddPredicateFn(dp.Name(), func(task *api.TaskInfo, node *api.NodeInfo) error { - dp.once.Do(func() { - deviceconfig.InitDevicesConfig(ssn.KubeClient(), dp.deviceCM) - }) predicateStatus := make([]*api.Status, 0) // Check PredicateWithCache for _, val := range api.RegisteredDevices {