From a02e30a36b1a2609e814e604c648ed4a6b7a7bf3 Mon Sep 17 00:00:00 2001
From: limengxuan <391013634@qq.com>
Date: Tue, 18 Feb 2025 17:22:54 +0800
Subject: [PATCH] vgpu: init device config once in NewGPUDevices, decode MIG mode

Signed-off-by: limengxuan <391013634@qq.com>
---
 pkg/scheduler/api/devices/config/config.go    | 49 ++++++++++---------
 .../api/devices/nvidia/vgpu/device_info.go    |  9 +++-
 .../api/devices/nvidia/vgpu/utils.go          | 26 ++++++++++
 pkg/scheduler/api/node_info.go                |  2 -
 .../plugins/deviceshare/deviceshare.go        | 18 +------
 5 files changed, 62 insertions(+), 42 deletions(-)

diff --git a/pkg/scheduler/api/devices/config/config.go b/pkg/scheduler/api/devices/config/config.go
index 2567c4fff4..bd5c2b3179 100644
--- a/pkg/scheduler/api/devices/config/config.go
+++ b/pkg/scheduler/api/devices/config/config.go
@@ -19,11 +19,13 @@ package config
 import (
 	"context"
 	"errors"
+	"sync"
 
 	"gopkg.in/yaml.v2"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/klog/v2"
+	"volcano.sh/volcano/pkg/scheduler/api/devices"
 )
 
 type Config struct {
@@ -32,6 +34,7 @@ type Config struct {
 
 var (
 	configs *Config
+	once    sync.Once
 )
 
 func GetConfig() *Config {
@@ -58,27 +61,29 @@ func LoadConfigFromCM(kubeClient kubernetes.Interface, cmName string) (*Config,
 	return &yamlData, nil
 }
 
-func InitDevicesConfig(kubeClient kubernetes.Interface, cmName string) {
-	var err error
-	if len(cmName) == 0 {
-		cmName = "volcano-vgpu-device-config"
-	}
-	configs, err = LoadConfigFromCM(kubeClient, cmName)
-	if err != nil {
-		configs = &Config{
-			NvidiaConfig: NvidiaConfig{
-				ResourceCountName:   "volcano.sh/vgpu-number",
-				ResourceCoreName:    "volcano.sh/vgpu-cores",
-				ResourceMemoryName:  "volcano.sh/vgpu-memory",
-				DefaultMemory:       0,
-				DefaultCores:        0,
-				DefaultGPUNum:       1,
-				DeviceSplitCount:    10,
-				DeviceMemoryScaling: 1,
-				DeviceCoreScaling:   1,
-				DisableCoreLimit:    false,
-			},
+func InitDevicesConfig(cmName string) {
+	once.Do(func() {
+		var err error
+		if len(cmName) == 0 {
+			cmName = "volcano-vgpu-device-config"
 		}
-	}
-	klog.V(3).InfoS("Initializing volcano vgpu config", "device-configs", configs)
+		configs, err = LoadConfigFromCM(devices.GetClient(), cmName)
+		if err != nil {
+			configs = &Config{
+				NvidiaConfig: NvidiaConfig{
+					ResourceCountName:   "volcano.sh/vgpu-number",
+					ResourceCoreName:    "volcano.sh/vgpu-cores",
+					ResourceMemoryName:  "volcano.sh/vgpu-memory",
+					DefaultMemory:       0,
+					DefaultCores:        0,
+					DefaultGPUNum:       1,
+					DeviceSplitCount:    10,
+					DeviceMemoryScaling: 1,
+					DeviceCoreScaling:   1,
+					DisableCoreLimit:    false,
+				},
+			}
+		}
+		klog.V(3).InfoS("Initializing volcano vgpu config", "device-configs", configs)
+	})
 }
diff --git a/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go b/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go
index 015a53969d..a8efdaa3ca 100644
--- a/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go
+++ b/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go
@@ -18,6 +18,7 @@ package vgpu
 
 import (
 	"fmt"
+	"os"
 	"strconv"
 	"strings"
 	"time"
@@ -102,12 +103,18 @@ func NewGPUDevices(name string, node *v1.Node) *GPUDevices {
 	if !ok {
 		return nil
 	}
+	devicecm := os.Getenv("VOLCANO_DEVICE_CM")
+	if len(devicecm) == 0 {
+		devicecm = "volcano-vgpu-device-config"
+	}
+	deviceconfig.InitDevicesConfig(devicecm)
+
 	nodedevices := decodeNodeDevices(name, annos)
 	if (nodedevices == nil) || len(nodedevices.Device) == 0 {
 		return nil
 	}
 	for _, val := range nodedevices.Device {
-		klog.V(4).InfoS("Nvidia Device registered name", "name", nodedevices.Name, "val", *val)
+		klog.V(3).InfoS("Nvidia Device registered name", "name", nodedevices.Name, "val", *val)
 		ResetDeviceMetrics(val.UUID, node.Name, float64(val.Memory))
 	}
 
diff --git a/pkg/scheduler/api/devices/nvidia/vgpu/utils.go b/pkg/scheduler/api/devices/nvidia/vgpu/utils.go
index ed8a90e9e7..62056810ab 100644
--- a/pkg/scheduler/api/devices/nvidia/vgpu/utils.go
+++ b/pkg/scheduler/api/devices/nvidia/vgpu/utils.go
@@ -19,6 +19,7 @@ package vgpu
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"strconv"
 	"strings"
@@ -57,6 +58,21 @@ func patchNodeAnnotations(node *v1.Node, annotations map[string]string) error {
 	return err
 }
 
+// extractGeoMetriyFromType returns the Geometries of the first entry in the
+// configured NvidiaConfig.MigGeometriesList that lists a model name occurring
+// as a substring of t. When no entry matches it returns an empty slice and a
+// "mig type not found" error.
+func extractGeoMetriyFromType(t string) ([]config.Geometry, error) {
+	for _, entry := range config.GetConfig().NvidiaConfig.MigGeometriesList {
+		for _, model := range entry.Models {
+			if strings.Contains(t, model) {
+				return entry.Geometries, nil
+			}
+		}
+	}
+	return []config.Geometry{}, errors.New("mig type not found")
+}
+
 func decodeNodeDevices(name string, str string) *GPUDevices {
 	if !strings.Contains(str, ":") {
 		return nil
@@ -86,6 +102,16 @@ func decodeNodeDevices(name string, str string) *GPUDevices {
 				MigTemplate: []config.Geometry{},
 				MigUsage:    config.MigInUse{},
 			}
+			if len(items) > 5 {
+				i.Mode = items[5]
+				if i.Mode == "mig" {
+					var err error
+					i.MigTemplate, err = extractGeoMetriyFromType(i.Type)
+					if err != nil {
+						i.Mode = "hami-core"
+					}
+				}
+			}
 			retval.Device[index] = &i
 		}
 	}
diff --git a/pkg/scheduler/api/node_info.go b/pkg/scheduler/api/node_info.go
index f1fdbdfc32..bdaad2d9fb 100644
--- a/pkg/scheduler/api/node_info.go
+++ b/pkg/scheduler/api/node_info.go
@@ -26,7 +26,6 @@ import (
 	k8sframework "k8s.io/kubernetes/pkg/scheduler/framework"
 
 	"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
-
 	"volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/gpushare"
 	"volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu"
 )
@@ -336,7 +335,6 @@ func (ni *NodeInfo) setNodeOthersResource(node *v1.Node) {
 		klog.Warningf("received argument of nil node, no need to set other resources for %s", ni.Name)
 		return
 	}
-
 	ni.Others[GPUSharingDevice] = gpushare.NewGPUDevices(ni.Name, node)
 	ni.Others[vgpu.DeviceName] = vgpu.NewGPUDevices(ni.Name, node)
 	IgnoredDevicesList.Set(
diff --git a/pkg/scheduler/plugins/deviceshare/deviceshare.go b/pkg/scheduler/plugins/deviceshare/deviceshare.go
index 7a32a9a691..ba688c8c50 100644
--- a/pkg/scheduler/plugins/deviceshare/deviceshare.go
+++ b/pkg/scheduler/plugins/deviceshare/deviceshare.go
@@ -20,7 +20,6 @@ import (
 	"context"
 	"math"
 	"reflect"
-	"sync"
 
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/klog/v2"
@@ -28,7 +27,6 @@ import (
 
 	"volcano.sh/volcano/pkg/scheduler/api"
 	"volcano.sh/volcano/pkg/scheduler/api/devices"
-	deviceconfig "volcano.sh/volcano/pkg/scheduler/api/devices/config"
 	"volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/gpushare"
 	"volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu"
 	"volcano.sh/volcano/pkg/scheduler/framework"
@@ -46,8 +44,6 @@ const (
 
 	SchedulePolicyArgument = "deviceshare.SchedulePolicy"
 	ScheduleWeight         = "deviceshare.ScheduleWeight"
-
-	deviceConfigMapName = "deviceshare.configMapName"
 )
 
 type deviceSharePlugin struct {
@@ -55,13 +51,11 @@ type deviceSharePlugin struct {
 	pluginArguments framework.Arguments
 	schedulePolicy  string
 	scheduleWeight  int
-	deviceCM        string
-	once            sync.Once
 }
 
 // New return priority plugin
 func New(arguments framework.Arguments) framework.Plugin {
-	dsp := &deviceSharePlugin{pluginArguments: arguments, schedulePolicy: "", scheduleWeight: 0, once: sync.Once{}}
+	dsp := &deviceSharePlugin{pluginArguments: arguments, schedulePolicy: "", scheduleWeight: 0}
 	enablePredicate(dsp)
 	return dsp
 }
@@ -87,13 +81,6 @@ func enablePredicate(dsp *deviceSharePlugin) {
 		dsp.schedulePolicy = args[SchedulePolicyArgument].(string)
 	}
 
-	_, ok = args[deviceConfigMapName]
-	if ok {
-		dsp.deviceCM = args[deviceConfigMapName].(string)
-	} else {
-		dsp.deviceCM = ""
-	}
-
 	args.GetInt(&dsp.scheduleWeight, ScheduleWeight)
 
 	if gpushare.GpuSharingEnable && gpushare.GpuNumberEnable {
@@ -127,9 +114,6 @@ func getDeviceScore(ctx context.Context, pod *v1.Pod, node *api.NodeInfo, schedu
 func (dp *deviceSharePlugin) OnSessionOpen(ssn *framework.Session) {
 	// Register event handlers to update task info in PodLister & nodeMap
 	ssn.AddPredicateFn(dp.Name(), func(task *api.TaskInfo, node *api.NodeInfo) error {
-		dp.once.Do(func() {
-			deviceconfig.InitDevicesConfig(ssn.KubeClient(), dp.deviceCM)
-		})
 		predicateStatus := make([]*api.Status, 0)
 		// Check PredicateWithCache
 		for _, val := range api.RegisteredDevices {