Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
Signed-off-by: limengxuan <391013634@qq.com>
  • Loading branch information
archlitchi committed Feb 18, 2025
1 parent 2eb2d41 commit a02e30a
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 42 deletions.
49 changes: 27 additions & 22 deletions pkg/scheduler/api/devices/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,13 @@ package config
import (
"context"
"errors"
"sync"

"gopkg.in/yaml.v2"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
"volcano.sh/volcano/pkg/scheduler/api/devices"
)

type Config struct {
Expand All @@ -32,6 +34,7 @@ type Config struct {

var (
configs *Config
once sync.Once
)

func GetConfig() *Config {
Expand All @@ -58,27 +61,29 @@ func LoadConfigFromCM(kubeClient kubernetes.Interface, cmName string) (*Config,
return &yamlData, nil
}

func InitDevicesConfig(kubeClient kubernetes.Interface, cmName string) {
var err error
if len(cmName) == 0 {
cmName = "volcano-vgpu-device-config"
}
configs, err = LoadConfigFromCM(kubeClient, cmName)
if err != nil {
configs = &Config{
NvidiaConfig: NvidiaConfig{
ResourceCountName: "volcano.sh/vgpu-number",
ResourceCoreName: "volcano.sh/vgpu-cores",
ResourceMemoryName: "volcano.sh/vgpu-memory",
DefaultMemory: 0,
DefaultCores: 0,
DefaultGPUNum: 1,
DeviceSplitCount: 10,
DeviceMemoryScaling: 1,
DeviceCoreScaling: 1,
DisableCoreLimit: false,
},
func InitDevicesConfig(cmName string) {
once.Do(func() {
var err error
if len(cmName) == 0 {
cmName = "volcano-vgpu-device-config"
}
}
klog.V(3).InfoS("Initializing volcano vgpu config", "device-configs", configs)
configs, err = LoadConfigFromCM(devices.GetClient(), cmName)
if err != nil {
configs = &Config{
NvidiaConfig: NvidiaConfig{
ResourceCountName: "volcano.sh/vgpu-number",
ResourceCoreName: "volcano.sh/vgpu-cores",
ResourceMemoryName: "volcano.sh/vgpu-memory",
DefaultMemory: 0,
DefaultCores: 0,
DefaultGPUNum: 1,
DeviceSplitCount: 10,
DeviceMemoryScaling: 1,
DeviceCoreScaling: 1,
DisableCoreLimit: false,
},
}
}
klog.V(3).InfoS("-=-=-=-=-=-=-=-=-=-=-=-Initializing volcano vgpu config", "device-configs", configs)
})
}
9 changes: 8 additions & 1 deletion pkg/scheduler/api/devices/nvidia/vgpu/device_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package vgpu

import (
"fmt"
"os"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -102,12 +103,18 @@ func NewGPUDevices(name string, node *v1.Node) *GPUDevices {
if !ok {
return nil
}
devicecm := os.Getenv("VOLCANO_DEVICE_CM")
if len(devicecm) == 0 {
devicecm = "volcano-vgpu-device-config"
}
deviceconfig.InitDevicesConfig(devicecm)

nodedevices := decodeNodeDevices(name, annos)
if (nodedevices == nil) || len(nodedevices.Device) == 0 {
return nil
}
for _, val := range nodedevices.Device {
klog.V(4).InfoS("Nvidia Device registered name", "name", nodedevices.Name, "val", *val)
klog.V(3).InfoS("Nvidia Device registered name", "name", nodedevices.Name, "val", *val)
ResetDeviceMetrics(val.UUID, node.Name, float64(val.Memory))
}

Expand Down
26 changes: 26 additions & 0 deletions pkg/scheduler/api/devices/nvidia/vgpu/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package vgpu
import (
"context"
"encoding/json"
"errors"
"fmt"
"strconv"
"strings"
Expand Down Expand Up @@ -57,6 +58,21 @@ func patchNodeAnnotations(node *v1.Node, annotations map[string]string) error {
return err
}

// extractGeoMetriyFromType looks up the MIG geometries configured for the
// device type t.
//
// It walks config.GetConfig().NvidiaConfig.MigGeometriesList in order and
// returns the Geometries of the first entry that lists a model name occurring
// as a substring of t. When no entry matches, it returns an empty slice
// together with a "mig type not found" error.
func extractGeoMetriyFromType(t string) ([]config.Geometry, error) {
	for _, entry := range config.GetConfig().NvidiaConfig.MigGeometriesList {
		for _, model := range entry.Models {
			// A single matching model is enough to select this entry.
			if strings.Contains(t, model) {
				return entry.Geometries, nil
			}
		}
	}
	return []config.Geometry{}, errors.New("mig type not found")
}

func decodeNodeDevices(name string, str string) *GPUDevices {
if !strings.Contains(str, ":") {
return nil
Expand Down Expand Up @@ -86,6 +102,16 @@ func decodeNodeDevices(name string, str string) *GPUDevices {
MigTemplate: []config.Geometry{},
MigUsage: config.MigInUse{},
}
if len(items) > 5 {
i.Mode = items[5]
if i.Mode == "mig" {
var err error
i.MigTemplate, err = extractGeoMetriyFromType(i.Type)
if err != nil {
i.Mode = "hami-core"
}
}
}
retval.Device[index] = &i
}
}
Expand Down
2 changes: 0 additions & 2 deletions pkg/scheduler/api/node_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ import (
k8sframework "k8s.io/kubernetes/pkg/scheduler/framework"

"volcano.sh/apis/pkg/apis/scheduling/v1beta1"

"volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/gpushare"
"volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu"
)
Expand Down Expand Up @@ -336,7 +335,6 @@ func (ni *NodeInfo) setNodeOthersResource(node *v1.Node) {
klog.Warningf("received argument of nil node, no need to set other resources for %s", ni.Name)
return
}

ni.Others[GPUSharingDevice] = gpushare.NewGPUDevices(ni.Name, node)
ni.Others[vgpu.DeviceName] = vgpu.NewGPUDevices(ni.Name, node)
IgnoredDevicesList.Set(
Expand Down
18 changes: 1 addition & 17 deletions pkg/scheduler/plugins/deviceshare/deviceshare.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,13 @@ import (
"context"
"math"
"reflect"
"sync"

v1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
k8sframework "k8s.io/kubernetes/pkg/scheduler/framework"

"volcano.sh/volcano/pkg/scheduler/api"
"volcano.sh/volcano/pkg/scheduler/api/devices"
deviceconfig "volcano.sh/volcano/pkg/scheduler/api/devices/config"
"volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/gpushare"
"volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu"
"volcano.sh/volcano/pkg/scheduler/framework"
Expand All @@ -46,22 +44,18 @@ const (

SchedulePolicyArgument = "deviceshare.SchedulePolicy"
ScheduleWeight = "deviceshare.ScheduleWeight"

deviceConfigMapName = "deviceshare.configMapName"
)

type deviceSharePlugin struct {
// Arguments given for the plugin
pluginArguments framework.Arguments
schedulePolicy string
scheduleWeight int
deviceCM string
once sync.Once
}

// New return priority plugin
func New(arguments framework.Arguments) framework.Plugin {
dsp := &deviceSharePlugin{pluginArguments: arguments, schedulePolicy: "", scheduleWeight: 0, once: sync.Once{}}
dsp := &deviceSharePlugin{pluginArguments: arguments, schedulePolicy: "", scheduleWeight: 0}
enablePredicate(dsp)
return dsp
}
Expand All @@ -87,13 +81,6 @@ func enablePredicate(dsp *deviceSharePlugin) {
dsp.schedulePolicy = args[SchedulePolicyArgument].(string)
}

_, ok = args[deviceConfigMapName]
if ok {
dsp.deviceCM = args[deviceConfigMapName].(string)
} else {
dsp.deviceCM = ""
}

args.GetInt(&dsp.scheduleWeight, ScheduleWeight)

if gpushare.GpuSharingEnable && gpushare.GpuNumberEnable {
Expand Down Expand Up @@ -127,9 +114,6 @@ func getDeviceScore(ctx context.Context, pod *v1.Pod, node *api.NodeInfo, schedu
func (dp *deviceSharePlugin) OnSessionOpen(ssn *framework.Session) {
// Register event handlers to update task info in PodLister & nodeMap
ssn.AddPredicateFn(dp.Name(), func(task *api.TaskInfo, node *api.NodeInfo) error {
dp.once.Do(func() {
deviceconfig.InitDevicesConfig(ssn.KubeClient(), dp.deviceCM)
})
predicateStatus := make([]*api.Status, 0)
// Check PredicateWithCache
for _, val := range api.RegisteredDevices {
Expand Down

0 comments on commit a02e30a

Please sign in to comment.