Merge branch 'master' into master
lowang-bh authored Jan 19, 2024
2 parents 9887637 + 1792279 commit dd47599
Showing 47 changed files with 1,374 additions and 593 deletions.
1 change: 1 addition & 0 deletions OWNERS
@@ -18,6 +18,7 @@ reviewers:
- wangyang0616
- Monokaix
- lowang-bh
- archlitchi
approvers:
- k82cn
- kevin-wangzefeng
6 changes: 6 additions & 0 deletions cmd/scheduler/app/options/options.go
@@ -79,6 +79,11 @@ type ServerOption struct {
NodeSelector []string
EnableCacheDumper bool
NodeWorkerThreads uint32

// IgnoredCSIProvisioners contains a list of provisioners. PVCs requested by a pod and backed by these provisioners
// are not counted in the pod's PVC resource request or in node.Allocatable, because the spec.drivers of the CSINode
// resource is always null for them; such provisioners are usually host-path CSI controllers like rancher.io/local-path and hostpath.csi.k8s.io.
IgnoredCSIProvisioners []string
}

type DecryptFunc func(c *ServerOption) error
@@ -134,6 +139,7 @@ func (s *ServerOption) AddFlags(fs *pflag.FlagSet) {
fs.StringSliceVar(&s.NodeSelector, "node-selector", nil, "volcano only work with the labeled node, like: --node-selector=volcano.sh/role:train --node-selector=volcano.sh/role:serving")
fs.BoolVar(&s.EnableCacheDumper, "cache-dumper", true, "Enable the cache dumper, it's true by default")
fs.Uint32Var(&s.NodeWorkerThreads, "node-worker-threads", defaultNodeWorkers, "The number of threads syncing node operations.")
fs.StringSliceVar(&s.IgnoredCSIProvisioners, "ignored-provisioners", nil, "The provisioners that will be ignored during pod pvc request computation and preemption.")
}

// CheckOptionOrDie check lock-object-namespace when LeaderElection is enabled.
19 changes: 16 additions & 3 deletions cmd/scheduler/app/server.go
@@ -23,9 +23,8 @@ import (
"os"
"time"

"github.com/prometheus/client_golang/prometheus/promhttp"

"volcano.sh/apis/pkg/apis/helpers"

"volcano.sh/volcano/cmd/scheduler/app/options"
"volcano.sh/volcano/pkg/kube"
"volcano.sh/volcano/pkg/scheduler"
@@ -34,11 +33,15 @@ import (
commonutil "volcano.sh/volcano/pkg/util"
"volcano.sh/volcano/pkg/version"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
"github.com/prometheus/client_golang/prometheus/promhttp"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/uuid"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/klog/v2"

// Register gcp auth
@@ -47,6 +50,9 @@
"k8s.io/client-go/tools/leaderelection"
"k8s.io/client-go/tools/leaderelection/resourcelock"
"k8s.io/client-go/tools/record"

// Register rest client metrics
_ "k8s.io/component-base/metrics/prometheus/restclient"
)

const (
@@ -81,7 +87,7 @@ func Run(opt *options.ServerOption) error {

if opt.EnableMetrics {
go func() {
http.Handle("/metrics", promhttp.Handler())
http.Handle("/metrics", promHandler())
klog.Fatalf("Prometheus Http Server failed %s", http.ListenAndServe(opt.ListenAddress, nil))
}()
}
@@ -147,3 +153,10 @@ func Run(opt *options.ServerOption) error {
})
return fmt.Errorf("lost lease")
}

func promHandler() http.Handler {
// Unregister the Go and process related collectors because they are duplicated: `legacyregistry.DefaultGatherer` has also registered them.
prometheus.DefaultRegisterer.Unregister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
prometheus.DefaultRegisterer.Unregister(collectors.NewGoCollector())
return promhttp.InstrumentMetricHandler(prometheus.DefaultRegisterer, promhttp.HandlerFor(prometheus.Gatherers{prometheus.DefaultGatherer, legacyregistry.DefaultGatherer}, promhttp.HandlerOpts{}))
}
102 changes: 102 additions & 0 deletions docs/design/capacity-scheduling.md
@@ -0,0 +1,102 @@
# Capacity Scheduling Design

@william-wang @Monokaix @Frank Gu

## Motivation

Volcano's Proportion plugin provides capacity scheduling and fair scheduling between multiple queues by weight, and a queue's minimal resource (guaranteed resource) and maximal resource (capacity) are supported as well. Users run heterogeneous clusters with varying resource types (e.g. different types of GPU cards and network interfaces) and require fine-grained resource sharing and preemption between queues. For example, the expected share ratio of A100 GPUs between ORG1 and ORG2 may be 1:3, while the share ratio of V100 GPUs between the same organizations is 1:1. It is also expected that the `deserved resource` can be configured as per-resource-type scalars rather than as a `weight` (a simple percentage of cluster resources) for each queue.

## In Scope

- Allow users to specify `deserved resource` in each queue for resource reclaiming between queues.
- Allow users to configure multi-dimensional resources in `deserved resource`.
- Support elastic resource management in queue.

## Out of Scope

- Hierarchical elastic resource management

## Proposal

We propose to decouple capacity scheduling from the Proportion plugin and to support min capacity, max capacity, and deserved capacity in a new capacity plugin, enabling elastic resource management and resource preemption in multi-tenancy scenarios.

Three fields of the queue are respected by the capacity plugin:

- `Capability`: the upper quota limit by resource type. Under no circumstances can the total resources of any type allocated in a queue exceed this limit.
- `Guarantee`: the lower quota limit by resource type. These resources are reserved and are not lent to other queues even when the queue has no jobs.
- `Deserved`: the deserved quota by resource type. These resources may be shared with other queues and can be reclaimed back.

## User Stories

### Story 1

An administrator can create a queue with a maximum capacity configured, and the total resources of jobs in the queue cannot exceed that maximum capacity.

### Story 2

An administrator can create two queues with deserved capacity configured, and a queue's deserved resources can be lent to jobs in the other queue.

### Story 3

An administrator can create two queues with guarantee and deserved resources configured, and the deserved resources can be reclaimed back. Different resource types can hold different guarantee and deserved quantities. For example, assume there are two organizations, Org1 and Org2, which use Queue1 and Queue2 respectively. Queue1's guarantee resources are A100 GPUs=10 and V100 GPUs=10, and its deserved resources are A100 GPUs=20 and V100 GPUs=50; Queue2's guarantee resources are A100 GPUs=10 and V100 GPUs=10, and its deserved resources are A100 GPUs=80 and V100 GPUs=50.

<div align="center"><img width="582" height="393" src="images/capacity-scheduling/queue-deserved.png" /></div>

Queue1 can use the cluster's total resources when Queue2 is idle.

<div align="center"><img width="612" height="303" src="images/capacity-scheduling/queue1-use-all.png" /></div>

Queue2 can reclaim its deserved resources when a job is submitted to it.

<div align="center"><img width="612" height="481" src="images/capacity-scheduling/queue2-reclaim.png" /></div>

### Story 4

Org2 can reclaim its deserved resources from Org1, except for Org1's guarantee resources.

<div align="center"><img width="586" height="399" src="images/capacity-scheduling/queue-guarantee.png" /></div>

Queue1 can use up to its capability (A100=60) when Queue2 is idle.

<div align="center"><img width="630" height="303" src="images/capacity-scheduling/queue1-capability.png" /></div>

When a job is submitted, Queue2 can reclaim from Queue1, but excluding Queue1's guarantee resources (A100=20); since Queue1 is using A100=60, Queue2 can reclaim only A100=40 from it.

<div align="center"><img width="615" height="456" src="images/capacity-scheduling/queue2-reclaim-exclude-guarantee.png" /></div>

## Design detail

### API design

Add a new field `Deserved` to the queue spec to indicate the deserved resources of the current queue.

```go
type QueueSpec struct {
	Capability v1.ResourceList
	// Reclaimable indicates whether the queue can be reclaimed by other queues
	Reclaimable *bool
	// Guarantee indicates the configuration of resource reservation
	Guarantee Guarantee `json:"guarantee,omitempty" protobuf:"bytes,4,opt,name=guarantee"`
	// Deserved is the deserved resource of the queue.
	Deserved v1.ResourceList
}
```
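
For illustration, Queue1 from Story 3 could be expressed with this field as sketched below. This is only a sketch: the extended resource names (`nvidia.com/a100`, `nvidia.com/v100`) and the assumption that `Guarantee` carries a `v1.ResourceList` in a `Resource` field are illustrative, not part of this proposal.

```go
import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// Sketch only: Queue1 from Story 3, assuming GPUs are exposed as the extended
// resources named below and Guarantee wraps a ResourceList in a Resource field.
var queue1Spec = QueueSpec{
	Guarantee: Guarantee{Resource: v1.ResourceList{
		"nvidia.com/a100": resource.MustParse("10"),
		"nvidia.com/v100": resource.MustParse("10"),
	}},
	Deserved: v1.ResourceList{
		"nvidia.com/a100": resource.MustParse("20"),
		"nvidia.com/v100": resource.MustParse("50"),
	},
}
```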

### Main process

Key functions to add (a simplified sketch follows this list):

- Action Update
  - **Reclaim:** Add an `ssn.Preemptive` function to check whether a queue can reclaim resources by preempting another queue's resources.

- New plugin: capacity
  - **AddAllocatableFn:** Check that a queue's total allocated resources do not exceed its limit, i.e. Queue.Allocated + task.Request <= Queue.Capability.
  - **AddReclaimableFn:** Choose victims from queues whose Queue.Allocated > Queue.Deserved, and stop choosing victims to preempt once Queue.Allocated reaches Queue.Guarantee.
  - **AddPreemptiveFn:** Check whether a queue may reclaim by preempting another queue's resources; when Queue.Allocated >= Queue.Deserved, reclaim should not happen.
  - **AddQueueOrderFn:** Compute and sort queues by share value, where share = Queue.Allocated / Queue.Deserved.
  - **AddJobEnqueueableFn:** Check whether a job can be enqueued.
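
The checks above can be sketched with plain resource maps. This is a simplified illustration only; the real plugin would operate on the scheduler's internal resource types and session callbacks, and the function names below are illustrative.

```go
// ResourceAmounts is a simplified stand-in for a resource list:
// resource name -> quantity.
type ResourceAmounts map[string]float64

// allocatable mirrors AddAllocatableFn: Queue.Allocated + task.Request
// must not exceed Queue.Capability for any resource type.
func allocatable(allocated, request, capability ResourceAmounts) bool {
	for name, limit := range capability {
		if allocated[name]+request[name] > limit {
			return false
		}
	}
	return true
}

// preemptive mirrors AddPreemptiveFn: a queue may trigger reclaim only while
// its allocation is still below its deserved quota in some dimension.
func preemptive(allocated, deserved ResourceAmounts) bool {
	for name, want := range deserved {
		if allocated[name] < want {
			return true
		}
	}
	return false
}

// share mirrors AddQueueOrderFn: share = Queue.Allocated / Queue.Deserved.
// Taking the maximum ratio across resource types is one possible choice.
func share(allocated, deserved ResourceAmounts) float64 {
	s := 0.0
	for name, want := range deserved {
		if want > 0 && allocated[name]/want > s {
			s = allocated[name] / want
		}
	}
	return s
}
```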

### Notes

The capacity plugin provides preemption/reclaim based on the `deserved resource` configured by the user, while the Proportion plugin provides fair scheduling based on queue weight. They are different policies for different scenarios, and enabling both at the same time is not supported.
Binary file added docs/design/img.png
10 changes: 9 additions & 1 deletion docs/design/node-group.md
@@ -39,7 +39,7 @@ case2: recommend queue can use private cloud nodes or public cloud nodes, but tt
affinity configure:
1. affinity.nodeGroupAffinity.requiredDuringSchedulingIgnoredDuringExecution, hard constraints, such as `nlp = nodegroup1,nodegroup2`, meaning that tasks in queue=nlp can only run on the nodes in nodegroup1 or nodegroup2.
2. affinity.nodeGroupAffinity.preferredDuringSchedulingIgnoredDuringExecution, soft constraints, such as `nlp = nodegroup1`, meaning that tasks in queue=nlp run on nodegroup1 first, but if the resources of nodegroup1 are insufficient, they can also run on other nodegroups.
2. affinity.nodeGroupAffinity.preferredDuringSchedulingIgnoredDuringExecution, soft constraints, such as `nlp = nodegroup1`, meaning that tasks in queue=nlp run on nodegroup1 first, but if the resources of nodegroup1 are insufficient, they can also run on other nodegroups. Combining rule 1 and rule 2, tasks in queue=nlp run on nodegroup1 first, but if the resources of nodegroup1 are insufficient, they can also run on nodegroup2.
3. affinity.nodeGroupAntiAffinity.requiredDuringSchedulingIgnoredDuringExecution, hard constraints, such as `nlp = nodegroup1`, meaning that tasks in queue=nlp can run on any nodegroup except nodegroup1.
4. affinity.nodeGroupAntiAffinity.preferredDuringSchedulingIgnoredDuringExecution, soft constraints, such as `nlp = nodegroup1`, meaning that tasks in queue=nlp run on other nodegroups first, but if the resources of the other nodegroups are insufficient, they can also run on nodegroup1.

@@ -66,3 +66,11 @@ risk: The resources of the queue can not be too different from the resources of
## Implement

Hard constraints are implemented by using PredicateFn, and soft constraints are implemented by using NodeOrderFn.
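
A minimal sketch of the hard-constraint check is given below. It is illustrative only: how the node's group name and the queue's affinity lists are obtained (for example from node labels and queue annotations) is not fixed here.

```go
// nodeGroupAllowed sketches the hard constraints: a task may run on a node only
// if the node's group is not in the queue's required anti-affinity list and,
// when a required affinity list is set, the group appears in that list.
func nodeGroupAllowed(nodeGroup string, requiredAffinity, requiredAntiAffinity []string) bool {
	for _, g := range requiredAntiAffinity {
		if g == nodeGroup {
			return false
		}
	}
	if len(requiredAffinity) == 0 {
		return true
	}
	for _, g := range requiredAffinity {
		if g == nodeGroup {
			return true
		}
	}
	return false
}
```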

Predicate flow chart:

![](images/nodegroup_plugin-predict-1.png)

Score flow chart:

![](images/nodegroup_plugin-score-1.png)