drop bypassCoscheduler; add test for setting schedulerName
dgrove-oss committed Dec 19, 2024
1 parent 02213ef commit 01050db
Showing 7 changed files with 164 additions and 10 deletions.
2 changes: 1 addition & 1 deletion tools/pytorchjob-generator/chart/README.md
@@ -59,7 +59,7 @@ customize the Jobs generated by the tool.
| initContainers | array | `nil` | List of "(name, image, command[])" specifying init containers to be run before the main job. The 'command' field is a list of commands to run in the container; see the Kubernetes entry on initContainers for reference. |
| autopilotHealthChecks | array | No pre-flight checks are enabled. | Autopilot health checks. List of labels enabling one or more system health pre-flight checks. |
| hostIgnoreList | array | `nil` | List of host names on which the Job must not be scheduled (to avoid faulty nodes). |
| bypassCoscheduler | boolean | `false` | If true, use the default Kubernetes scheduler instead of the co-scheduler. ***Setting this to true will result in GPU fragmentation on the cluster. It should only be set to true when explicitly directed to do so by a cluster admin!*** |
| schedulerName | string | `nil` | If non-nil, use the specified Kubernetes scheduler. ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this to any non-nil value should only be done when explicitly directed to do so by a cluster admin!*** |
| serviceAccountName | string | the default service account for the namespace will be used. | Service account to be used for running the Job |

### Fault Tolerance
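For illustration only (not part of this diff): a user values file that opts into the Sakkara scheduler could look like the sketch below. The file name my-values.yaml is hypothetical, and the first three settings are taken from the example settings file further down in this commit.

    # my-values.yaml -- hypothetical overrides for the pytorchjob-generator chart
    namespace: my-namespace
    jobName: my-job
    queueName: default-queue
    # Only set schedulerName when explicitly directed to by a cluster admin;
    # leaving it unset keeps the cluster's co-scheduler in charge.
    schedulerName: sakkara

Rendering the chart with these values (for example via helm template -f my-values.yaml) would stamp schedulerName: sakkara into every generated pod template, as the snapshot later in this diff shows.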
2 changes: 1 addition & 1 deletion tools/pytorchjob-generator/chart/templates/_helpers.tpl
@@ -34,7 +34,7 @@ metadata:
{{- if ne .Values.terminationGracePeriodSeconds nil }}
terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }}
{{- end }}
{{- if .Values.bypassCoscheduler }}
{{- if .Values.schedulerName }}
schedulerName: {{ .Values.schedulerName }}
{{- end }}
priorityClassName: {{ .Values.priority }}
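To make the template change concrete: when schedulerName is left empty, the if-block renders nothing and the pod spec carries no schedulerName field, so the cluster's default co-scheduler configuration applies. With a value set, a sketch of the rendered fragment (values assumed for illustration, not taken from a real render) is:

    spec:
      schedulerName: sakkara              # emitted only because .Values.schedulerName is non-empty
      priorityClassName: default-priority # from .Values.priority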
@@ -1362,3 +1362,153 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts:
- emptyDir:
medium: Memory
name: dshm
scheduler can be set:
1: |
apiVersion: workload.codeflare.dev/v1beta2
kind: AppWrapper
metadata:
annotations:
workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.6
labels:
kueue.x-k8s.io/queue-name: default-queue
name: my-job
namespace: my-namespace
spec:
components:
- template:
apiVersion: kubeflow.org/v1
kind: PyTorchJob
metadata:
name: my-job
spec:
pytorchReplicaSpecs:
Master:
replicas: 1
restartPolicy: Never
template:
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
- -c
- |
echo "Environment variables set by the kubeflow training operator:"
echo ${MASTER_ADDR}:${MASTER_PORT}
echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
echo My global rank is ${RANK} / ${WORLD_SIZE}
echo "Other injected environment variables:"
echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
#
# User commands
#
echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
export RANK=$SAKKARA_RANK
git clone https://github.com/dbarnett/python-helloworld
cd python-helloworld
echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
env:
- name: SAKKARA_RANK
valueFrom:
fieldRef:
fieldPath: metadata.labels['sakkara.member.rank']
image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
imagePullPolicy: IfNotPresent
name: pytorch
resources:
limits:
cpu: 500m
memory: 1Gi
nvidia.com/gpu: 8
nvidia.com/roce_gdr: 0
requests:
cpu: 500m
memory: 1Gi
nvidia.com/gpu: 8
nvidia.com/roce_gdr: 0
volumeMounts:
- mountPath: /dev/shm
name: dshm
imagePullSecrets: []
priorityClassName: default-priority
schedulerName: sakkara
volumes:
- emptyDir:
medium: Memory
name: dshm
Worker:
replicas: 3
restartPolicy: Never
template:
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
- -c
- |
echo "Environment variables set by the kubeflow training operator:"
echo ${MASTER_ADDR}:${MASTER_PORT}
echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
echo My global rank is ${RANK} / ${WORLD_SIZE}
echo "Other injected environment variables:"
echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
#
# User commands
#
echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
export RANK=$SAKKARA_RANK
git clone https://github.com/dbarnett/python-helloworld
cd python-helloworld
echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
env:
- name: SAKKARA_RANK
valueFrom:
fieldRef:
fieldPath: metadata.labels['sakkara.member.rank']
image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
imagePullPolicy: IfNotPresent
name: pytorch
resources:
limits:
cpu: 500m
memory: 1Gi
nvidia.com/gpu: 8
nvidia.com/roce_gdr: 0
requests:
cpu: 500m
memory: 1Gi
nvidia.com/gpu: 8
nvidia.com/roce_gdr: 0
volumeMounts:
- mountPath: /dev/shm
name: dshm
imagePullSecrets: []
priorityClassName: default-priority
schedulerName: sakkara
volumes:
- emptyDir:
medium: Memory
name: dshm
7 changes: 7 additions & 0 deletions tools/pytorchjob-generator/chart/tests/helloworld_test.yaml
@@ -86,6 +86,13 @@ tests:
path: metadata.namespace
value: testing-ns

- it: scheduler can be set
set:
schedulerName: sakkara
asserts:
- matchSnapshot:
path: spec.components[0].template

- it: Enabling sshGitConfig injects the envvars, volumes, and volumeMounts
set:
sshGitCloneConfig.secretName: my-git-secret
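The new test relies on a snapshot comparison of the whole PyTorchJob template. As an additional sketch (not part of this commit, and the path expression is an assumption about the rendered document layout), a more targeted helm-unittest assertion on the Master replica's pod spec might read:

    - it: scheduler name is propagated to the pod spec
      set:
        schedulerName: sakkara
      asserts:
        - equal:
            path: spec.components[0].template.spec.pytorchReplicaSpecs.Master.template.spec.schedulerName
            value: sakkara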
3 changes: 1 addition & 2 deletions tools/pytorchjob-generator/chart/values.schema.json
@@ -114,10 +114,9 @@
{ "type": "null" },
{ "type": "array" }
]},
"bypassCoscheduler": { "type": "boolean" },
"schedulerName": { "oneOf": [
{ "type": "null" },
{ "type": "string", "enum": ["sakkara", "default-scheduler" ] }
{ "type": "string", "enum": ["sakkara", "scheduler-plugins-scheduler", "default-scheduler" ] }
]},
"serviceAccountName": { "oneOf" : [
{ "type": "null" },
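Because Helm validates the merged values against values.schema.json, the enum now limits schedulerName to sakkara, scheduler-plugins-scheduler, default-scheduler, or null. For example (my-custom-scheduler is a made-up name used only to illustrate the rejection case):

    # accepted: field omitted (null) or one of the enumerated schedulers
    schedulerName: scheduler-plugins-scheduler

    # rejected at template/install time: not in the enum
    # schedulerName: my-custom-scheduler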
8 changes: 4 additions & 4 deletions tools/pytorchjob-generator/chart/values.yaml
@@ -211,11 +211,11 @@ hostIgnoreList:
# - a100-large-drlfv-worker-3-with-secondary-nw5qh
# - a100-large-drlfv-worker-3-with-secondary-lb7ch

# -- (boolean) If true, use the default Kubernetes scheduler instead of the co-scheduler.
# ***Setting this to true will result in GPU fragmentation on the cluster. It should only be set
# to true when explicitly directed to do so by a cluster admin!***
# -- (string) If non-nil, use the specified Kubernetes scheduler.
# ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this
# to any non-nil value should only be done when explicitly directed to do so by a cluster admin!***
# @section -- Advanced Options
bypassCoscheduler: false
schedulerName:

# -- (string) Service account to be used for running the Job
# @section -- Advanced Options
@@ -1,8 +1,6 @@
namespace: my-namespace # namespace to deploy to (required)
jobName: my-job # name of the generated AppWrapper and PyTorchJob objects (required)
queueName: default-queue # local queue to submit to (default: default-queue)

bypassCoscheduler: true
schedulerName: sakkara
# If additional constraints are used, specify the configmap here:
#customLabels:
