drop bypassCoscheduler; add test for setting schedulerName
dgrove-oss committed Dec 19, 2024
1 parent 02213ef commit 01050db
Showing 7 changed files with 164 additions and 10 deletions.
2 changes: 1 addition & 1 deletion tools/pytorchjob-generator/chart/README.md
@@ -59,7 +59,7 @@ customize the Jobs generated by the tool.
| initContainers | array | `nil` | List of "(name, image, command[])" specifying init containers to be run before the main job. The 'command' field is a list of commands to run in the container; see the Kubernetes entry on initContainers for reference. |
| autopilotHealthChecks | array | No pre-flight checks are enabled. | Autopilot health checks. List of labels enabling one or more system health pre-flight checks. |
| hostIgnoreList | array | `nil` | List of host names on which the Job must not be scheduled (to avoid faulty nodes). |
| bypassCoscheduler | boolean | `false` | If true, use the default Kubernetes scheduler instead of the co-scheduler. ***Setting this to true will result in GPU fragmentation on the cluster. It should only be set to true when explicitly directed to do so by a cluster admin!*** |
| schedulerName | string | `nil` | If non-nil, use the specified Kubernetes scheduler. ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this to any non-nil value should only be done when explicitly directed to do so by a cluster admin!*** |
| serviceAccountName | string | the default service account for the namespace will be used. | Service account to be used for running the Job |

### Fault Tolerance
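For illustration only (not part of this diff): a user values file that opts into the Sakkara scheduler could look like the sketch below. The file name my-values.yaml is hypothetical, and the first three settings are taken from the example settings file further down in this commit.

    # my-values.yaml -- hypothetical overrides for the pytorchjob-generator chart
    namespace: my-namespace
    jobName: my-job
    queueName: default-queue
    # Only set schedulerName when explicitly directed to by a cluster admin;
    # leaving it unset keeps the cluster's co-scheduler in charge.
    schedulerName: sakkara

Rendering the chart with these values (for example via helm template -f my-values.yaml) would stamp schedulerName: sakkara into every generated pod template, as the snapshot later in this diff shows.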
2 changes: 1 addition & 1 deletion tools/pytorchjob-generator/chart/templates/_helpers.tpl
@@ -34,7 +34,7 @@ metadata:
{{- if ne .Values.terminationGracePeriodSeconds nil }}
terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }}
{{- end }}
{{- if .Values.bypassCoscheduler }}
{{- if .Values.schedulerName }}
schedulerName: {{ .Values.schedulerName }}
{{- end }}
priorityClassName: {{ .Values.priority }}
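To make the template change concrete: when schedulerName is left empty, the if-block renders nothing and the pod spec carries no schedulerName field, so the cluster's default co-scheduler configuration applies. With a value set, a sketch of the rendered fragment (values assumed for illustration, not taken from a real render) is:

    spec:
      schedulerName: sakkara              # emitted only because .Values.schedulerName is non-empty
      priorityClassName: default-priority # from .Values.priority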
@@ -1362,3 +1362,153 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts:
- emptyDir:
medium: Memory
name: dshm
scheduler can be set:
1: |
apiVersion: workload.codeflare.dev/v1beta2
kind: AppWrapper
metadata:
annotations:
workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.6
labels:
kueue.x-k8s.io/queue-name: default-queue
name: my-job
namespace: my-namespace
spec:
components:
- template:
apiVersion: kubeflow.org/v1
kind: PyTorchJob
metadata:
name: my-job
spec:
pytorchReplicaSpecs:
Master:
replicas: 1
restartPolicy: Never
template:
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
- -c
- |
echo "Environment variables set by the kubeflow training operator:"
echo ${MASTER_ADDR}:${MASTER_PORT}
echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
echo My global rank is ${RANK} / ${WORLD_SIZE}
echo "Other injected environment variables:"
echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
#
# User commands
#
echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
export RANK=$SAKKARA_RANK
git clone https://github.com/dbarnett/python-helloworld
cd python-helloworld
echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
env:
- name: SAKKARA_RANK
valueFrom:
fieldRef:
fieldPath: metadata.labels['sakkara.member.rank']
image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
imagePullPolicy: IfNotPresent
name: pytorch
resources:
limits:
cpu: 500m
memory: 1Gi
nvidia.com/gpu: 8
nvidia.com/roce_gdr: 0
requests:
cpu: 500m
memory: 1Gi
nvidia.com/gpu: 8
nvidia.com/roce_gdr: 0
volumeMounts:
- mountPath: /dev/shm
name: dshm
imagePullSecrets: []
priorityClassName: default-priority
schedulerName: sakkara
volumes:
- emptyDir:
medium: Memory
name: dshm
Worker:
replicas: 3
restartPolicy: Never
template:
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
- -c
- |
echo "Environment variables set by the kubeflow training operator:"
echo ${MASTER_ADDR}:${MASTER_PORT}
echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
echo My global rank is ${RANK} / ${WORLD_SIZE}
echo "Other injected environment variables:"
echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
#
# User commands
#
echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
export RANK=$SAKKARA_RANK
git clone https://github.com/dbarnett/python-helloworld
cd python-helloworld
echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
env:
- name: SAKKARA_RANK
valueFrom:
fieldRef:
fieldPath: metadata.labels['sakkara.member.rank']
image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
imagePullPolicy: IfNotPresent
name: pytorch
resources:
limits:
cpu: 500m
memory: 1Gi
nvidia.com/gpu: 8
nvidia.com/roce_gdr: 0
requests:
cpu: 500m
memory: 1Gi
nvidia.com/gpu: 8
nvidia.com/roce_gdr: 0
volumeMounts:
- mountPath: /dev/shm
name: dshm
imagePullSecrets: []
priorityClassName: default-priority
schedulerName: sakkara
volumes:
- emptyDir:
medium: Memory
name: dshm
7 changes: 7 additions & 0 deletions tools/pytorchjob-generator/chart/tests/helloworld_test.yaml
@@ -86,6 +86,13 @@ tests:
path: metadata.namespace
value: testing-ns

- it: scheduler can be set
set:
schedulerName: sakkara
asserts:
- matchSnapshot:
path: spec.components[0].template

- it: Enabling sshGitConfig injects the envvars, volumes, and volumeMounts
set:
sshGitCloneConfig.secretName: my-git-secret
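The new test relies on a snapshot comparison of the whole PyTorchJob template. As an additional sketch (not part of this commit, and the path expression is an assumption about the rendered document layout), a more targeted helm-unittest assertion on the Master replica's pod spec might read:

    - it: scheduler name is propagated to the pod spec
      set:
        schedulerName: sakkara
      asserts:
        - equal:
            path: spec.components[0].template.spec.pytorchReplicaSpecs.Master.template.spec.schedulerName
            value: sakkara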
3 changes: 1 addition & 2 deletions tools/pytorchjob-generator/chart/values.schema.json
@@ -114,10 +114,9 @@
{ "type": "null" },
{ "type": "array" }
]},
"bypassCoscheduler": { "type": "boolean" },
"schedulerName": { "oneOf": [
{ "type": "null" },
{ "type": "string", "enum": ["sakkara", "default-scheduler" ] }
{ "type": "string", "enum": ["sakkara", "scheduler-plugins-scheduler", "default-scheduler" ] }
]},
"serviceAccountName": { "oneOf" : [
{ "type": "null" },
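Because Helm validates the merged values against values.schema.json, the enum now limits schedulerName to sakkara, scheduler-plugins-scheduler, default-scheduler, or null. For example (my-custom-scheduler is a made-up name used only to illustrate the rejection case):

    # accepted: field omitted (null) or one of the enumerated schedulers
    schedulerName: scheduler-plugins-scheduler

    # rejected at template/install time: not in the enum
    # schedulerName: my-custom-scheduler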
8 changes: 4 additions & 4 deletions tools/pytorchjob-generator/chart/values.yaml
@@ -211,11 +211,11 @@ hostIgnoreList:
# - a100-large-drlfv-worker-3-with-secondary-nw5qh
# - a100-large-drlfv-worker-3-with-secondary-lb7ch

# -- (boolean) If true, use the default Kubernetes scheduler instead of the co-scheduler.
# ***Setting this to true will result in GPU fragmentation on the cluster. It should only be set
# to true when explicitly directed to do so by a cluster admin!***
# -- (string) If non-nil, use the specified Kubernetes scheduler.
# ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this
# to any non-nil value should only be done when explicitly directed to do so by a cluster admin!***
# @section -- Advanced Options
bypassCoscheduler: false
schedulerName:

# -- (string) Service account to be used for running the Job
# @section -- Advanced Options
@@ -1,8 +1,6 @@
namespace: my-namespace # namespace to deploy to (required)
jobName: my-job # name of the generated AppWrapper and PyTorchJob objects (required)
queueName: default-queue # local queue to submit to (default: default-queue)

bypassCoscheduler: true
schedulerName: sakkara
# If additional constraints are used, specify the configmap here:
#customLabels:
