Skip to content

Commit

Permalink
[k8s] Fix GKELabelFormatter for H100s (#3627)
Browse files Browse the repository at this point in the history
* H100-80gb does not exist, fix to H100

* Fix H100 support
  • Loading branch information
romilbhardwaj authored Jun 4, 2024
1 parent 29d6520 commit 0ebc5fd
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion sky/provision/kubernetes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ def get_gke_accelerator_name(accelerator: str) -> str:
Uses the format - nvidia-tesla-<accelerator>.
A100-80GB, H100-80GB and L4 are an exception. They use nvidia-<accelerator>.
"""
if accelerator == 'H100':
# H100 is named as H100-80GB in GKE.
accelerator = 'H100-80GB'
if accelerator in ('A100-80GB', 'L4', 'H100-80GB'):
# A100-80GB, L4 and H100-80GB have a different name pattern.
return 'nvidia-{}'.format(accelerator.lower())
Expand Down Expand Up @@ -183,7 +186,11 @@ def get_accelerator_from_label_value(cls, value: str) -> str:
if value.startswith('nvidia-tesla-'):
return value.replace('nvidia-tesla-', '').upper()
elif value.startswith('nvidia-'):
return value.replace('nvidia-', '').upper()
acc = value.replace('nvidia-', '').upper()
if acc == 'H100-80GB':
# H100 is named as H100-80GB in GKE.
return 'H100'
return acc
else:
raise ValueError(
f'Invalid accelerator name in GKE cluster: {value}')
Expand Down

0 comments on commit 0ebc5fd

Please sign in to comment.