Skip to content

Update cluster-toolkit to 1.47.0 #416

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Mar 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/xpk/commands/kjob_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from ..core.cluster import get_gpu_type_from_cluster


def add_tcpxo_annotations(args, cmd: str) -> str:
def add_tcpxo_annotations(args, cmd) -> str:
tcpxo, interfaces, eth0 = get_a3mega_pod_template_annotations(args)
cmd += f" --pod-template-annotation {tcpxo} \\\n"
cmd += f" --pod-template-annotation {eth0} \\\n"
Expand Down
2 changes: 1 addition & 1 deletion src/xpk/commands/workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
limitations under the License.
"""

from ..core.blueprint.blueprint_generator import get_subnetworks_for_a3mega, get_subnetworks_for_a3ultra
from ..core.cluster import (
create_xpk_k8s_service_account,
get_cluster_credentials,
Expand Down Expand Up @@ -81,6 +80,7 @@
wait_for_job_completion,
zone_to_region,
)
from ..core.network import get_subnetworks_for_a3mega, get_subnetworks_for_a3ultra
from ..core.workload_decorators import rdma_decorator, tcpxo_decorator, storage_decorator
from ..utils.console import get_user_input, xpk_exit, xpk_print
from ..utils.file import write_tmp_file
Expand Down
22 changes: 11 additions & 11 deletions src/xpk/core/blueprint/blueprint_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,7 @@
}

cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
cluster_toolkit_version = "v1.45.1"


def get_subnetworks_for_a3mega(cluster_name: str) -> list[str]:
return [f"{cluster_name}-gpunet-{i}-subnet" for i in range(8)]


def get_subnetworks_for_a3ultra(cluster_name: str) -> list[str]:
return [f"{cluster_name}-sub-1"] + [
f"{cluster_name}-rdma-sub-{i}" for i in range(8)
]
cluster_toolkit_version = "develop"


class BlueprintGeneratorOutput:
Expand Down Expand Up @@ -157,6 +147,11 @@ def generate_a3_mega_blueprint(
"total_min_nodes": system_node_pool_min_node_count,
"total_max_nodes": 1000,
},
"k8s_network_names": {
"gvnic_prefix": f"{cluster_name}-gpunet-",
"gvnic_postfix": "-subnet",
"gvnic_start_index": 0,
},
},
outputs=["instructions"],
)
Expand Down Expand Up @@ -490,6 +485,11 @@ def generate_a3_ultra_blueprint(
" alias_ip_range=[]}],"
f" {cluster_name}-rdma-net.subnetwork_interfaces_gke))"
),
"k8s_network_names": {
"rdma_prefix": f"{cluster_name}-rdma-sub-",
"rdma_start_index": 0,
"rdma_postfix": "",
},
},
outputs=["instructions"],
)
Expand Down
4 changes: 2 additions & 2 deletions src/xpk/core/docker_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@
DockerRunCommandExitCode = 135
dockerBuildErrorCode = 134
ctk_dockerfile_path = "Dockerfile"
ctk_build_ref = "v1.45.1"
ctk_build_ref = "develop"
ctk_docker_image = "xpk-ctk"
ctk_container_name = "xpk-ctk-container"
gcloud_cfg_mount_path = "/root/.config/gcloud"
working_dir_mount_path = "/out"
dockerfile_gh_path = f"https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/tags/{ctk_build_ref}/tools/cloud-build/images/cluster-toolkit-dockerfile/Dockerfile"
dockerfile_gh_path = "https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/heads/develop/tools/cloud-build/images/cluster-toolkit-dockerfile/Dockerfile"
upload_dir_name = "uploads"


Expand Down
43 changes: 28 additions & 15 deletions src/xpk/core/kjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,27 +14,40 @@
limitations under the License.
"""

from ..core.blueprint.blueprint_generator import get_subnetworks_for_a3mega, get_subnetworks_for_a3ultra
from ..core.capacity import H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
from argparse import Namespace
import yaml
from .workload_decorators.tcpxo_decorator import get_tcpxo_deamon_entry
from ..utils.console import xpk_print, xpk_exit
from enum import Enum

from ..utils import templates
import yaml
from kubernetes import client as k8s_client
from kubernetes.client import ApiClient
from kubernetes.client.rest import ApiException
from .cluster import setup_k8s_env, XPK_SA, DEFAULT_NAMESPACE
from .storage import get_auto_mount_storages, get_auto_mount_gcsfuse_storages
from .commands import run_command_for_value, run_kubectl_apply, run_command_with_updates
from .config import XpkConfig, KJOB_SHELL_IMAGE, KJOB_SHELL_INTERACTIVE_COMMAND, KJOB_SHELL_WORKING_DIRECTORY, KJOB_BATCH_IMAGE, KJOB_BATCH_WORKING_DIRECTORY
from .resources import get_cluster_system_characteristics, SystemCharacteristics, AcceleratorType
from enum import Enum

from ..core.workload_decorators import tcpxo_decorator

from ..core.workload_decorators import rdma_decorator
from ..core.capacity import H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
from ..core.workload_decorators import rdma_decorator, tcpxo_decorator
from ..utils import templates
from ..utils.console import xpk_exit, xpk_print
from .cluster import DEFAULT_NAMESPACE, XPK_SA, setup_k8s_env
from .commands import (
run_command_for_value,
run_command_with_updates,
run_kubectl_apply,
)
from .config import (
KJOB_BATCH_IMAGE,
KJOB_BATCH_WORKING_DIRECTORY,
KJOB_SHELL_IMAGE,
KJOB_SHELL_INTERACTIVE_COMMAND,
KJOB_SHELL_WORKING_DIRECTORY,
XpkConfig,
)
from .network import get_subnetworks_for_a3mega, get_subnetworks_for_a3ultra
from .resources import (
AcceleratorType,
SystemCharacteristics,
get_cluster_system_characteristics,
)
from .storage import get_auto_mount_gcsfuse_storages, get_auto_mount_storages
from .workload_decorators.tcpxo_decorator import get_tcpxo_deamon_entry

KJOB_API_GROUP_NAME = "kjobctl.x-k8s.io"
KJOB_API_GROUP_VERSION = "v1alpha1"
Expand Down
6 changes: 5 additions & 1 deletion src/xpk/core/tests/data/a3_mega.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
!Blueprint
blueprint_name: xpk-gke-a3-megagpu
toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
toolkit_modules_version: v1.45.1
toolkit_modules_version: develop

vars:
project_id: "foo"
Expand Down Expand Up @@ -65,6 +65,10 @@ deployment_groups:
system_node_pool_node_count:
total_min_nodes: 5
total_max_nodes: 1000
k8s_network_names:
gvnic_prefix: "bar-gpunet-"
gvnic_postfix: "-subnet"
gvnic_start_index: 0
outputs: [instructions]
- !DeploymentModule
id: group_placement_0
Expand Down
6 changes: 5 additions & 1 deletion src/xpk/core/tests/data/a3_ultra.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
!Blueprint
blueprint_name: xpk-gke-a3-ultra
toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
toolkit_modules_version: v1.45.1
toolkit_modules_version: develop

vars:

Expand Down Expand Up @@ -114,6 +114,10 @@ deployment_groups:
total_min_nodes: 2
total_max_nodes: 1000
additional_networks: $(concat([{network=gke-a3-ultra-net-1.network_name, subnetwork=gke-a3-ultra-net-1.subnetwork_name, subnetwork_project="foo", nic_type="GVNIC", queue_count=null, network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a3-ultra-rdma-net.subnetwork_interfaces_gke))
k8s_network_names:
rdma_prefix: "gke-a3-ultra-rdma-sub-"
rdma_start_index: 0
rdma_postfix: ""
outputs: [instructions]

- !DeploymentModule
Expand Down
4 changes: 2 additions & 2 deletions src/xpk/core/workload_decorators/rdma_decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def decorate_kjob_template(job_manifest) -> str:
return job_manifest


def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
"""
Decorates a JobSet manifest with the necessary components for rdma-daemon.

Expand Down Expand Up @@ -80,7 +80,7 @@ def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
return 'networking.gke.io/interfaces', literal_string('\n'.join(interfaces))


def add_annotations(job_manifest, sub_networks):
def add_annotations(job_manifest: dict, sub_networks: list[str]):
"""Adds or updates annotations in the Pod template."""
annotations = job_manifest['spec']['template']['metadata']['annotations']
interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)
Expand Down
4 changes: 2 additions & 2 deletions src/xpk/core/workload_decorators/tcpxo_decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def decorate_job(job_manifest: dict, sub_networks: list[str]) -> dict:
return job_manifest


def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
"""
Decorates a JobSet manifest with the necessary components for tcpxo-daemon.

Expand Down Expand Up @@ -105,7 +105,7 @@ def get_tcpxo_deamon_entry() -> tuple[str, str]:
)


def add_annotations(job_manifest, sub_networks):
def add_annotations(job_manifest: dict, sub_networks: list[str]):
"""Adds or updates annotations in the Pod template."""
annotations = job_manifest['spec']['template']['metadata']['annotations']
tcpxo_deamon_key, tcpxo_deamon_paths = get_tcpxo_deamon_entry()
Expand Down
Loading