diff --git a/.gitignore b/.gitignore index ecced5bbd..7c5a8cd87 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # editor and IDE paraphernalia .idea/ +.vscode/ *__pycache__* tmp/ diff --git a/pytype-conf.cfg b/pytype-conf.cfg index bd351922d..a7e504353 100644 --- a/pytype-conf.cfg +++ b/pytype-conf.cfg @@ -11,12 +11,8 @@ exclude = src/xpk/commands src/xpk/core/tests src/xpk/core/__init__.py - src/xpk/core/app_profile.py - src/xpk/core/blueprint.py src/xpk/core/cluster_private.py src/xpk/core/commands.py - src/xpk/core/core.py - src/xpk/core/job_template.py src/xpk/core/kjob.py src/xpk/core/kueue.py src/xpk/core/nap.py diff --git a/src/xpk/commands/batch.py b/src/xpk/commands/batch.py index 114535e11..23726f779 100644 --- a/src/xpk/commands/batch.py +++ b/src/xpk/commands/batch.py @@ -16,12 +16,12 @@ from argparse import Namespace +from ..core.commands import run_command_for_value +from ..core.gcloud_context import add_zone_and_project +from ..core.kjob import AppProfileDefaults from ..core.kueue import LOCAL_QUEUE_NAME from ..utils.console import xpk_exit, xpk_print from .common import set_cluster_command -from ..core.core import add_zone_and_project -from ..core.kjob import AppProfileDefaults -from ..core.commands import run_command_for_value from .kind import set_local_cluster_command diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index fbca56a7b..07981f2c0 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -14,36 +14,28 @@ limitations under the License. """ -from ..core.commands import run_command_for_value, run_command_with_updates -from ..core.core import ( - VERTEX_TENSORBOARD_FEATURE_FLAG, - add_zone_and_project, - create_cluster_configmaps, - create_cluster_network_config, - create_vertex_tensorboard, - delete_cluster_subnets, +from tabulate import tabulate + +from ..core.capacity import H100_DEVICE_TYPE +from ..core.cluster import ( get_all_clusters_programmatic, get_cluster_credentials, - get_gke_control_plane_version, - get_gke_node_pool_version, - get_gke_server_config, - h100_device_type, install_nccl_on_cluster, - run_gke_node_pool_create_command, set_jobset_on_cluster, - set_up_cluster_network_for_gpu, setup_k8s_env, update_cluster_with_gcsfuse_driver_if_necessary, update_cluster_with_workload_identity_if_necessary, - zone_to_region, - get_user_input, ) from ..core.cluster_private import authorize_private_cluster_access_if_necessary -from ..core.kjob import ( - verify_kjob_installed, - prepare_kjob, - apply_kjob_crds, +from ..core.commands import run_command_for_value, run_command_with_updates +from ..core.config import VERTEX_TENSORBOARD_FEATURE_FLAG +from ..core.gcloud_context import ( + add_zone_and_project, + get_gke_control_plane_version, + get_gke_server_config, + zone_to_region, ) +from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed from ..core.kueue import ( cluster_preheat_yml, install_kueue_crs, @@ -51,7 +43,14 @@ wait_for_kueue_available, ) from ..core.nap import enable_autoprovisioning_on_cluster +from ..core.network import ( + create_cluster_network_config, + delete_cluster_subnets, + set_up_cluster_network_for_gpu, +) +from ..core.nodepool import get_gke_node_pool_version, run_gke_node_pool_create_command from ..core.ray import install_ray_cluster +from ..core.resources import create_cluster_configmaps from ..core.storage import install_storage_crd from ..core.system_characteristics import ( AcceleratorType, @@ -59,13 +58,12 @@ SystemCharacteristics, get_system_characteristics, 
) +from ..core.vertex import create_vertex_tensorboard from ..core.workload import get_workload_list +from ..utils.console import get_user_input, xpk_exit, xpk_print from ..utils.file import write_tmp_file -from ..utils.console import xpk_exit, xpk_print from . import cluster_gcluster -from tabulate import tabulate - def cluster_create(args) -> None: """Function around cluster creation. @@ -148,7 +146,7 @@ def cluster_create(args) -> None: if set_up_cluster_network_code != 0: xpk_exit(set_up_cluster_network_code) - if system.device_type == h100_device_type: + if system.device_type == H100_DEVICE_TYPE: xpk_print('Creating Network Config for cluster') create_cluster_network_config_code = create_cluster_network_config(args) if create_cluster_network_config_code != 0: diff --git a/src/xpk/commands/cluster_gcluster.py b/src/xpk/commands/cluster_gcluster.py index 908c6a585..c50c17f36 100644 --- a/src/xpk/commands/cluster_gcluster.py +++ b/src/xpk/commands/cluster_gcluster.py @@ -14,16 +14,24 @@ limitations under the License. """ -from ..core.blueprint.blueprint_generator import BlueprintGenerator, BlueprintGeneratorOutput, supported_device_types, a3mega_device_type, a3ultra_device_type +import os + +from ..core.blueprint.blueprint_generator import ( + BlueprintGenerator, + BlueprintGeneratorOutput, + a3mega_device_type, + a3ultra_device_type, + supported_device_types, +) +from ..core.capacity import get_capacity_type from ..core.docker_manager import DockerManager +from ..core.gcloud_context import zone_to_region from ..core.gcluster_manager import GclusterManager -from ..core.core import zone_to_region, get_capacity_type from ..utils.console import xpk_exit, xpk_print -from ..utils.network import all_IPs_cidr from ..utils.file import ensure_directory_exists +from ..utils.network import all_IPs_cidr from ..utils.objects import hash_string from .common import set_cluster_command -import os blueprints_path = os.path.abspath('xpkclusters/blueprints') gcluster_working_dir = os.path.abspath('xpkclusters/gcluster-out') diff --git a/src/xpk/commands/common.py b/src/xpk/commands/common.py index f1c3c4098..d927f84c6 100644 --- a/src/xpk/commands/common.py +++ b/src/xpk/commands/common.py @@ -15,7 +15,7 @@ """ from ..core.commands import run_command_with_updates_retry -from ..core.core import zone_to_region +from ..core.gcloud_context import zone_to_region from ..utils.console import xpk_print diff --git a/src/xpk/commands/info.py b/src/xpk/commands/info.py index 807f4d75a..abdb2e5d0 100644 --- a/src/xpk/commands/info.py +++ b/src/xpk/commands/info.py @@ -14,19 +14,17 @@ limitations under the License. """ -from ..utils.console import xpk_exit, xpk_print -from ..core.kueue import verify_kueuectl -from .common import set_cluster_command -from ..core.commands import ( - run_command_for_value, -) -from ..core.core import ( - add_zone_and_project, -) import json -from tabulate import tabulate from argparse import Namespace +from tabulate import tabulate + +from ..core.commands import run_command_for_value +from ..core.gcloud_context import add_zone_and_project +from ..core.kueue import verify_kueuectl +from ..utils.console import xpk_exit, xpk_print +from .common import set_cluster_command + table_fmt = 'plain' diff --git a/src/xpk/commands/inspector.py b/src/xpk/commands/inspector.py index 14afed38b..580aeff36 100644 --- a/src/xpk/commands/inspector.py +++ b/src/xpk/commands/inspector.py @@ -14,17 +14,13 @@ limitations under the License. 
""" +from ..core.cluster import get_cluster_credentials from ..core.commands import run_command_for_value -from ..core.core import ( - CLUSTER_METADATA_CONFIGMAP, - CLUSTER_RESOURCES_CONFIGMAP, - add_zone_and_project, - get_cluster_credentials, - zone_to_region, -) +from ..core.gcloud_context import add_zone_and_project, zone_to_region from ..core.kueue import CLUSTER_QUEUE_NAME, LOCAL_QUEUE_NAME -from ..utils.file import append_tmp_file, write_tmp_file +from ..core.resources import CLUSTER_METADATA_CONFIGMAP, CLUSTER_RESOURCES_CONFIGMAP from ..utils.console import xpk_exit, xpk_print +from ..utils.file import append_tmp_file, write_tmp_file from .workload import get_workload_list diff --git a/src/xpk/commands/job.py b/src/xpk/commands/job.py index 3990c7478..7bde661ab 100644 --- a/src/xpk/commands/job.py +++ b/src/xpk/commands/job.py @@ -14,16 +14,18 @@ limitations under the License. """ -from .common import set_cluster_command -from .kind import set_local_cluster_command -from ..core.commands import run_command_for_value, run_command_with_updates -from ..utils.console import xpk_exit, xpk_print -from ..core.kjob import AppProfileDefaults -from ..core.core import add_zone_and_project -from ruamel.yaml import YAML import re import sys +from ruamel.yaml import YAML + +from ..core.commands import run_command_for_value, run_command_with_updates +from ..core.gcloud_context import add_zone_and_project +from ..core.kjob import AppProfileDefaults +from ..utils.console import xpk_exit, xpk_print +from .common import set_cluster_command +from .kind import set_local_cluster_command + def job_info(args): """Run commands obtaining information about a job given by name. diff --git a/src/xpk/commands/kind.py b/src/xpk/commands/kind.py index 07c0ab285..9a42bdfc7 100644 --- a/src/xpk/commands/kind.py +++ b/src/xpk/commands/kind.py @@ -18,7 +18,7 @@ run_command_for_value, run_command_with_updates, ) -from ..core.core import ( +from ..core.cluster import ( set_jobset_on_cluster, ) from ..core.kjob import ( diff --git a/src/xpk/commands/run.py b/src/xpk/commands/run.py index 43eaff0f3..fb5074436 100644 --- a/src/xpk/commands/run.py +++ b/src/xpk/commands/run.py @@ -16,12 +16,12 @@ from argparse import Namespace +from ..core.commands import run_command_with_full_controls +from ..core.gcloud_context import add_zone_and_project +from ..core.kjob import AppProfileDefaults from ..core.kueue import LOCAL_QUEUE_NAME from ..utils.console import xpk_exit, xpk_print from .common import set_cluster_command -from ..core.core import add_zone_and_project -from ..core.kjob import AppProfileDefaults -from ..core.commands import run_command_with_full_controls from .kind import set_local_cluster_command diff --git a/src/xpk/commands/storage.py b/src/xpk/commands/storage.py index 10d141b52..c4c3f6e83 100644 --- a/src/xpk/commands/storage.py +++ b/src/xpk/commands/storage.py @@ -19,7 +19,7 @@ from kubernetes import client as k8s_client from kubernetes.client.rest import ApiException -from ..core.core import ( +from ..core.cluster import ( setup_k8s_env, update_cluster_with_gcsfuse_driver_if_necessary, update_cluster_with_workload_identity_if_necessary, diff --git a/src/xpk/commands/version.py b/src/xpk/commands/version.py index bfd77c58e..4e7ed4590 100644 --- a/src/xpk/commands/version.py +++ b/src/xpk/commands/version.py @@ -14,7 +14,7 @@ limitations under the License. 
""" -from ..core.core import __version__ +from ..core.config import __version__ from ..utils.console import xpk_print diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 7149f489d..f5a88efdb 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -14,41 +14,26 @@ limitations under the License. """ -from ..core.commands import ( - run_command_with_updates, - run_commands, +from ..core.cluster import ( + create_k8s_service_account, + get_cluster_credentials, + setup_k8s_env, ) -from ..core.core import ( - CLUSTER_METADATA_CONFIGMAP, +from ..core.commands import run_command_with_updates, run_commands +from ..core.config import ( GCS_FUSE_ANNOTATION, VERTEX_TENSORBOARD_FEATURE_FLAG, - AcceleratorTypeToAcceleratorCharacteristics, - add_zone_and_project, - check_if_workload_can_schedule, - check_if_workload_exists, - create_accelerator_label, - create_k8s_service_account, - create_machine_label, - create_vertex_experiment, - get_cluster_configmap, - get_cluster_credentials, - get_cpu_affinity, - get_gke_outlier_dashboard, - get_gpu_rxdm_cmd, - get_gpu_rxdm_image, - get_gpu_scheduler, - get_gpu_tcp_volume, - get_gpu_volume, + XPK_CURRENT_VERSION, + parse_env_config, +) +from ..core.docker_container import ( get_main_container_docker_image, get_user_workload_container, - get_volumes, - parse_env_config, - setup_k8s_env, - wait_for_job_completion, - xpk_current_version, - zone_to_region, ) +from ..core.docker_resources import get_volumes +from ..core.gcloud_context import add_zone_and_project from ..core.kueue import LOCAL_QUEUE_NAME +from ..core.monitoring import get_gke_outlier_dashboard from ..core.nap import ( get_autoprovisioning_node_selector_args, is_autoprovisioning_enabled, @@ -62,6 +47,14 @@ get_pathways_worker_args, get_user_workload_for_pathways, ) +from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap +from ..core.scheduling import ( + check_if_workload_can_schedule, + create_accelerator_label, + create_machine_label, + get_cpu_affinity, + get_gpu_scheduler, +) from ..core.storage import ( GCS_FUSE_TYPE, XPK_SA, @@ -75,15 +68,26 @@ ) from ..core.system_characteristics import ( AcceleratorType, + AcceleratorTypeToAcceleratorCharacteristics, get_system_characteristics, ) -from ..core.workload import get_workload_list +from ..core.vertex import create_vertex_experiment +from ..core.workload import ( + check_if_workload_exists, + get_gpu_rxdm_cmd, + get_gpu_rxdm_image, + get_gpu_tcp_volume, + get_gpu_volume, + get_workload_list, + wait_for_job_completion, + zone_to_region, +) +from ..core.workload_decorators import rdma_decorator, tcpxo_decorator from ..utils.console import get_user_input, xpk_exit, xpk_print from ..utils.file import write_tmp_file -from ..core.workload_decorators import tcpxo_decorator, rdma_decorator from . 
import cluster_gcluster -workload_create_yaml = """apiVersion: jobset.x-k8s.io/v1alpha2 +WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2 kind: JobSet metadata: name: {args.workload} @@ -132,7 +136,7 @@ """ -gpu_workload_create_yaml = """apiVersion: jobset.x-k8s.io/v1alpha2 +GPU_WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2 kind: JobSet metadata: name: {args.workload} @@ -199,7 +203,7 @@ {container} """ -a3_gpu_workload_create_yaml = """apiVersion: jobset.x-k8s.io/v1alpha2 +A3_GPU_WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2 kind: JobSet metadata: name: {args.workload} @@ -239,7 +243,7 @@ {container} """ -pw_workload_create_yaml = """apiVersion: jobset.x-k8s.io/v1alpha2 +PW_WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2 kind: JobSet metadata: name: {args.workload} @@ -450,12 +454,12 @@ def workload_create(args) -> None: cluster_xpk_version = cluster_config_map.get('xpk_version') if ( cluster_xpk_version is not None - and cluster_xpk_version != xpk_current_version + and cluster_xpk_version != XPK_CURRENT_VERSION ): xpk_print( 'Warning: Cluster has been created using XPK version:' f' {cluster_config_map["xpk_version"]} but the XPK version you are' - f' using to schedule workload is: {xpk_current_version}. Some features' + f' using to schedule workload is: {XPK_CURRENT_VERSION}. Some features' ' might not be available for this cluster. We recommend to' ' upgrade/downgrade your XPK version or cluster by running `xpk' ' cluster create`.' @@ -526,7 +530,7 @@ def workload_create(args) -> None: xpk_exit(return_code) if system.device_type in cluster_gcluster.supported_device_types: - yml_string = a3_gpu_workload_create_yaml.format( + yml_string = A3_GPU_WORKLOAD_CREATE_YAML.format( args=args, container=container, service_account=XPK_SA, @@ -552,7 +556,7 @@ def workload_create(args) -> None: yml_string, gcs_fuse_storages ) else: - yml_string = gpu_workload_create_yaml.format( + yml_string = GPU_WORKLOAD_CREATE_YAML.format( args=args, container=container, command=args.command, @@ -575,7 +579,7 @@ def workload_create(args) -> None: elif args.use_pathways and ensure_pathways_workload_prerequisites( args, system ): - yml_string = pw_workload_create_yaml.format( + yml_string = PW_WORKLOAD_CREATE_YAML.format( args=args, system=system, accelerator_label=create_accelerator_label( @@ -606,7 +610,7 @@ def workload_create(args) -> None: container, debugging_dashboard_id = get_user_workload_container( args, system ) - yml_string = workload_create_yaml.format( + yml_string = WORKLOAD_CREATE_YAML.format( args=args, system=system, container=container, diff --git a/src/xpk/core/blueprint/blueprint_generator.py b/src/xpk/core/blueprint/blueprint_generator.py index 84cd59111..6971b4f92 100644 --- a/src/xpk/core/blueprint/blueprint_generator.py +++ b/src/xpk/core/blueprint/blueprint_generator.py @@ -14,21 +14,22 @@ limitations under the License. 
""" +import os import shutil from typing import Optional + from ruamel import yaml -import os -from .blueprint_definitions import DeploymentGroup, DeploymentModule, Blueprint -from ..system_characteristics import get_system_characteristics_by_device_type -from ...utils.console import xpk_print, xpk_exit +from ...utils.console import xpk_exit, xpk_print from ...utils.file import ensure_directory_exists -from ..core import CapacityType, h100_mega_device_type, h200_device_type +from ..capacity import H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE, CapacityType +from ..system_characteristics import get_system_characteristics_by_device_type +from .blueprint_definitions import Blueprint, DeploymentGroup, DeploymentModule yaml = yaml.YAML() -a3mega_device_type = h100_mega_device_type -a3ultra_device_type = h200_device_type +a3mega_device_type = H100_MEGA_DEVICE_TYPE +a3ultra_device_type = H200_DEVICE_TYPE supported_device_types = {a3mega_device_type, a3ultra_device_type} blueprint_dependencies_dir = { a3mega_device_type: "src/xpk/blueprints/a3mega", diff --git a/src/xpk/core/capacity.py b/src/xpk/core/capacity.py new file mode 100644 index 000000000..3106ffc8d --- /dev/null +++ b/src/xpk/core/capacity.py @@ -0,0 +1,185 @@ +""" +Copyright 2025 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import enum + +from ..utils.console import xpk_print +from .commands import run_command_with_updates + +AUTOPROVISIONING_CONFIG_VALUE = 'AUTOPROVISION' +AUTOPROVISIONING_CONFIG_MINIMUM_KEY = 'minimum_chips' +AUTOPROVISIONING_CONFIG_MAXIMUM_KEY = 'maximum_chips' +CAPACITY_TYPE_CONFIG_KEY = 'capacity_type' + +H100_DEVICE_TYPE = 'h100-80gb-8' +H100_MEGA_DEVICE_TYPE = 'h100-mega-80gb-8' +H200_DEVICE_TYPE = 'h200-141gb-8' +RESERVATION_CONFIG_KEY = 'reservation_id' + + +class CapacityType(enum.Enum): + ON_DEMAND = 'on_demand' + RESERVATION = 'reservation' + SPOT = 'spot' + UNKNOWN = 'unknown' + + +def print_reservations(args) -> int: + """Print the reservations in the project. + + Args: + args: user provided arguments for running the command. + + Returns: + 0 if successful and 1 otherwise. + """ + command = f'gcloud beta compute reservations list --project={args.project}' + return_code = run_command_with_updates( + command, 'Get all reservations in the project', args + ) + if return_code != 0: + xpk_print(f'Get all reservations returned ERROR {return_code}') + return 1 + return 0 + + +def get_capacity_type(args) -> tuple[CapacityType, int]: + """Determine the capacity type based on user arguments. + + Args: + args: user provided arguments for running the command. + + Returns: + Tuple with string with the system characteristics and + int of 0 if successful and 1 otherwise. + """ + capacity_type = CapacityType.UNKNOWN + num_types = 0 + return_code = 0 + + # Determine the capacity argument. 
+ if args.on_demand: + capacity_type = CapacityType.ON_DEMAND + num_types += 1 + if args.reservation: + return_code = verify_reservation_exists(args) + if return_code > 0: + return capacity_type, return_code + capacity_type = CapacityType.RESERVATION + num_types += 1 + if args.spot: + capacity_type = CapacityType.SPOT + num_types += 1 + + # Check that the number of user arguments provided is valid. + if num_types == 0: + capacity_type = CapacityType.UNKNOWN + elif num_types != 1: + xpk_print( + 'ERROR: User specified more than one of the following arguments. Please' + ' specify only one of `--reservation=$RESERVATION_NAME`, `--on-demand`' + ' or `--spot`.' + ) + return_code = 1 + + return capacity_type, return_code + + +def verify_reservation_exists(args) -> int: + """Verify the reservation exists. + + Args: + args: user provided arguments for running the command. + + Returns: + 0 if successful and 1 otherwise. + """ + command = ( + f'gcloud beta compute reservations describe {args.reservation}' + f' --project={args.project} --zone={args.zone}' + ) + return_code = run_command_with_updates(command, 'Describe reservation', args) + if return_code != 0: + xpk_print(f'Describe reservation returned ERROR {return_code}') + xpk_print('Please confirm that your reservation name is correct.') + return 1 + return 0 + + +def get_capacity_arguments_from_capacity_type( + args, capacity_type: CapacityType +) -> tuple[str, int]: + """Determine the TPU Nodepool creation capacity arguments needed. + + Args: + args: user provided arguments for running the command. + capacity_type: The type of capacity the user configured. + + Returns: + Tuple with string with the capacity argument to use and + int of 0 if successful and 1 otherwise. + """ + capacity_args = '' + return_code = 0 + + match capacity_type: + case CapacityType.ON_DEMAND: + capacity_args = '' + case CapacityType.SPOT: + capacity_args = '--spot' + case CapacityType.RESERVATION: + capacity_args = ( + f'--reservation-affinity=specific --reservation={args.reservation}' + ) + case _: + xpk_print( + f'Unknown capacity type: {capacity_type}. Unable to determine' + ' capacity args.' + ) + return_code = 1 + return capacity_args, return_code + + +def get_capacity_node_selectors_from_capacity_type( + args, capacity_type: str +) -> tuple[str, int]: + """Determine the node selectors for a workload to run on a specific capacity type. + + Args: + args: user provided arguments for running the command. + capacity_type: The type of capacity the user configured. + + Returns: + Tuple with string with the node selectors to use and + int of 0 if successful and 1 otherwise. + """ + node_selector = '' + return_code = 0 + + match capacity_type: + case CapacityType.ON_DEMAND.name: + node_selector = '' + case CapacityType.SPOT.name: + node_selector = 'cloud.google.com/gke-spot="true"' + case CapacityType.RESERVATION.name: + node_selector = f'cloud.google.com/reservation-name: {args.reservation}' + case _: + xpk_print( + f'Unknown capacity type: {capacity_type}. Unable to determine the' + ' node selectors.' + ) + return_code = 1 + return node_selector, return_code diff --git a/src/xpk/core/cluster.py b/src/xpk/core/cluster.py new file mode 100644 index 000000000..163917728 --- /dev/null +++ b/src/xpk/core/cluster.py @@ -0,0 +1,460 @@ +""" +Copyright 2025 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from google.api_core.exceptions import PermissionDenied +from google.cloud import resourcemanager_v3 +from kubernetes import client as k8s_client +from kubernetes import config +from kubernetes.client.exceptions import ApiException + +from ..utils.console import xpk_exit, xpk_print +from .capacity import H100_DEVICE_TYPE +from .commands import ( + run_command_for_value, + run_command_with_updates, + run_command_with_updates_retry, +) +from .gcloud_context import add_zone_and_project, get_gke_server_config, zone_to_region +from .nodepool import upgrade_gke_nodepools_version +from .system_characteristics import SystemCharacteristics + +JOBSET_VERSION = 'v0.7.2' +INSTALLER_NCC_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml' +INSTALLER_NCC_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml' + + +# TODO(vbarr): Remove this function when jobsets gets enabled by default on +# GKE clusters. +def set_jobset_on_cluster(args) -> int: + """Add jobset command on server side and ask user to verify it is created. + + Args: + args: user provided arguments for running the command. + + Returns: + 0 if successful and 1 otherwise. + """ + command = ( + 'kubectl apply --server-side -f' + f' https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml' + ) + task = f'Install Jobset on {args.cluster}' + return_code = run_command_with_updates_retry(command, task, args) + + if return_code != 0: + xpk_print(f'{task} returned with ERROR {return_code}.\n') + xpk_print( + "This LIKELY means you're missing Kubernetes Permissions, you can" + ' validate this by checking if the error references permission problems' + ' such as `requires one of ["container.*"] permission(s)`. Follow our' + ' readme:' + ' https://github.com/google/xpk/blob/main/README.md#troubleshooting for' + ' instructions on how to fix these permissions.' + ) + return return_code + + +def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int: + """Install NCCL plugin on the cluster. + + Args: + args: user provided arguments for running the command. + system: system characteristics. + + Returns: + 0 if successful and 1 otherwise. + """ + if system.device_type == H100_DEVICE_TYPE: + command = f'kubectl apply -f {INSTALLER_NCC_TCPX}' + else: + command = f'kubectl apply -f {INSTALLER_NCC_TCPXO}' + + return_code = run_command_with_updates( + command, 'Install NCCL Plugin On Cluster', args + ) + + if return_code != 0: + xpk_print( + f'Install NCCL Plugin On Cluster request returned ERROR {return_code}' + ) + return 1 + + return 0 + + +def get_all_clusters_programmatic(args) -> tuple[list[str], int]: + """Gets all the clusters associated with the project / region. + + Args: + args: user provided arguments for running the command. + + Returns: + List of cluster names and 0 if successful and 1 otherwise. 
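+
+  Example (illustrative sketch only, mirroring how callers in this module
+  use the helper):
+    clusters, return_code = get_all_clusters_programmatic(args)
+    if return_code > 0:
+      xpk_exit(return_code)
+    if args.cluster not in clusters:
+      xpk_print(f'Cluster {args.cluster} not found.')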
+ """ + command = ( + 'gcloud container clusters list' + f' --project={args.project} --region={zone_to_region(args.zone)}' + ' --format="csv[no-heading](name)"' + ) + return_code, raw_cluster_output = run_command_for_value( + command, 'Find if Cluster Exists', args + ) + if return_code != 0: + xpk_print(f'Find if Cluster Exists returned ERROR {return_code}') + return [], return_code + + return raw_cluster_output.splitlines(), 0 + + +def project_id_to_project_number(project_id: str) -> str: + client = resourcemanager_v3.ProjectsClient() + request = resourcemanager_v3.GetProjectRequest() + request.name = f'projects/{project_id}' + try: + response = client.get_project(request=request) + except PermissionDenied as e: + xpk_print( + f"Couldn't translate project id: {project_id} to project number." + f' Error: {e}' + ) + xpk_exit(1) + parts = response.name.split('/', 1) + xpk_print(f'Project number for project: {project_id} is {parts[1]}') + return str(parts[1]) + + +def setup_k8s_env(args) -> k8s_client.ApiClient: + add_zone_and_project(args) + get_cluster_credentials(args) + args.project_number = project_id_to_project_number(args.project) + + config.load_kube_config() + return k8s_client.ApiClient() # pytype: disable=bad-return-type + + +def create_k8s_service_account(name: str, namespace: str) -> None: + k8s_core_client = k8s_client.CoreV1Api() + sa = k8s_client.V1ServiceAccount(metadata=k8s_client.V1ObjectMeta(name=name)) + + xpk_print(f'Creating a new service account: {name}') + try: + k8s_core_client.create_namespaced_service_account( + namespace, sa, pretty=True + ) + xpk_print(f'Created a new service account: {sa} successfully') + except ApiException: + xpk_print(f'Service account: {name} already exists. Skipping its creation') + + +def update_gke_cluster_with_clouddns(args) -> int: + """Run the GKE cluster update command for existing clusters and enable CloudDNS. + + Args: + args: user provided arguments for running the command. + + Returns: + 0 if successful and 1 otherwise. + """ + command = ( + 'gcloud container clusters update' + f' {args.cluster} --project={args.project}' + f' --region={zone_to_region(args.zone)}' + ' --cluster-dns=clouddns' + ' --cluster-dns-scope=vpc' + f' --cluster-dns-domain={args.cluster}-domain' + ' --quiet' + ) + xpk_print('Updating GKE cluster to use Cloud DNS, may take a while!') + return_code = run_command_with_updates( + command, 'GKE Cluster Update to enable Cloud DNS', args + ) + if return_code != 0: + xpk_print(f'GKE Cluster Update request returned ERROR {return_code}') + return 1 + return 0 + + +def update_gke_cluster_with_workload_identity_enabled(args) -> int: + """Run the GKE cluster update command for existing cluster and enable Workload Identity Federation. + Args: + args: user provided arguments for running the command. + Returns: + 0 if successful and 1 otherwise. + """ + command = ( + 'gcloud container clusters update' + f' {args.cluster} --project={args.project}' + f' --region={zone_to_region(args.zone)}' + f' --workload-pool={args.project}.svc.id.goog' + ' --quiet' + ) + xpk_print( + 'Updating GKE cluster to enable Workload Identity Federation, may take a' + ' while!' 
+ ) + return_code = run_command_with_updates( + command, 'GKE Cluster Update to enable Workload Identity Federation', args + ) + if return_code != 0: + xpk_print(f'GKE Cluster Update request returned ERROR {return_code}') + return 1 + return 0 + + +def update_gke_cluster_with_gcsfuse_driver_enabled(args) -> int: + """Run the GKE cluster update command for existing cluster and enable GCSFuse CSI driver. + Args: + args: user provided arguments for running the command. + Returns: + 0 if successful and 1 otherwise. + """ + command = ( + 'gcloud container clusters update' + f' {args.cluster} --project={args.project}' + f' --region={zone_to_region(args.zone)}' + ' --update-addons GcsFuseCsiDriver=ENABLED' + ' --quiet' + ) + xpk_print( + 'Updating GKE cluster to enable GCSFuse CSI driver, may take a while!' + ) + return_code = run_command_with_updates( + command, 'GKE Cluster Update to enable GCSFuse CSI driver', args + ) + if return_code != 0: + xpk_print(f'GKE Cluster Update request returned ERROR {return_code}') + return 1 + return 0 + + +def upgrade_gke_control_plane_version(args, default_rapid_gke_version) -> int: + """Upgrade GKE cluster's control plane version before updating nodepools to use CloudDNS. + + Args: + args: user provided arguments for running the command. + default_rapid_gke_version: Rapid default version for the upgrade. + + Returns: + 0 if successful and 1 otherwise. + """ + command = ( + 'gcloud container clusters upgrade' + f' {args.cluster} --project={args.project}' + f' --region={zone_to_region(args.zone)}' + f' --cluster-version={default_rapid_gke_version}' + ' --master' + ' --quiet' + ) + xpk_print("Updating GKE cluster's control plane version, may take a while!") + return_code = run_command_with_updates( + command, + 'GKE Cluster control plane version update to enable Cloud DNS', + args, + ) + if return_code != 0: + xpk_print( + "GKE cluster's control plane version update request returned" + f' ERROR {return_code}' + ) + return 1 + return 0 + + +def is_cluster_using_clouddns(args) -> bool: + """Checks if cluster is using CloudDNS. + Args: + args: user provided arguments for running the command. + + Returns: + True if cluster is using CloudDNS and False otherwise. + """ + command = ( + f'gcloud container clusters describe {args.cluster}' + f' --project={args.project} --region={zone_to_region(args.zone)}' + ' 2> /dev/null | grep "clusterDns: CLOUD_DNS"' + ) + return_code, _ = run_command_for_value( + command, + 'Check if Cloud DNS is enabled in cluster describe.', + args, + ) + if return_code == 0: + xpk_print('Cloud DNS is enabled on the cluster, no update needed.') + return True + return False + + +def is_workload_identity_enabled_on_cluster(args) -> bool: + """Checks if Workload Identity Federation is enabled on the cluster. + Args: + args: user provided arguments for running the command. + Returns: + True if Workload Identity Federation is enabled on the cluster and False otherwise. + """ + command = ( + f'gcloud container clusters describe {args.cluster}' + f' --project={args.project} --region={zone_to_region(args.zone)}' + ' --format="value(workloadIdentityConfig.workloadPool)"' + ) + return_code, workload_pool = run_command_for_value( + command, + 'Checks if Workload Identity Federation is enabled in cluster describe.', + args, + ) + if return_code != 0: + xpk_exit(return_code) + if workload_pool == f'{args.project}.svc.id.goog': + xpk_print( + 'Workload Identity Federation is enabled on the cluster, no update' + ' needed.' 
+ ) + return True + return False + + +def is_gcsfuse_driver_enabled_on_cluster(args) -> bool: + """Checks if GCSFuse CSI driver is enabled on the cluster. + Args: + args: user provided arguments for running the command. + Returns: + True if GCSFuse CSI driver is enabled on the cluster and False otherwise. + """ + command = ( + f'gcloud container clusters describe {args.cluster}' + f' --project={args.project} --region={zone_to_region(args.zone)}' + ' --format="value(addonsConfig.gcsFuseCsiDriverConfig.enabled)"' + ) + return_code, gcsfuse_driver_enabled = run_command_for_value( + command, + 'Checks if GCSFuse CSI driver is enabled in cluster describe.', + args, + ) + if return_code != 0: + xpk_exit(return_code) + if gcsfuse_driver_enabled.lower() == 'true': + xpk_print('GCSFuse CSI driver is enabled on the cluster, no update needed.') + return True + return False + + +def update_cluster_with_clouddns_if_necessary(args) -> int: + """Updates a GKE cluster to use CloudDNS, if not enabled already. + + Args: + args: user provided arguments for running the command. + + Returns: + 0 if successful and error code otherwise. + """ + all_clusters, return_code = get_all_clusters_programmatic(args) + if return_code > 0: + xpk_print('Listing all clusters failed!') + return 1 + if args.cluster in all_clusters: + # If cluster is already using clouddns, no update necessary! + if is_cluster_using_clouddns(args): + return 0 + cluster_update_return_code = update_gke_cluster_with_clouddns(args) + if cluster_update_return_code > 0: + xpk_print('Updating GKE cluster to use CloudDNS failed!') + return cluster_update_return_code + + # Find default rapid control plane version and update the control plane to the same. + server_config_return_code, gke_server_config = get_gke_server_config(args) + if server_config_return_code != 0: + xpk_exit(server_config_return_code) + upgrade_master_return_code = upgrade_gke_control_plane_version( + args, gke_server_config.default_rapid_gke_version # pytype: disable=attribute-error + ) + if upgrade_master_return_code > 0: + xpk_print("Updating GKE cluster's control plane upgrade failed!") + return upgrade_master_return_code + + # Upgrade nodepools version after the master upgrade. + node_pool_update_code = upgrade_gke_nodepools_version( + args, gke_server_config.default_rapid_gke_version # pytype: disable=attribute-error + ) + if node_pool_update_code > 0: + xpk_print('Upgrading nodepools version failed!') + return node_pool_update_code + return 0 + + +def update_cluster_with_workload_identity_if_necessary(args) -> int: + """Updates a GKE cluster to enable Workload Identity Federation, if not enabled already. + Args: + args: user provided arguments for running the command. + Returns: + 0 if successful and error code otherwise. + """ + + if is_workload_identity_enabled_on_cluster(args): + return 0 + cluster_update_return_code = ( + update_gke_cluster_with_workload_identity_enabled(args) + ) + if cluster_update_return_code > 0: + xpk_print( + 'Updating GKE cluster to enable Workload Identity Federation failed!' + ) + return cluster_update_return_code + + return 0 + + +def update_cluster_with_gcsfuse_driver_if_necessary(args) -> int: + """Updates a GKE cluster to enable GCSFuse CSI driver, if not enabled already. + Args: + args: user provided arguments for running the command. + Returns: + 0 if successful and error code otherwise. 
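+
+  Example (illustrative sketch):
+    if update_cluster_with_gcsfuse_driver_if_necessary(args) != 0:
+      xpk_exit(1)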
+ """ + + if is_gcsfuse_driver_enabled_on_cluster(args): + return 0 + cluster_update_return_code = update_gke_cluster_with_gcsfuse_driver_enabled( + args + ) + if cluster_update_return_code > 0: + xpk_print('Updating GKE cluster to enable GCSFuse CSI driver failed!') + return cluster_update_return_code + + return 0 + + +def get_cluster_credentials(args) -> None: + """Run cluster configuration command to set the kubectl config. + + Args: + args: user provided arguments for running the command. + + Returns: + 0 if successful and 1 otherwise. + """ + command = ( + 'gcloud container clusters get-credentials' + f' {args.cluster} --region={zone_to_region(args.zone)}' + f' --project={args.project} &&' + ' kubectl config view && kubectl config set-context --current' + ' --namespace=default' + ) + task = f'get-credentials to cluster {args.cluster}' + return_code = run_command_with_updates_retry( + command, task, args, verbose=False + ) + if return_code != 0: + xpk_print(f'{task} returned ERROR {return_code}') + xpk_exit(return_code) diff --git a/src/xpk/core/cluster_private.py b/src/xpk/core/cluster_private.py index 70f3c9dd9..9c15c632b 100644 --- a/src/xpk/core/cluster_private.py +++ b/src/xpk/core/cluster_private.py @@ -14,11 +14,14 @@ limitations under the License. """ -from .core import zone_to_region -from .commands import run_command_for_value, run_command_with_updates from ..utils.console import xpk_exit, xpk_print -from ..utils.network import add_current_machine_to_networks, is_current_machine_in_any_network +from ..utils.network import ( + add_current_machine_to_networks, + is_current_machine_in_any_network, +) from ..utils.objects import is_text_true +from .commands import run_command_for_value, run_command_with_updates +from .gcloud_context import zone_to_region def authorize_private_cluster_access_if_necessary(args) -> int: diff --git a/src/xpk/core/config.py b/src/xpk/core/config.py index ab293f961..5708f76f3 100644 --- a/src/xpk/core/config.py +++ b/src/xpk/core/config.py @@ -14,24 +14,35 @@ limitations under the License. 
""" -import ruamel.yaml +import importlib.metadata as importlib_metadata import os +import re + +import ruamel.yaml from ..utils import file from ..utils.console import xpk_print +from .system_characteristics import AcceleratorType, SystemCharacteristics +# This is the version for XPK PyPI package +__version__ = importlib_metadata.version('xpk') +XPK_CURRENT_VERSION = __version__.split('+')[0] +XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml') + +CONFIGS_KEY = 'configs' CFG_BUCKET_KEY = 'cluster-state-gcs-bucket' CLUSTER_NAME_KEY = 'cluster-name' PROJECT_KEY = 'project-id' ZONE_KEY = 'zone' -CONFIGS_KEY = 'configs' -XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml') -default_keys = [ +DEFAULT_KEYS = [ CFG_BUCKET_KEY, CLUSTER_NAME_KEY, PROJECT_KEY, ZONE_KEY, ] +VERTEX_TENSORBOARD_FEATURE_FLAG = XPK_CURRENT_VERSION >= '0.4.0' +GCS_FUSE_ANNOTATION = 'gke-gcsfuse/volumes: "true"' + yaml = ruamel.yaml.YAML() @@ -41,7 +52,7 @@ class XpkConfig: def __init__(self, custom_config_file: str = XPK_CONFIG_FILE) -> None: self._config = custom_config_file - self._allowed_keys = default_keys + self._allowed_keys = DEFAULT_KEYS def _open_configs(self) -> dict | None: dir_path = '/'.join(self._config.split('/')[:-1]) @@ -91,3 +102,65 @@ def get_all( return None val: dict[str, str] = config_yaml[CONFIGS_KEY] return val + + +def parse_env_config(args, tensorboard_config, system: SystemCharacteristics): + """Parses the environment configurations to the jobset config. + + Args: + args: user provided arguments for running the command. + tensorboard_config: configuration of Vertex Tensorboard. + system: system characteristics. + """ + env = {} + + env_pat = re.compile(r'(^[a-zA-Z_][a-zA-Z0-9_]*?)(?:=(.*))?$', re.M) + if args.env_file: + print('Setting container environment from', args.env_file) + with open(file=args.env_file, mode='r', encoding='utf-8') as f: + for match in env_pat.finditer(f.read()): + variable = match.group(1) + if match.group(2) is not None: + env[variable] = match.group(2) + else: + assert variable in os.environ, ( + f'Variable {variable} is not set in the current ' + 'environment, a value must be specified.' + ) + env[variable] = os.environ[variable] + if args.env: + for var in args.env: + match = env_pat.match(var) + assert match and match.group(2) is not None, ( + 'Invalid environment variable, format must be ' + f'`--env VARIABLE=value`: {var}' + ) + variable = match.group(1) + env[variable] = match.group(2) + + if not args.use_pathways: + if args.debug_dump_gcs: + if 'XLA_FLAGS' in env: + raise ValueError( + 'Conflict: XLA_FLAGS defined in both --debug_dump_gcs ' + 'and environment file. Please choose one way to define ' + 'XLA_FLAGS.' 
+        )
+      env['XLA_FLAGS'] = '--xla_dump_to=/tmp/xla_dump/'
+
+  if tensorboard_config:
+    env['UPLOAD_DATA_TO_TENSORBOARD'] = True
+    for key, value in tensorboard_config.items():
+      env[key.upper()] = value
+
+  if system.accelerator_type == AcceleratorType['GPU']:
+    # For GPUs, it has two more spaces ahead of name and value respectively
+    env_format = '''
+                - name: {key}
+                  value: "{value}"'''
+  else:
+    env_format = '''
+              - name: {key}
+                value: "{value}"'''
+
+  args.env = ''.join(env_format.format(key=k, value=v) for k, v in env.items())
diff --git a/src/xpk/core/core.py b/src/xpk/core/core.py
deleted file mode 100644
index ed590f129..000000000
--- a/src/xpk/core/core.py
+++ /dev/null
@@ -1,3324 +0,0 @@
-"""
-Copyright 2023 Google LLC
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    https://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
-r"""xpk (Accelerated Processing Kit).
-
-Next Steps:
-- Cluster describe is broken by Cacheimage since that counts as a workload.
-- Cluster describe: count by jobset.
-- If any instance goes down, bring down the whole job.
-- How to more gracefully handle job failures, distinguishing between software
-  and infra?
-- Look into --docker-name and --docker-image.
-  Shouldn't one string be adequate to express what we want?
-- Apply learnings from about private, region, coredns, etc:
-- Enable special preheater
-- Make Argparse logic this a function?
-  - Obvious logic that starts in main instead of here in code but args will
-    not be a universal argument.
-""" - -import datetime -import enum -import os -import random -import re -import string -import subprocess -import sys -import importlib.metadata as importlib_metadata -from argparse import Namespace -from dataclasses import dataclass - -from ..utils.file import write_tmp_file -from ..utils.console import get_user_input, xpk_exit, xpk_print -from google.api_core.exceptions import PermissionDenied -from google.cloud import resourcemanager_v3 -from kubernetes import client as k8s_client -from kubernetes import config -from kubernetes.client.exceptions import ApiException - -from .commands import ( - run_command_for_value, - run_command_with_updates, - run_command_with_updates_retry, - run_commands, -) -from .storage import Storage, get_storages_to_mount, GCS_FUSE_TYPE -from .system_characteristics import ( - AcceleratorType, - AcceleratorTypeToAcceleratorCharacteristics, - SystemCharacteristics, -) - -################### Internally used constants ############## - -default_docker_image = 'python:3.10' -default_script_dir = os.getcwd() -# This is the version for XPK PyPI package -__version__ = importlib_metadata.version('xpk') - -xpk_current_version = __version__.split('+')[0] - -h100_device_type = 'h100-80gb-8' -h100_mega_device_type = 'h100-mega-80gb-8' -h200_device_type = 'h200-141gb-8' - -JOBSET_VERSION = 'v0.7.2' - -CAPACITY_TYPE_CONFIG_KEY = 'capacity_type' -RESERVATION_CONFIG_KEY = 'reservation_id' -_DEFAULT_POOL_NAME = 'default-pool' -CLUSTER_RESOURCES_CONFIGMAP = 'resources-configmap' -CLUSTER_METADATA_CONFIGMAP = 'metadata-configmap' -VERTEX_TENSORBOARD_FEATURE_FLAG = xpk_current_version >= '0.4.0' -DEFAULT_VERTEX_TENSORBOARD_NAME = 'tb-instance' -AUTOPROVISIONING_CONFIG_VALUE = 'AUTOPROVISION' -AUTOPROVISIONING_CONFIG_MINIMUM_KEY = 'minimum_chips' -AUTOPROVISIONING_CONFIG_MAXIMUM_KEY = 'maximum_chips' -CLOUD_PLATFORM_AUTH_SCOPE_URL = ( - '"https://www.googleapis.com/auth/cloud-platform"' -) -PLATFORM = 'linux/amd64' -GCS_FUSE_ANNOTATION = 'gke-gcsfuse/volumes: "true"' - - -class CapacityType(enum.Enum): - ON_DEMAND = 'on_demand' - RESERVATION = 'reservation' - SPOT = 'spot' - UNKNOWN = 'unknown' - - -@dataclass -class AutoprovisioningConfig: - config_filename: str - minimum_chips: int - maximum_chips: int - - -cluster_configmap_yaml = """kind: ConfigMap -apiVersion: v1 -metadata: - name: {name} -data: - {data} -""" - -# cluster_network_yaml: the config when creating the network for a3 cluster -cluster_network_yaml = """ -apiVersion: networking.gke.io/v1 -kind: Network -metadata: - name: vpc1 -spec: - parametersRef: - group: networking.gke.io - kind: GKENetworkParamSet - name: vpc1 - type: Device ---- -apiVersion: networking.gke.io/v1 -kind: Network -metadata: - name: vpc2 -spec: - parametersRef: - group: networking.gke.io - kind: GKENetworkParamSet - name: vpc2 - type: Device ---- -apiVersion: networking.gke.io/v1 -kind: Network -metadata: - name: vpc3 -spec: - parametersRef: - group: networking.gke.io - kind: GKENetworkParamSet - name: vpc3 - type: Device ---- -apiVersion: networking.gke.io/v1 -kind: Network -metadata: - name: vpc4 -spec: - parametersRef: - group: networking.gke.io - kind: GKENetworkParamSet - name: vpc4 - type: Device ---- -apiVersion: networking.gke.io/v1 -kind: GKENetworkParamSet -metadata: - name: vpc1 -spec: - vpc: {cluster_name}-net-1 - vpcSubnet: {cluster_name}-sub-1 - deviceMode: NetDevice ---- -apiVersion: networking.gke.io/v1 -kind: GKENetworkParamSet -metadata: - name: vpc2 -spec: - vpc: {cluster_name}-net-2 - vpcSubnet: {cluster_name}-sub-2 - 
deviceMode: NetDevice ---- -apiVersion: networking.gke.io/v1 -kind: GKENetworkParamSet -metadata: - name: vpc3 -spec: - vpc: {cluster_name}-net-3 - vpcSubnet: {cluster_name}-sub-3 - deviceMode: NetDevice ---- -apiVersion: networking.gke.io/v1 -kind: GKENetworkParamSet -metadata: - name: vpc4 -spec: - vpc: {cluster_name}-net-4 - vpcSubnet: {cluster_name}-sub-4 - deviceMode: NetDevice -""" - - -def add_zone_and_project(args): - """Obtains the zone and project names from gcloud configs if not defined. - - Args: - args: user provided arguments for running the command. - """ - if not args.project: - args.project = get_project() - if not args.zone: - args.zone = get_zone() - xpk_print(f'Working on {args.project} and {args.zone}') - - -def parse_env_config(args, tensorboard_config, system: SystemCharacteristics): - """Parses the environment configurations to the jobset config. - - Args: - args: user provided arguments for running the command. - tensorboard_config: configuration of Vertex Tensorboard. - system: system characteristics. - """ - env = {} - - env_pat = re.compile(r'(^[a-zA-Z_][a-zA-Z0-9_]*?)(?:=(.*))?$', re.M) - if args.env_file: - print('Setting container environment from', args.env_file) - with open(file=args.env_file, mode='r', encoding='utf-8') as f: - for match in env_pat.finditer(f.read()): - variable = match.group(1) - if match.group(2) is not None: - env[variable] = match.group(2) - else: - assert variable in os.environ, ( - f'Variable {variable} is not set in the current ' - 'environment, a value must be specified.' - ) - env[variable] = os.environ[variable] - if args.env: - for var in args.env: - match = env_pat.match(var) - assert match and match.group(2) is not None, ( - 'Invalid environment variable, format must be ' - f'`--env VARIABLE=value`: {var}' - ) - variable = match.group(1) - env[variable] = match.group(2) - - if not args.use_pathways: - if args.debug_dump_gcs: - if 'XLA_FLAGS' in env: - raise ValueError( - 'Conflict: XLA_FLAGS defined in both --debug_dump_gcs ' - 'and environment file. Please choose one way to define ' - 'XLA_FLAGS.' - ) - env['XLA_FLAGS'] = '--xla_dump_to=/tmp/xla_dump/' - - if tensorboard_config: - env['UPLOAD_DATA_TO_TENSORBOARD'] = True - for key, value in tensorboard_config.items(): - env[key.upper()] = value - - if system.accelerator_type == AcceleratorType['GPU']: - # For GPUs, it has two more spaces ahead of name and value respectively - env_format = ''' - - name: {key} - value: "{value}"''' - else: - env_format = ''' - - name: {key} - value: "{value}"''' - - args.env = ''.join(env_format.format(key=k, value=v) for k, v in env.items()) - - -def get_project(): - """Get GCE project from `gcloud config get project`. - - Returns: - The project name. 
- """ - completed_command = subprocess.run( - ['gcloud', 'config', 'get', 'project'], check=True, capture_output=True - ) - project_outputs = completed_command.stdout.decode().strip().split('\n') - if len(project_outputs) < 1 or project_outputs[-1] == '': - sys.exit( - 'You must specify the project in the project flag or set it with' - " 'gcloud config set project '" - ) - return project_outputs[ - -1 - ] # The project name lives on the last line of the output - - -def project_id_to_project_number(project_id: str) -> str: - client = resourcemanager_v3.ProjectsClient() - request = resourcemanager_v3.GetProjectRequest() - request.name = f'projects/{project_id}' - try: - response: resourcemanager_v3.Project = client.get_project(request=request) - except PermissionDenied as e: - xpk_print( - f"Couldn't translate project id: {project_id} to project number." - f' Error: {e}' - ) - xpk_exit(1) - parts = response.name.split('/', 1) - xpk_print(f'Project number for project: {project_id} is {parts[1]}') - return parts[1] - - -def get_zone(): - """Get GCE zone from `gcloud config get compute/zone`. - - Returns: - The zone name. - """ - completed_command = subprocess.run( - ['gcloud', 'config', 'get', 'compute/zone'], - check=True, - capture_output=True, - ) - zone_outputs = completed_command.stdout.decode().strip().split('\n') - if len(zone_outputs) < 1 or zone_outputs[-1] == '': - sys.exit( - "You must specify the zone in the zone flag or set it with 'gcloud" - " config set compute/zone '" - ) - return zone_outputs[-1] # The zone name lives on the last line of the output - - -def zone_to_region(zone) -> str: - """Helper function converts zone name to region name. - - Args: - zone: zone name. - - Returns: - The region name. - """ - zone_terms = zone.split('-') - return zone_terms[0] + '-' + zone_terms[1] - - -def setup_k8s_env(args: Namespace) -> k8s_client.ApiClient: - add_zone_and_project(args) - get_cluster_credentials(args) - args.project_number = project_id_to_project_number(args.project) - - config.load_kube_config() - return k8s_client.ApiClient() - - -def create_k8s_service_account(name: str, namespace: str) -> None: - k8s_core_client = k8s_client.CoreV1Api() - sa = k8s_client.V1ServiceAccount(metadata=k8s_client.V1ObjectMeta(name=name)) - - xpk_print(f'Creating a new service account: {name}') - try: - k8s_core_client.create_namespaced_service_account( - namespace, sa, pretty=True - ) - xpk_print(f'Created a new service account: {sa} successfully') - except ApiException: - xpk_print(f'Service account: {name} already exists. Skipping its creation') - - -def get_total_chips_requested_from_args( - args, system: SystemCharacteristics -) -> int: - """Return the total chips requested based on user args. - - Args: - args: user provided arguments for running the command. - system: system characteristics. - - Returns: - num of chips for the current request. - """ - if system.accelerator_type == AcceleratorType['GPU']: - num_chips = system.vms_per_slice * system.chips_per_vm * args.num_nodes - else: - num_chips = system.vms_per_slice * system.chips_per_vm * args.num_slices - - return num_chips - - -def update_gke_cluster_with_clouddns(args) -> int: - """Run the GKE cluster update command for existing clusters and enable CloudDNS. - - Args: - args: user provided arguments for running the command. - - Returns: - 0 if successful and 1 otherwise. 
- """ - command = ( - 'gcloud container clusters update' - f' {args.cluster} --project={args.project}' - f' --region={zone_to_region(args.zone)}' - ' --cluster-dns=clouddns' - ' --cluster-dns-scope=vpc' - f' --cluster-dns-domain={args.cluster}-domain' - ' --quiet' - ) - xpk_print('Updating GKE cluster to use Cloud DNS, may take a while!') - return_code = run_command_with_updates( - command, 'GKE Cluster Update to enable Cloud DNS', args - ) - if return_code != 0: - xpk_print(f'GKE Cluster Update request returned ERROR {return_code}') - return 1 - return 0 - - -def update_gke_cluster_with_workload_identity_enabled(args) -> int: - """Run the GKE cluster update command for existing cluster and enable Workload Identity Federation. - Args: - args: user provided arguments for running the command. - Returns: - 0 if successful and 1 otherwise. - """ - command = ( - 'gcloud container clusters update' - f' {args.cluster} --project={args.project}' - f' --region={zone_to_region(args.zone)}' - f' --workload-pool={args.project}.svc.id.goog' - ' --quiet' - ) - xpk_print( - 'Updating GKE cluster to enable Workload Identity Federation, may take a' - ' while!' - ) - return_code = run_command_with_updates( - command, 'GKE Cluster Update to enable Workload Identity Federation', args - ) - if return_code != 0: - xpk_print(f'GKE Cluster Update request returned ERROR {return_code}') - return 1 - return 0 - - -def update_gke_cluster_with_gcsfuse_driver_enabled(args) -> int: - """Run the GKE cluster update command for existing cluster and enable GCSFuse CSI driver. - Args: - args: user provided arguments for running the command. - Returns: - 0 if successful and 1 otherwise. - """ - command = ( - 'gcloud container clusters update' - f' {args.cluster} --project={args.project}' - f' --region={zone_to_region(args.zone)}' - ' --update-addons GcsFuseCsiDriver=ENABLED' - ' --quiet' - ) - xpk_print( - 'Updating GKE cluster to enable GCSFuse CSI driver, may take a while!' - ) - return_code = run_command_with_updates( - command, 'GKE Cluster Update to enable GCSFuse CSI driver', args - ) - if return_code != 0: - xpk_print(f'GKE Cluster Update request returned ERROR {return_code}') - return 1 - return 0 - - -def upgrade_gke_control_plane_version(args, default_rapid_gke_version) -> int: - """Upgrade GKE cluster's control plane version before updating nodepools to use CloudDNS. - - Args: - args: user provided arguments for running the command. - default_rapid_gke_version: Rapid default version for the upgrade. - - Returns: - 0 if successful and 1 otherwise. - """ - command = ( - 'gcloud container clusters upgrade' - f' {args.cluster} --project={args.project}' - f' --region={zone_to_region(args.zone)}' - f' --cluster-version={default_rapid_gke_version}' - ' --master' - ' --quiet' - ) - xpk_print("Updating GKE cluster's control plane version, may take a while!") - return_code = run_command_with_updates( - command, - 'GKE Cluster control plane version update to enable Cloud DNS', - args, - ) - if return_code != 0: - xpk_print( - "GKE cluster's control plane version update request returned" - f' ERROR {return_code}' - ) - return 1 - return 0 - - -def upgrade_gke_nodepools_version(args, default_rapid_gke_version) -> int: - """Upgrade nodepools in the cluster to default rapid gke version. Recreates the nodes. - - Args: - args: user provided arguments for running the command. - default_rapid_gke_version: Rapid default version for the upgrade. - - Returns: - 0 if successful and 1 otherwise. 
- """ - existing_node_pool_names, return_code = get_all_nodepools_programmatic(args) - if return_code != 0: - xpk_print('Listing all node pools failed!') - return return_code - - # Batch execution to upgrade node pools simultaneously - commands = [] - task_names = [] - for node_pool_name in existing_node_pool_names: - commands.append( - 'gcloud container clusters upgrade' - f' {args.cluster} --project={args.project}' - f' --region={zone_to_region(args.zone)}' - f' --cluster-version={default_rapid_gke_version}' - f' --node-pool={node_pool_name}' - ' --quiet' - ) - task_names.append(f'Upgrading node pool {node_pool_name}.') - - for i, command in enumerate(commands): - xpk_print(f'To complete {task_names[i]} we are executing {command}') - max_return_code = run_commands( - commands, 'Update GKE node pools to default RAPID GKE version', task_names - ) - if max_return_code != 0: - xpk_print( - 'GKE node pools update to default RAPID GKE version returned ERROR:' - f' {max_return_code}' - ) - return max_return_code - return 0 - - -def set_up_cluster_network_for_gpu(args, system: SystemCharacteristics) -> int: - """Set up GKE Cluster networks, subnets and firewall rules for A3/A3+. - Note: there are 4 NICs for GPU-GPU bw and 1 NIC for host in an A3 node, - and there are 8 NICs for GPU-GPU bw and 1 NIC for host in an A3+ node. - - Args: - args: user provided arguments for running the command. - system: system characteristics. - - Returns: - 0 if successful and 1 otherwise. - """ - num_networks = 5 if system.device_type == h100_device_type else 9 - for i in range(1, num_networks): - return_code = create_cluster_network(args, i) - if return_code != 0: - return 1 - return_code = create_cluster_subnet(args, i) - if return_code != 0: - return 1 - return_code = create_cluster_firewall_rule(args, i) - if return_code != 0: - return 1 - return 0 - - -def create_cluster_network(args, index) -> int: - """Create one GKE Cluster network. - - Args: - args: user provided arguments for running the command. - index: index number for the network to be created. - - Returns: - 0 if successful and 1 otherwise. - """ - existing_network_names, return_code = get_all_networks_programmatic(args) - if return_code > 0: - xpk_print('Listing all networks failed!') - return return_code - - network_name = f'{args.cluster}-net-{index}' - if network_name not in existing_network_names: - command = ( - f'gcloud compute --project={args.project}' - f' networks create {network_name}' - ' --subnet-mode=custom --mtu=8244' - ) - return_code = run_command_with_updates( - command, 'Create Cluster Network', args, verbose=False - ) - - if return_code != 0: - xpk_print(f'Create Cluster Network request returned ERROR {return_code}') - return 1 - else: - xpk_print(f'Reusing existing network {network_name}') - - return 0 - - -def create_cluster_subnet(args, index) -> int: - """Create one GKE Cluster subnet. - - Args: - args: user provided arguments for running the command. - index: index number for the subnet to be created. - - Returns: - 0 if successful and 1 otherwise. 
- """ - existing_subnet_names, return_code = get_all_subnets_programmatic(args) - if return_code > 0: - xpk_print('Listing all subnets failed!') - return return_code - subnet_name = f'{args.cluster}-{zone_to_region(args.zone)}-sub-{index}' - if subnet_name not in existing_subnet_names: - command = ( - f'gcloud compute --project={args.project}' - f' networks subnets create {subnet_name}' - f' --network={args.cluster}-net-{index}' - f' --region={zone_to_region(args.zone)} --range=192.168.{index}.0/24' - ) - return_code = run_command_with_updates( - command, 'Create Cluster Subnet', args, verbose=False - ) - - if return_code != 0: - xpk_print(f'Create Cluster Subnet request returned ERROR {return_code}') - return 1 - else: - xpk_print(f'Reusing existing subnet {subnet_name}') - - return 0 - - -def delete_cluster_subnets(args) -> int: - """Delete GKE Cluster subnets. - - Args: - args: user provided arguments for running the command. - - Returns: - 0 if successful and 1 otherwise. - """ - existing_subnet_names, return_code = get_all_subnets_programmatic(args) - if return_code > 0: - xpk_print('Listing all subnets failed!') - return return_code - - for subnet_name in existing_subnet_names: - command = ( - f'gcloud compute networks subnets delete {subnet_name}' - f' --region={zone_to_region(args.zone)} --project={args.project} --quiet' - ) - - return_code = run_command_with_updates( - command, 'Delete Cluster Subnet', args, verbose=False - ) - - if return_code != 0: - xpk_print(f'Delete Cluster Subnet request returned ERROR {return_code}') - return 1 - else: - xpk_print(f'Deleted existing subnet {subnet_name}') - - return 0 - - -def create_cluster_firewall_rule(args, index) -> int: - """Create one GKE Cluster firewall rule. - - Args: - args: user provided arguments for running the command. - index: index number for the firewall rule to be created. - - Returns: - 0 if successful and 1 otherwise. - """ - existing_firewall_rules_names, return_code = ( - get_all_firewall_rules_programmatic(args) - ) - if return_code > 0: - xpk_print('Listing all firewall rules failed!') - return return_code - firewall_rule_name = f'{args.cluster}-internal-{index}' - if firewall_rule_name not in existing_firewall_rules_names: - command = ( - f'gcloud compute --project={args.project} firewall-rules create' - f' {firewall_rule_name} --network={args.cluster}-net-{index} --action=ALLOW' - ' --rules=tcp:0-65535,udp:0-65535,icmp --source-ranges=192.168.0.0/16' - ) - return_code = run_command_with_updates( - command, 'Create Cluster Firewall Rule', args, verbose=False - ) - - if return_code != 0: - xpk_print( - f'Create Cluster Firewall Rule request returned ERROR {return_code}' - ) - return 1 - else: - xpk_print(f'Reusing existing firewall rule {firewall_rule_name}') - return 0 - - -def create_cluster_network_config(args) -> int: - """Run the Create GKE Cluster Network Config request. - - Args: - args: user provided arguments for running the command. - - Returns: - 0 if successful and 1 otherwise. - """ - yml_string = cluster_network_yaml.format(cluster_name=args.cluster) - tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' - - return_code = run_command_with_updates( - command, 'GKE Cluster Create Network Config', args - ) - if return_code != 0: - xpk_print( - f'GKE Cluster Create ConfigMap request returned ERROR {return_code}' - ) - return 1 - - return 0 - - -def print_reservations(args) -> int: - """Print the reservations in the project. 
-
-
-def print_reservations(args) -> int:
-  """Print the reservations in the project.
-
-  Args:
-    args: user provided arguments for running the command.
-
-  Returns:
-    0 if successful and 1 otherwise.
-  """
-  command = f'gcloud beta compute reservations list --project={args.project}'
-  return_code = run_command_with_updates(
-      command, 'Get all reservations in the project', args
-  )
-  if return_code != 0:
-    xpk_print(f'Get all reservations returned ERROR {return_code}')
-    return 1
-  return 0
-
-
-def verify_reservation_exists(args) -> int:
-  """Verify the reservation exists.
-
-  Args:
-    args: user provided arguments for running the command.
-
-  Returns:
-    0 if successful and 1 otherwise.
-  """
-  command = (
-      f'gcloud beta compute reservations describe {args.reservation}'
-      f' --project={args.project} --zone={args.zone}'
-  )
-  return_code = run_command_with_updates(command, 'Describe reservation', args)
-  if return_code != 0:
-    xpk_print(f'Describe reservation returned ERROR {return_code}')
-    xpk_print('Please confirm that your reservation name is correct.')
-    return 1
-  return 0
-
-
-def get_capacity_type(args) -> tuple[CapacityType, int]:
-  """Determine the capacity type based on user arguments.
-
-  Args:
-    args: user provided arguments for running the command.
-
-  Returns:
-    Tuple with the determined capacity type and
-    int of 0 if successful and 1 otherwise.
-  """
-  capacity_type = CapacityType.UNKNOWN
-  num_types = 0
-  return_code = 0
-
-  # Determine the capacity argument.
-  if args.on_demand:
-    capacity_type = CapacityType.ON_DEMAND
-    num_types += 1
-  if args.reservation:
-    return_code = verify_reservation_exists(args)
-    if return_code > 0:
-      return capacity_type, return_code
-    capacity_type = CapacityType.RESERVATION
-    num_types += 1
-  if args.spot:
-    capacity_type = CapacityType.SPOT
-    num_types += 1
-
-  # Check that the number of user arguments provided is valid.
-  if num_types == 0:
-    capacity_type = CapacityType.UNKNOWN
-  elif num_types != 1:
-    xpk_print(
-        'ERROR: User specified more than one of the following arguments. Please'
-        ' specify only one of `--reservation=$RESERVATION_NAME`, `--on-demand`'
-        ' or `--spot`.'
-    )
-    return_code = 1
-
-  return capacity_type, return_code
-
-
-def get_capacity_arguments_from_capacity_type(
-    args, capacity_type: CapacityType
-) -> tuple[str, int]:
-  """Determine the TPU Nodepool creation capacity arguments needed.
-
-  Args:
-    args: user provided arguments for running the command.
-    capacity_type: The type of capacity the user configured.
-
-  Returns:
-    Tuple with string with the capacity argument to use and
-    int of 0 if successful and 1 otherwise.
-  """
-  capacity_args = ''
-  return_code = 0
-
-  match capacity_type:
-    case CapacityType.ON_DEMAND:
-      capacity_args = ''
-    case CapacityType.SPOT:
-      capacity_args = '--spot'
-    case CapacityType.RESERVATION:
-      capacity_args = (
-          f'--reservation-affinity=specific --reservation={args.reservation}'
-      )
-    case _:
-      xpk_print(
-          f'Unknown capacity type: {capacity_type}. Unable to determine'
-          ' capacity args.'
-      )
-      return_code = 1
-  return capacity_args, return_code
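A sketch of how the capacity flags resolved above map onto gcloud arguments. `CapacityType` here is a stand-in enum so the snippet runs on its own; the flag strings mirror the ones built in get_capacity_arguments_from_capacity_type.

import enum

class CapacityType(enum.Enum):
  ON_DEMAND = 'on_demand'
  SPOT = 'spot'
  RESERVATION = 'reservation'

def capacity_args_for(capacity_type: CapacityType, reservation: str = '') -> str:
  """Maps a capacity type to the node-pool creation flags it implies."""
  match capacity_type:
    case CapacityType.ON_DEMAND:
      return ''
    case CapacityType.SPOT:
      return '--spot'
    case CapacityType.RESERVATION:
      return f'--reservation-affinity=specific --reservation={reservation}'

assert capacity_args_for(CapacityType.SPOT) == '--spot'
assert 'reservation=res-1' in capacity_args_for(CapacityType.RESERVATION, 'res-1')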
- """ - node_selector = '' - return_code = 0 - - match capacity_type: - case CapacityType.ON_DEMAND.name: - node_selector = '' - case CapacityType.SPOT.name: - node_selector = 'cloud.google.com/gke-spot="true"' - case CapacityType.RESERVATION.name: - node_selector = f'cloud.google.com/reservation-name: {args.reservation}' - case _: - xpk_print( - f'Unknown capacity type: {capacity_type}. Unable to determine the' - ' node selectors.' - ) - return_code = 1 - return node_selector, return_code - - -def create_or_update_cluster_configmap(configmap_yml: dict) -> int: - """ - Args: - configmap_yml: dict containing ConfigMap name and yml string. - - Returns: - 0 if successful, 1 otherwise. - """ - commands = [] - task_names = [] - for configmap_name, yml_string in configmap_yml.items(): - tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' - commands.append(command) - task_name = f'ConfigMap CreateOrUpdate-{configmap_name}' - task_names.append(task_name) - - return_code = run_commands( - commands, 'GKE Cluster CreateOrUpdate ConfigMap(s)', task_names - ) - if return_code != 0: - xpk_print( - 'GKE Cluster Create/Update ConfigMap(s) request returned ERROR' - f' {return_code}' - ) - return 1 - return 0 - - -def create_cluster_configmaps( - args, - system, - tensorboard_config: dict, - autoprovisioning_config: AutoprovisioningConfig | None, -) -> int: - """Run the Create GKE Cluster ConfigMap request. - - Args: - args: user provided arguments for running the command. - system: system characteristics. - tensorboard_config: map that contains Vertex Tensorboard name, id and location - autoprovisioning_config: Config used in autoprovisioning. - Returns: - 0 if successful and 1 otherwise. - """ - configmap_yml = {} - - # ConfigMap to store resources available in the cluster. - device_type = system.device_type - if system.accelerator_type == AcceleratorType['GPU']: - resources_data = f'{device_type}: "{int(args.num_nodes)}"' - elif ( - not args.enable_pathways - and args.enable_autoprovisioning - and autoprovisioning_config - ): - # Currently autoprovisioning is not supported with Pathways. - # Auto provisioning will have variable topologies for a gke accelerator type. - resources_data = ( - f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}' - ) - resources_data += ( - f'\n {AUTOPROVISIONING_CONFIG_MINIMUM_KEY}:' - f' "{autoprovisioning_config.minimum_chips}"' - ) - resources_data += ( - f'\n {AUTOPROVISIONING_CONFIG_MAXIMUM_KEY}:' - f' "{autoprovisioning_config.maximum_chips}"' - ) - else: - resources_data = ( - f'{device_type}: "{int(args.num_slices) * system.vms_per_slice}"' - ) - resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}' - resources_yml = cluster_configmap_yaml.format( - args=args, name=resources_configmap_name, data=resources_data - ) - configmap_yml[resources_configmap_name] = resources_yml - - # ConfigMap to store cluster metadata. - # XPK Version. - metadata = f'xpk_version: {xpk_current_version}' - # Vertex Tensorboard information - for key, value in tensorboard_config.items(): - metadata += f'\n {key}: "{value}"' - # Capacity Type. - capacity_type, return_code = get_capacity_type(args) - if return_code != 0: - xpk_print('Unable to determine capacity type.') - return return_code - metadata += f'\n {CAPACITY_TYPE_CONFIG_KEY}: {capacity_type.name}' - # Reservation ID if applicable. 
-  if capacity_type == CapacityType.RESERVATION:
-    metadata += f'\n {RESERVATION_CONFIG_KEY}: {args.reservation}'
-  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
-  metadata_yml = cluster_configmap_yaml.format(
-      args=args, name=metadata_configmap_name, data=metadata
-  )
-  configmap_yml[metadata_configmap_name] = metadata_yml
-  return create_or_update_cluster_configmap(configmap_yml)
-
-
-def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
-  """Run the Get GKE Cluster ConfigMap request.
-
-  Args:
-    args: user provided arguments for running the command.
-    configmap_name: name of the configmap.
-
-  Returns:
-    key:value pairs stored in cluster ConfigMap.
-  """
-  command = (
-      'kubectl get configmap'
-      f' {configmap_name} -o=custom-columns="ConfigData:data" --no-headers=true'
-  )
-
-  return_code, return_value = run_command_for_value(
-      command, 'GKE Cluster Get ConfigMap', args
-  )
-  if return_code != 0:
-    xpk_print(f'GKE Cluster Get ConfigMap request returned ERROR {return_code}')
-    return None
-
-  config_map = {}
-  return_value = return_value.strip()
-
-  if return_value:
-    # Format of ConfigMap: map[key1:value1 key2:value2]
-    return_value = return_value[return_value.index('map') :]
-    configs = return_value[4:-1].split(' ')
-
-    for pair in configs:
-      key, value = pair.strip().split(':')
-      config_map[key] = value
-  return config_map
-
-
-def get_cluster_provisioner(args) -> str:
-  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
-  cluster_provisioner = 'gcloud'
-  if cluster_config_map is not None:
-    provisioner = cluster_config_map.get('provisioner')
-    if provisioner is not None:
-      cluster_provisioner = provisioner
-  xpk_print(f'Cluster provisioner: {cluster_provisioner}')
-  return cluster_provisioner
-
-
-def create_vertex_tensorboard(args) -> dict:
-  """Creates a Tensorboard instance in Vertex AI.
-
-  Args:
-    args: user provided arguments.
-
-  Returns:
-    dict containing Tensorboard instance name, id and location.
-  """
-  from cloud_accelerator_diagnostics import tensorboard  # pylint: disable=import-outside-toplevel
-
-  tensorboard_config = {}
-  tensorboard_name = args.tensorboard_name
-  if tensorboard_name is None:
-    tensorboard_name = f'{args.cluster}-{DEFAULT_VERTEX_TENSORBOARD_NAME}'
-  instance_id = tensorboard.create_instance(  # pylint: disable=used-before-assignment
-      project=args.project,
-      location=args.tensorboard_region,
-      tensorboard_name=tensorboard_name,
-  )
-  if instance_id:
-    xpk_print(
-        f'Tensorboard instance {tensorboard_name} is successfully created.'
-    )
-    tensorboard_config['tensorboard_region'] = args.tensorboard_region
-    tensorboard_config['tensorboard_name'] = tensorboard_name
-    tensorboard_config['tensorboard_id'] = instance_id
-  return tensorboard_config
-
-
-def create_vertex_experiment(args) -> dict | None:
-  """Creates an Experiment in Vertex AI.
-
-  Args:
-    args: user provided arguments.
-
-  Returns:
-    map containing Vertex Tensorboard configurations.
-  """
-  from cloud_accelerator_diagnostics import tensorboard  # pylint: disable=import-outside-toplevel
-
-  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
-
-  if cluster_config_map is None or 'tensorboard_name' not in cluster_config_map:
-    xpk_print(
-        'No Vertex Tensorboard instance has been created in cluster create. Run'
-        ' `xpk cluster create --create-vertex-tensorboard` before running `xpk'
-        ' workload create --use-vertex-tensorboard` to create a Vertex'
-        ' Tensorboard instance. Alternatively, use `xpk cluster create-pathways'
-        ' --create-vertex-tensorboard` before running `xpk workload'
-        ' create-pathways --use-vertex-tensorboard`.'
-    )
-    return None
-
-  tensorboard_config = {}
-  tensorboard_config['tensorboard_project'] = args.project
-  tensorboard_config['tensorboard_region'] = cluster_config_map[
-      'tensorboard_region'
-  ]
-  tensorboard_config['tensorboard_name'] = cluster_config_map[
-      'tensorboard_name'
-  ]
-  experiment_name = args.experiment_name
-  if experiment_name is None:
-    experiment_name = f'{args.cluster}-{args.workload}'
-  tensorboard_config['experiment_name'] = experiment_name
-
-  _, tensorboard_url = tensorboard.create_experiment(
-      project=args.project,
-      location=tensorboard_config['tensorboard_region'],
-      experiment_name=experiment_name,
-      tensorboard_name=tensorboard_config['tensorboard_name'],
-  )
-  if tensorboard_url is None:
-    return None
-
-  xpk_print(f'You can view Vertex Tensorboard at: {tensorboard_url}')
-  return tensorboard_config
-
-
-def get_all_clusters_programmatic(args) -> tuple[list[str], int]:
-  """Gets all the clusters associated with the project / region.
-
-  Args:
-    args: user provided arguments for running the command.
-
-  Returns:
-    List of cluster names and 0 if successful and 1 otherwise.
-  """
-  command = (
-      'gcloud container clusters list'
-      f' --project={args.project} --region={zone_to_region(args.zone)}'
-      ' --format="csv[no-heading](name)"'
-  )
-  return_code, raw_cluster_output = run_command_for_value(
-      command, 'Find if Cluster Exists', args
-  )
-  if return_code != 0:
-    xpk_print(f'Find if Cluster Exists returned ERROR {return_code}')
-    return [], return_code
-
-  return raw_cluster_output.splitlines(), 0
-
-
-def is_cluster_using_clouddns(args) -> bool:
-  """Checks if cluster is using CloudDNS.
-  Args:
-    args: user provided arguments for running the command.
-
-  Returns:
-    True if cluster is using CloudDNS and False otherwise.
-  """
-  command = (
-      f'gcloud container clusters describe {args.cluster}'
-      f' --project={args.project} --region={zone_to_region(args.zone)}'
-      ' 2> /dev/null | grep "clusterDns: CLOUD_DNS"'
-  )
-  return_code, _ = run_command_for_value(
-      command,
-      'Check if Cloud DNS is enabled in cluster describe.',
-      args,
-  )
-  if return_code == 0:
-    xpk_print('Cloud DNS is enabled on the cluster, no update needed.')
-    return True
-  return False
-
-
-def is_workload_identity_enabled_on_cluster(args) -> bool:
-  """Checks if Workload Identity Federation is enabled on the cluster.
-  Args:
-    args: user provided arguments for running the command.
-  Returns:
-    True if Workload Identity Federation is enabled on the cluster and False otherwise.
-  """
-  command = (
-      f'gcloud container clusters describe {args.cluster}'
-      f' --project={args.project} --region={zone_to_region(args.zone)}'
-      ' --format="value(workloadIdentityConfig.workloadPool)"'
-  )
-  return_code, workload_pool = run_command_for_value(
-      command,
-      'Checks if Workload Identity Federation is enabled in cluster describe.',
-      args,
-  )
-  if return_code != 0:
-    xpk_exit(return_code)
-  if workload_pool == f'{args.project}.svc.id.goog':
-    xpk_print(
-        'Workload Identity Federation is enabled on the cluster, no update'
-        ' needed.'
-    )
-    return True
-  return False
-
-
-def is_gcsfuse_driver_enabled_on_cluster(args) -> bool:
-  """Checks if GCSFuse CSI driver is enabled on the cluster.
-  Args:
-    args: user provided arguments for running the command.
-  Returns:
-    True if GCSFuse CSI driver is enabled on the cluster and False otherwise.
-  """
-  command = (
-      f'gcloud container clusters describe {args.cluster}'
-      f' --project={args.project} --region={zone_to_region(args.zone)}'
-      ' --format="value(addonsConfig.gcsFuseCsiDriverConfig.enabled)"'
-  )
-  return_code, gcsfuse_driver_enabled = run_command_for_value(
-      command,
-      'Checks if GCSFuse CSI driver is enabled in cluster describe.',
-      args,
-  )
-  if return_code != 0:
-    xpk_exit(return_code)
-  if gcsfuse_driver_enabled.lower() == 'true':
-    xpk_print('GCSFuse CSI driver is enabled on the cluster, no update needed.')
-    return True
-  return False
-
-
-def update_cluster_with_clouddns_if_necessary(args) -> int:
-  """Updates a GKE cluster to use CloudDNS, if not enabled already.
-
-  Args:
-    args: user provided arguments for running the command.
-
-  Returns:
-    0 if successful and error code otherwise.
-  """
-  all_clusters, return_code = get_all_clusters_programmatic(args)
-  if return_code > 0:
-    xpk_print('Listing all clusters failed!')
-    return 1
-  if args.cluster in all_clusters:
-    # If cluster is already using clouddns, no update necessary!
-    if is_cluster_using_clouddns(args):
-      return 0
-  cluster_update_return_code = update_gke_cluster_with_clouddns(args)
-  if cluster_update_return_code > 0:
-    xpk_print('Updating GKE cluster to use CloudDNS failed!')
-    return cluster_update_return_code
-
-  # Find default rapid control plane version and update the control plane to the same.
-  server_config_return_code, gke_server_config = get_gke_server_config(args)
-  if server_config_return_code != 0:
-    xpk_exit(server_config_return_code)
-  upgrade_master_return_code = upgrade_gke_control_plane_version(
-      args, gke_server_config.default_rapid_gke_version
-  )
-  if upgrade_master_return_code > 0:
-    xpk_print("Upgrading the GKE cluster's control plane failed!")
-    return upgrade_master_return_code
-
-  # Upgrade nodepools version after the master upgrade.
-  node_pool_update_code = upgrade_gke_nodepools_version(
-      args, gke_server_config.default_rapid_gke_version
-  )
-  if node_pool_update_code > 0:
-    xpk_print('Upgrading nodepools version failed!')
-    return node_pool_update_code
-  return 0
-
-
-def update_cluster_with_workload_identity_if_necessary(args) -> int:
-  """Updates a GKE cluster to enable Workload Identity Federation, if not enabled already.
-  Args:
-    args: user provided arguments for running the command.
-  Returns:
-    0 if successful and error code otherwise.
-  """
-
-  if is_workload_identity_enabled_on_cluster(args):
-    return 0
-  cluster_update_return_code = (
-      update_gke_cluster_with_workload_identity_enabled(args)
-  )
-  if cluster_update_return_code > 0:
-    xpk_print(
-        'Updating GKE cluster to enable Workload Identity Federation failed!'
-    )
-    return cluster_update_return_code
-
-  return 0
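The update_cluster_with_*_if_necessary helpers share one idempotency pattern, sketched here in isolation: probe the current cluster state first and only issue the slow `gcloud container clusters update` when the feature is actually missing. Both callables are hypothetical stand-ins for the concrete probe/update pairs above.

from typing import Callable

def update_if_necessary(
    is_enabled: Callable[[], bool], do_update: Callable[[], int]
) -> int:
  """Returns 0 on success (including the no-op case), else the update code."""
  if is_enabled():
    return 0  # Feature already enabled: skip the update entirely.
  return do_update()

# Feature already on: the (failing) updater is never invoked.
assert update_if_necessary(lambda: True, lambda: 1) == 0
# Feature off: the updater runs and its return code is surfaced.
assert update_if_necessary(lambda: False, lambda: 0) == 0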
- """ - - if is_gcsfuse_driver_enabled_on_cluster(args): - return 0 - cluster_update_return_code = update_gke_cluster_with_gcsfuse_driver_enabled( - args - ) - if cluster_update_return_code > 0: - xpk_print('Updating GKE cluster to enable GCSFuse CSI driver failed!') - return cluster_update_return_code - - return 0 - - -def get_nodepool_zone(args, nodepool_name) -> tuple[int, str]: - """Return zone in which nodepool exists in the cluster. - - Args: - args: user provided arguments for running the command. - nodepool_name: name of nodepool. - - Returns: - Tuple of int, str where - int is the return code - 0 if successful, 1 otherwise. - str is the zone of nodepool. - """ - command = ( - f'gcloud beta container node-pools describe {nodepool_name}' - f' --cluster {args.cluster} --project={args.project}' - f' --region={zone_to_region(args.zone)} --format="value(locations)"' - ) - return_code, nodepool_zone = run_command_for_value( - command, 'Get Node Pool Zone', args - ) - if return_code != 0: - xpk_print(f'Get Node Pool Zone returned ERROR {return_code}') - return 1, None - - return 0, nodepool_zone.strip() - - -def get_nodepool_workload_metadata_mode(args, nodepool_name) -> tuple[int, str]: - """Return Workload Identity metadata mode of the nodepool. - Args: - args: user provided arguments for running the command. - nodepool_name: name of nodepool. - Returns: - Tuple of int, str where - int is the return code - 0 if successful, 1 otherwise. - str is the workload metadata mode of nodepool. - """ - command = ( - f'gcloud beta container node-pools describe {nodepool_name}' - f' --cluster {args.cluster} --project={args.project}' - f' --region={zone_to_region(args.zone)} --format="value(config.workloadMetadataConfig.mode)"' - ) - return_code, nodepool_WI_mode = run_command_for_value( - command, 'Get Node Pool Workload Identity Metadata Mode', args - ) - if return_code != 0: - xpk_print( - 'Get Node Pool Workload Identity Metadata Mode returned ERROR' - f' {return_code}' - ) - return 1, None - - return 0, nodepool_WI_mode.strip() - - -def check_cluster_resources(args, system) -> tuple[bool, bool]: - """Check if cluster has resources of a specified device_type/gke_accelerator. - This check will be skipped if -<_CLUSTER_RESOURCES_CONFIGMAP> ConfigMap doesn't exist for the cluster. - - Args: - args: user provided arguments for running the command. - system: system characteristics. - - Returns: - Tuple of bool, bool - True if resources in the cluster should be checked, False otherwise. - True if device_type/gke_accelerator exists in the cluster, False otherwise. - """ - resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}' - resources_config_map = get_cluster_configmap(args, resources_configmap_name) - if resources_config_map is None: - xpk_print( - f'No ConfigMap exist for cluster with the name {resources_config_map}.' - ' Cluster resources check will be skipped.' - ) - return False, False - if system.device_type in resources_config_map: - return True, True - elif system.gke_accelerator in resources_config_map: - return True, True - return True, False - - -def get_all_nodepools_programmatic(args) -> tuple[list[str], int]: - """Gets all the nodepools associated with the cluster / project / region. - - Args: - args: user provided arguments for running the command. - - Returns: - List of nodepools and 0 if successful and 1 otherwise. 
- """ - command = ( - 'gcloud beta container node-pools list' - ' --cluster' - f' {args.cluster} --project={args.project} --region={zone_to_region(args.zone)}' - ' --format="csv[no-heading](name)"' - ) - return_code, raw_nodepool_output = run_command_for_value( - command, 'Get All Node Pools', args - ) - if return_code != 0: - xpk_print(f'Get All Node Pools returned ERROR {return_code}') - return [], 1 - - return raw_nodepool_output.splitlines(), 0 - - -def get_all_networks_programmatic(args) -> tuple[list[str], int]: - """Gets all the networks associated with project . - - Args: - args: user provided arguments for running the command. - - Returns: - List of networks and 0 if successful and 1 otherwise. - """ - command = 'gcloud compute networks list --format="csv[no-heading](name)"' - return_code, raw_network_output = run_command_for_value( - command, 'Get All Networks', args - ) - if return_code != 0: - xpk_print(f'Get All Networks returned ERROR {return_code}') - return [], 1 - - return raw_network_output.splitlines(), 0 - - -def get_all_subnets_programmatic(args) -> tuple[list[str], int]: - """Gets all the subnets associated with the project. - - Args: - args: user provided arguments for running the command. - - Returns: - List of subnets and 0 if successful and 1 otherwise. - """ - subnet_name_filter = f'{args.cluster}-{zone_to_region(args.zone)}-sub-*' - - command = ( - 'gcloud compute networks subnets list' - f' --filter=name~"{subnet_name_filter}" --project={args.project}' - ) - return_code, raw_subnets_output = run_command_for_value( - command, 'Get All Subnets', args - ) - if return_code != 0: - xpk_print(f'Get All Subnets returned ERROR {return_code}') - return [], 1 - - all_outputs = raw_subnets_output.splitlines() - all_networks = [ - all_outputs[i].split(' ')[0] for i in range(1, len(all_outputs)) - ] - return all_networks, 0 - - -def get_all_firewall_rules_programmatic(args) -> tuple[list[str], int]: - """Gets all the firewall rules associated with the project. - - Args: - args: user provided arguments for running the command. - - Returns: - List of firewall rules and 0 if successful and 1 otherwise. - """ - command = ( - 'gcloud compute firewall-rules list --format="csv[no-heading](name)"' - ) - return_code, raw_subnets_output = run_command_for_value( - command, 'Get All Firewall Rules', args - ) - if return_code != 0: - xpk_print(f'Get All Firewall Rules returned ERROR {return_code}') - return [], 1 - - return raw_subnets_output.splitlines(), 0 - - -def get_node_pools_to_delete( - args, system, existing_node_pool_names, desired_node_pool_names -) -> list: - """Get list of nodepools to delete from the cluster. - - Args: - args: user provided arguments for running the command. - system: system characteristics. - existing_node_pool_names: names of nodepools that already exist in the cluster. - desired_node_pool_names: names of nodepools that should exist in the cluster. - - Returns: - List of nodepool names to delete. - """ - node_pools_to_delete = [] - check_resource, is_requested_resource_in_cluster = check_cluster_resources( - args, system - ) - for existing_node_pool_name in existing_node_pool_names: - # Deletion logic would leave behind any Pathways CPU nodepools. - if existing_node_pool_name.find(f'{args.cluster}-np-') != 0: - continue - - # Nodepools will be deleted in two scenarios: - # Scenario 1: Cluster exists with 3 nodepools of 'x' device_type/gke_accelerator and now we are updating - # the cluster to 2 nodepools of 'x' device_type/gke_accelerator. 
-    # '{args.cluster}-np-2' from the cluster.
-    # Scenario 2: Cluster exists with 2 nodepools of 'x' device_type/gke_accelerator and now we are updating
-    # the cluster to 2 nodepools of 'y' device_type/gke_accelerator. In this case, we will delete
-    # '{args.cluster}-np-0' and '{args.cluster}-np-1' from the cluster.
-    if existing_node_pool_name not in desired_node_pool_names or (
-        check_resource and not is_requested_resource_in_cluster
-    ):
-      node_pools_to_delete.append(existing_node_pool_name)
-
-  return node_pools_to_delete
-
-
-def run_gke_node_pool_create_command(
-    args, system, gke_node_pool_version
-) -> int:
-  """Run the Create GKE Node Pool request.
-
-  Args:
-    args: user provided arguments for running the command.
-    system: System characteristics based on device type/topology.
-    gke_node_pool_version: GKE version to use to create node pools.
-
-  Returns:
-    0 if successful and 1 otherwise.
-  """
-  device_type = args.tpu_type if args.tpu_type else args.device_type
-  xpk_print(
-      f'Creating {args.num_slices} node pool or pools of {device_type}\n'
-      f'We assume that the underlying system is: {system}'
-  )
-  existing_node_pool_names, return_code = get_all_nodepools_programmatic(args)
-  if return_code > 0:
-    xpk_print('Listing all node pools failed!')
-    return return_code
-
-  capacity_type, return_code = get_capacity_type(args)
-  if return_code > 0:
-    xpk_print('Parsing capacity type failed!')
-    return return_code
-  if capacity_type == CapacityType.UNKNOWN:
-    return_code = print_reservations(args)
-    xpk_print(
-        'ERROR: User needs to provide the capacity type. Please specify one of'
-        ' the following `--reservation=$RESERVATION_NAME`, `--on-demand`'
-        ' or `--spot`. See the above list of reservations to choose from.'
-    )
-    if return_code > 0:
-      xpk_print('Listing all reservations failed!')
-    return_code = 1
-    return return_code
-  capacity_args, return_code = get_capacity_arguments_from_capacity_type(
-      args, capacity_type
-  )
-  if return_code > 0:
-    xpk_print('Parsing capacity arguments failed!')
-    return return_code
-
-  if system.accelerator_type == AcceleratorType['GPU']:
-    xpk_print(
-        f'Creating 1 node pool with {args.num_nodes} nodes of'
-        f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
-    )
-    desired_node_pool_names = [f'{args.cluster}-np-0']
-  else:
-    xpk_print(
-        f'Creating {args.num_slices} node pool or pools of'
-        f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
-    )
-    desired_node_pool_names = [
-        f'{args.cluster}-np-{slice_num}' for slice_num in range(args.num_slices)
-    ]
-
-  node_pools_to_remain = []
-  delete_commands = []
-  delete_task_names = []
-  node_pools_to_update_WI = []
-  update_WI_commands = []
-  update_WI_task_names = []
-  if existing_node_pool_names:
-    return_code, existing_node_pool_zone = get_nodepool_zone(
-        args, existing_node_pool_names[0]
-    )
-    if return_code != 0:
-      return 1
-
-    if existing_node_pool_zone and existing_node_pool_zone != args.zone:
-      xpk_print(
-          f'Cluster {args.cluster} already has nodepools in zone:'
-          f' {existing_node_pool_zone}. Use the same zone to update nodepools'
-          ' in the cluster.'
-      )
-      return 1
-
-  node_pools_to_delete = get_node_pools_to_delete(
-      args, system, existing_node_pool_names, desired_node_pool_names
-  )
-  for node_pool_name in existing_node_pool_names:
-    if node_pool_name.find(f'{args.cluster}-np-') != 0:
-      continue
-
-    if node_pool_name in node_pools_to_delete:
-      command = (
-          'gcloud beta container node-pools delete'
-          f' {node_pool_name} --cluster={args.cluster}'
-          f' --zone={zone_to_region(args.zone)}'
-          f' --project={args.project} --quiet'
-      )
-      task = f'NodepoolDelete-{node_pool_name}'
-      delete_commands.append(command)
-      delete_task_names.append(task)
-    else:
-      node_pools_to_remain.append(node_pool_name)
-
-  # Workload Identity for existing nodepools
-  if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
-    for node_pool_name in existing_node_pool_names:
-      if node_pool_name not in node_pools_to_delete:
-        # Check if workload identity is not already enabled:
-        return_code, existing_node_pool_metadata_mode = (
-            get_nodepool_workload_metadata_mode(args, node_pool_name)
-        )
-        if return_code != 0:
-          return 1
-
-        if (
-            existing_node_pool_zone
-            and existing_node_pool_metadata_mode != 'GKE_METADATA'
-        ):
-          command = (
-              'gcloud container node-pools update'
-              f' {node_pool_name} --cluster={args.cluster}'
-              f' --zone={zone_to_region(args.zone)}'
-              f' --project={args.project} --quiet'
-              ' --workload-metadata=GKE_METADATA'
-          )
-          task = (
-              'Update nodepool with Workload Identity enabled'
-              f' {node_pool_name}'
-          )
-          update_WI_commands.append(command)
-          update_WI_task_names.append(task)
-          node_pools_to_update_WI.append(node_pool_name)
-
-  # Deletion of nodepools should happen before attempting to create new nodepools for the case
-  # when cluster is getting updated from 'x' device_type/gke_accelerator to 'y' device_type/gke_accelerator.
-  # In that case, '{args.cluster}-np-i' nodepool will be re-created for 'y' device_type/gke_accelerator.
-  if delete_commands:
-    will_delete = True
-    if node_pools_to_delete and not args.force:
-      will_delete = get_user_input(
-          f'Planning to delete {len(node_pools_to_delete)} node pools including'
-          f' {node_pools_to_delete}. \nDo you wish to delete: y (yes) / n'
-          ' (no):\n'
-      )
-    if not will_delete:
-      xpk_print(
-          'You have requested to not delete the existing nodepools in the'
-          ' cluster. There will be no change to the cluster.'
-      )
-      return 1
-
-    for i, command in enumerate(delete_commands):
-      xpk_print(
-          f'To complete {delete_task_names[i]} we are executing {command}'
-      )
-    max_return_code = run_commands(
-        delete_commands,
-        'Delete Nodepools',
-        delete_task_names,
-        dry_run=args.dry_run,
-    )
-    if max_return_code != 0:
-      xpk_print(f'Delete Nodepools returned ERROR {max_return_code}')
-      return 1
-
-  # Enable Workload Identity on existing Nodepools
-  if update_WI_commands:
-    will_update_WI = True
-    if node_pools_to_update_WI and not args.force:
-      will_update_WI = get_user_input(
-          'Planning to enable Workload Identity Federation on'
-          f' {len(node_pools_to_update_WI)} existing node pools including'
-          f' {node_pools_to_update_WI}. This immediately enables Workload'
-          ' Identity Federation for GKE for any workloads running in the node'
-          ' pool. Also, xpk does not support disabling Workload Identity on'
-          ' clusters that have it enabled already.\nDo you wish to update: y'
-          ' (yes) / n (no):\n'
-      )
-    if will_update_WI:
-      for i, command in enumerate(update_WI_commands):
-        xpk_print(
-            f'To complete {update_WI_task_names[i]} we are executing {command}'
-        )
-      max_return_code = run_commands(
-          update_WI_commands,
-          'Enable Workload Identity on existing Nodepools',
-          update_WI_task_names,
-          dry_run=args.dry_run,
-      )
-      if max_return_code != 0:
-        xpk_print(
-            'Enable Workload Identity on existing Nodepools returned ERROR'
-            f' {max_return_code}'
-        )
-        return 1
-
-  # Update {args.cluster}-{_CLUSTER_RESOURCES_CONFIGMAP} ConfigMap to 'y': '0'
-  # and remove 'x' from the ConfigMap when cluster is getting updated from
-  # 'x' device_type/gke_accelerator to 'y' device_type/gke_accelerator.
-  if not node_pools_to_remain:
-    if args.enable_autoprovisioning:
-      resources_data = (
-          f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}'
-      )
-    else:
-      resources_data = f'{device_type}: "0"'
-    resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-    resources_yml = cluster_configmap_yaml.format(
-        args=args, name=resources_configmap_name, data=resources_data
-    )
-    configmap_yml = {}
-    configmap_yml[resources_configmap_name] = resources_yml
-    return_code = create_or_update_cluster_configmap(configmap_yml)
-    if return_code != 0:
-      return 1
-
-  create_commands = []
-  create_task_names = []
-  for node_pool_name in desired_node_pool_names:
-    if node_pool_name in node_pools_to_remain:
-      continue
-    command = (
-        'gcloud beta container node-pools create'
-        f' {node_pool_name}'
-        f' --region={zone_to_region(args.zone)}'
-        f' --cluster={args.cluster}'
-        f' --project={args.project} --node-locations={args.zone}'
-        f' --machine-type={system.gce_machine_type}'
-        f' --host-maintenance-interval={args.host_maintenance_interval}'
-        f' {capacity_args}'
-        ' --enable-gvnic'
-        f' {args.custom_nodepool_arguments}'
-    )
-    if system.accelerator_type == AcceleratorType['TPU']:
-      command += f' --node-version={gke_node_pool_version}'
-      command += f' --num-nodes={system.vms_per_slice}'
-      command += ' --placement-type=COMPACT --max-pods-per-node 15'
-      command += (
-          f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
-      )
-      command += f' --tpu-topology={system.topology}'
-      command += f' {args.custom_tpu_nodepool_arguments}'
-    elif system.accelerator_type == AcceleratorType['GPU']:
-      subnet_prefix = f'{args.cluster}-{zone_to_region(args.zone)}'
-      command += f' --num-nodes={args.num_nodes}'
-      command += (
-          ' --accelerator'
-          f' type={system.gke_accelerator},count={str(system.chips_per_vm)},gpu-driver-version=latest'
-          ' --no-enable-autoupgrade '
-          f' --scopes={CLOUD_PLATFORM_AUTH_SCOPE_URL} --additional-node-network'
-          f' network={args.cluster}-net-1,subnetwork={subnet_prefix}-sub-1'
-          ' --additional-node-network'
-          f' network={args.cluster}-net-2,subnetwork={subnet_prefix}-sub-2'
-          ' --additional-node-network'
-          f' network={args.cluster}-net-3,subnetwork={subnet_prefix}-sub-3'
-          ' --additional-node-network'
-          f' network={args.cluster}-net-4,subnetwork={subnet_prefix}-sub-4'
-      )
-      if device_type == h100_mega_device_type:
-        command += (
-            ' --additional-node-network'
-            f' network={args.cluster}-net-5,subnetwork={subnet_prefix}-sub-5'
-            ' --additional-node-network'
-            f' network={args.cluster}-net-6,subnetwork={subnet_prefix}-sub-6'
-            ' --additional-node-network'
-            f' network={args.cluster}-net-7,subnetwork={subnet_prefix}-sub-7'
-            ' --additional-node-network'
-            f' network={args.cluster}-net-8,subnetwork={subnet_prefix}-sub-8'
-            ' --max-pods-per-node=32'
-        )
-    elif system.accelerator_type == AcceleratorType['CPU']:
-      command += f' --num-nodes={system.vms_per_slice}'
-      command += (
-          f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
-      )
-
-    if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
-      command += ' --workload-metadata=GKE_METADATA'
-
-    task = f'NodepoolCreate-{node_pool_name}'
-    create_commands.append(command)
-    create_task_names.append(task)
-
-  desired_pw_cpu_node_pools = ['cpu-user-np', 'cpu-rm-np', 'cpu-proxy-np']
-  if args.enable_pathways:
-    # Pathways needs CPU nodepools in addition to TPU nodepools
-    for node_pool_name in desired_pw_cpu_node_pools:
-      if node_pool_name in existing_node_pool_names:
-        continue
-      command = (
-          'gcloud beta container node-pools create'
-          f' {node_pool_name} --node-version={gke_node_pool_version}'
-          f' --cluster={args.cluster} --project={args.project}'
-          f' --node-locations={args.zone} --region={zone_to_region(args.zone)}'
-          f' --num-nodes=1 --machine-type={args.pathways_gce_machine_type}'
-          f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
-          ' --enable-autoscaling --min-nodes=1 --max-nodes=20'
-      )
-      task = f'NodepoolCreate-{node_pool_name}'
-      create_commands.append(command)
-      create_task_names.append(task)
-
-  for i, command in enumerate(create_commands):
-    xpk_print(f'To complete {create_task_names[i]} we are executing {command}')
-  max_return_code = run_commands(
-      create_commands,
-      'Create Nodepools',
-      create_task_names,
-      dry_run=args.dry_run,
-  )
-  if max_return_code != 0:
-    xpk_print(f'Create Nodepools returned ERROR {max_return_code}')
-    return 1
-
-  xpk_print('Create or delete node pool request complete.')
-  return 0
-
-
-# TODO(vbarr): Remove this function when jobsets gets enabled by default on
-# GKE clusters.
-def set_jobset_on_cluster(args) -> int:
-  """Add jobset command on server side and ask user to verify it is created.
-
-  Args:
-    args: user provided arguments for running the command.
-
-  Returns:
-    0 if successful and 1 otherwise.
-  """
-  command = (
-      'kubectl apply --server-side -f'
-      f' https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
-  )
-  task = f'Install Jobset on {args.cluster}'
-  return_code = run_command_with_updates_retry(command, task, args)
-
-  if return_code != 0:
-    xpk_print(f'{task} returned with ERROR {return_code}.\n')
-    xpk_print(
-        "This LIKELY means you're missing Kubernetes Permissions, you can"
-        ' validate this by checking if the error references permission problems'
-        ' such as `requires one of ["container.*"] permission(s)`. Follow our'
-        ' readme:'
-        ' https://github.com/google/xpk/blob/main/README.md#troubleshooting for'
-        ' instructions on how to fix these permissions.'
-    )
-  return return_code
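A sketch of the device-type dispatch that install_nccl_on_cluster below performs: A3 (h100) machines get the GPUDirect-TCPX NCCL installer manifest, while everything else (A3+, i.e. h100-mega) gets the TCPXO one. The device-type strings are illustrative stand-ins for the module's constants; the manifest URLs are the ones the function itself applies.

H100_DEVICE_TYPE = 'h100-80gb-8'  # stand-in value for h100_device_type
TCPX_INSTALLER = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
TCPXO_INSTALLER = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'

def nccl_installer_manifest(device_type: str) -> str:
  """Picks the NCCL installer manifest for the given GPU device type."""
  if device_type == H100_DEVICE_TYPE:
    return TCPX_INSTALLER
  return TCPXO_INSTALLER

assert nccl_installer_manifest('h100-mega-80gb-8').endswith(
    'nccl-tcpxo-installer.yaml'
)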
- """ - if system.device_type == h100_device_type: - command = ( - 'kubectl apply -f ' - # pylint: disable=line-too-long - 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml' - ) - else: - command = ( - 'kubectl apply -f ' - # pylint: disable=line-too-long - 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml' - ) - - return_code = run_command_with_updates( - command, 'Install NCCL Plugin On Cluster', args - ) - - if return_code != 0: - xpk_print( - f'Install NCCL Plugin On Cluster request returned ERROR {return_code}' - ) - return 1 - - return 0 - - -@dataclass -class GkeServerConfig: - """Stores the valid gke versions based on gcloud recommendations.""" - - default_rapid_gke_version: str - valid_versions: set[str] - - -def get_gke_server_config(args) -> tuple[int, GkeServerConfig | None]: - """Determine the GKE versions supported by gcloud currently. - - Args: - args: user provided arguments for running the command. - - Returns: - Tuple of - int: 0 if successful and 1 otherwise. - GkeServerConfig: stores valid gke version to use in node pool and cluster. - """ - base_command = ( - 'gcloud container get-server-config' - f' --project={args.project} --region={zone_to_region(args.zone)}' - ) - default_rapid_gke_version_cmd = ( - base_command - + ' --flatten="channels" --filter="channels.channel=RAPID"' - ' --format="value(channels.defaultVersion)"' - ) - valid_versions_cmd = ( - base_command - + ' --flatten="channels" --filter="channels.channel=RAPID"' - ' --format="value(channels.validVersions)"' - ) - base_command_description = 'Determine server supported GKE versions for ' - - server_config_commands_and_descriptions = [ - ( - default_rapid_gke_version_cmd, - base_command_description + 'default rapid gke version', - ), - ( - valid_versions_cmd, - base_command_description + 'valid versions', - ), - ] - command_outputs = [] - - for command, command_description in server_config_commands_and_descriptions: - return_code, cmd_output = run_command_for_value( - command, - command_description, - args, - hide_error=True, - ) - if return_code != 0: - xpk_print(f'Unable to get server config for {command_description}.') - return return_code, None - command_outputs.append(cmd_output) - - return 0, GkeServerConfig( - default_rapid_gke_version=command_outputs[0].strip(), - valid_versions=set(command_outputs[1].split(';')), - ) - - -def get_gke_control_plane_version( - args, gke_server_config: GkeServerConfig -) -> tuple[int, str | None]: - """Determine gke control plane version for cluster creation. - - Args: - args: user provided arguments for running the command. - gke_server_config: holds valid gke versions and recommended default version. - - Returns: - Tuple of - int: 0 if successful and 1 otherwise. - str: gke control plane version to use. - """ - - # Override with user provide gke version if specified. 
-
-
-def get_gke_control_plane_version(
-    args, gke_server_config: GkeServerConfig
-) -> tuple[int, str | None]:
-  """Determine gke control plane version for cluster creation.
-
-  Args:
-    args: user provided arguments for running the command.
-    gke_server_config: holds valid gke versions and recommended default version.
-
-  Returns:
-    Tuple of
-    int: 0 if successful and 1 otherwise.
-    str: gke control plane version to use.
-  """
-
-  # Override with user provided gke version if specified.
-  if args.gke_version is not None:
-    master_gke_version = args.gke_version
-  else:
-    master_gke_version = gke_server_config.default_rapid_gke_version
-
-  is_valid_version = master_gke_version in gke_server_config.valid_versions
-
-  if not is_valid_version:
-    xpk_print(
-        f'Planned GKE Version: {master_gke_version}\n Valid Versions:'
-        f'\n{gke_server_config.valid_versions}\nRecommended / Default GKE'
-        f' Version: {gke_server_config.default_rapid_gke_version}'
-    )
-    xpk_print(
-        f'Error: Planned GKE Version {master_gke_version} is not valid.'
-        f' Checks failed: Is Version Valid: {is_valid_version}'
-    )
-    xpk_print(
-        'Please select a gke version from the above list using --gke-version=x'
-        ' argument or rely on the default gke version:'
-        f' {gke_server_config.default_rapid_gke_version}'
-    )
-    return 1, None
-
-  return 0, master_gke_version
-
-
-def get_gke_node_pool_version(
-    args, gke_server_config: GkeServerConfig
-) -> tuple[int, str | None]:
-  """Determine the gke node pool version for the node pool.
-
-  Args:
-    args: user provided arguments for running the command.
-    gke_server_config: holds valid gke versions and recommended default version.
-
-  Returns:
-    Tuple of
-    int: 0 if successful and 1 otherwise.
-    str: gke node pool version to use.
-  """
-
-  # By default use the current gke master version for creating node pools.
-  command_description = 'Determine current gke master version'
-  command = (
-      f'gcloud beta container clusters describe {args.cluster}'
-      f' --region {zone_to_region(args.zone)} --project {args.project}'
-      ' --format="value(currentMasterVersion)"'
-  )
-
-  return_code, current_gke_master_version = run_command_for_value(
-      command, command_description, args
-  )
-  if return_code != 0:
-    xpk_print(
-        f'Unable to get server config for command: {command_description}.'
-    )
-    return return_code, None
-
-  # Override with user provided gke version if specified.
-  if args.gke_version is not None:
-    node_pool_gke_version = args.gke_version
-  else:
-    master_gke_version = current_gke_master_version.strip()
-    node_pool_gke_version = ''
-    # Select minimum version which is >= master_gke_version and has the same minor version.
-    # If this does not exist select maximum version which is < master_gke_version.
-    for version in gke_server_config.valid_versions:
-      if (
-          (node_pool_gke_version == '' or node_pool_gke_version < version)
-          and version < master_gke_version
-      ) or (
-          (node_pool_gke_version == '' or node_pool_gke_version > version)
-          and master_gke_version <= version
-          and master_gke_version.split('.')[:2] == version.split('.')[:2]
-      ):
-        node_pool_gke_version = version
-
-  is_supported_node_pool_version = (
-      node_pool_gke_version in gke_server_config.valid_versions
-  )
-  # In rare cases, user's provided gke version may be invalid, but gke will return an error if so.
-  # An example scenario is if the user provided gke version is greater than the master version.
-  if not is_supported_node_pool_version:
-    xpk_print(
-        f'Planned node pool version {node_pool_gke_version} is not supported in'
-        ' valid version'
-        f' {gke_server_config.valid_versions}\nPlease adjust the gke version'
-        ' using --gke-version=x or remove the arg and depend on xpk default of'
-        f' {current_gke_master_version}'
-    )
-    return 1, None
-  return 0, node_pool_gke_version
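The selection loop in get_gke_node_pool_version above encodes the rule its comment states; here is that rule restated as a standalone sketch (using plain string comparison, as the original loop does, rather than full semantic-version ordering): prefer the smallest valid version at or above the master that shares the master's major.minor, else fall back to the largest valid version below the master.

def pick_node_pool_version(master: str, valid: set[str]) -> str:
  """Applies the documented node-pool version selection rule."""
  same_minor = [
      v for v in valid
      if v >= master and v.split('.')[:2] == master.split('.')[:2]
  ]
  if same_minor:
    return min(same_minor)
  older = [v for v in valid if v < master]
  return max(older) if older else ''

# Same minor available at or above the master: it wins.
assert pick_node_pool_version('1.29.2', {'1.29.1', '1.29.3'}) == '1.29.3'
# No matching minor: fall back to the newest version below the master.
assert pick_node_pool_version('1.30.0', {'1.29.1', '1.29.3'}) == '1.29.3'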
-
-
-def get_cluster_credentials(args: Namespace) -> None:
-  """Run cluster configuration command to set the kubectl config.
-
-  Args:
-    args: user provided arguments for running the command.
-
-  Returns:
-    None. Exits via xpk_exit if the credentials cannot be fetched.
-  """
-  command = (
-      'gcloud container clusters get-credentials'
-      f' {args.cluster} --region={zone_to_region(args.zone)}'
-      f' --project={args.project} &&'
-      ' kubectl config view && kubectl config set-context --current'
-      ' --namespace=default'
-  )
-  task = f'get-credentials to cluster {args.cluster}'
-  return_code = run_command_with_updates_retry(
-      command, task, args, verbose=False
-  )
-  if return_code != 0:
-    xpk_print(f'{task} returned ERROR {return_code}')
-    xpk_exit(return_code)
-
-
-def validate_docker_image(docker_image, args) -> int:
-  """Validates that the user provided docker image exists in your project.
-
-  Args:
-    docker_image: The docker image to verify.
-    args: user provided arguments for running the command.
-
-  Returns:
-    0 if successful and 1 otherwise.
-  """
-
-  project = args.project
-
-  if not any(repo in docker_image for repo in ['gcr.io', 'docker.pkg.dev']):
-    return 0
-
-  command = (
-      f'gcloud container images describe {docker_image} --project {project}'
-  )
-  return_code = run_command_with_updates(
-      command, 'Validate Docker Image', args, verbose=False
-  )
-  if return_code != 0:
-    xpk_print(
-        'Failed to validate your docker image, check that the docker image'
-        f' exists. You may be able to find the {docker_image} in {project}.'
-        ' If the docker image exists, the service account of this'
-        ' project may be missing the permissions to access the docker image.'
-    )
-    return return_code
-  else:
-    return 0
-
-
-def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
-  """Adds script dir to the base docker image and uploads the image.
-
-  Args:
-    args: user provided arguments for running the command.
-
-  Returns:
-    Tuple of:
-      0 if successful and 1 otherwise.
-      Name of the Docker image created.
-  """
-
-  # Pick a name for the docker image.
-  docker_image_prefix = os.getenv('USER', 'unknown')
-  docker_name = f'{docker_image_prefix}-runner'
-
-  script_dir_dockerfile = """FROM {base_docker_image}
-
-  # Set the working directory in the container
-  WORKDIR /app
-
-  # Copy all files from local workspace into docker container
-  COPY . .
-
-  WORKDIR /app
-  """
-
-  docker_file = script_dir_dockerfile.format(
-      base_docker_image=args.base_docker_image,
-  )
-  tmp = write_tmp_file(docker_file)
-  docker_build_command = (
-      f'docker buildx build --platform={PLATFORM} -f {str(tmp.file.name)} -t'
-      f' {docker_name} {args.script_dir}'
-  )
-  xpk_print(f'Building {args.script_dir} into docker image.')
-  return_code = run_command_with_updates(
-      docker_build_command,
-      'Building script_dir into docker image',
-      args,
-      verbose=verbose,
-  )
-  if return_code != 0:
-    xpk_print(
-        'Failed to add script_dir to docker image, check the base docker image.'
-        f' You should be able to navigate to the URL {args.base_docker_image}'
-        f' in {args.project}.'
-    )
-    xpk_exit(1)
-
-  # Pick a randomly generated `tag_length` character docker tag.
-  tag_length = 4
-  tag_random_prefix = ''.join(
-      random.choices(string.ascii_lowercase, k=tag_length)
-  )
-  tag_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
-  tag_name = f'{tag_random_prefix}-{tag_datetime}'
-  cloud_docker_image = f'gcr.io/{args.project}/{docker_name}:{tag_name}'
-  xpk_print(f'Adding Docker Image: {cloud_docker_image} to {args.project}')
-
-  # Tag the docker image.
-  tag_docker_image_command = f'docker tag {docker_name} {cloud_docker_image}'
-  return_code = run_command_with_updates(
-      tag_docker_image_command, 'Tag Docker Image', args, verbose=verbose
-  )
-  if return_code != 0:
-    xpk_print(
-        f'Failed to tag docker image with tag: {tag_name}.'
-        f' You should be able to navigate to the URL {cloud_docker_image} in'
-        f' {args.project}.'
-    )
-    xpk_exit(1)
-
-  # Upload image to Artifact Registry.
-  upload_docker_image_command = f'docker push {cloud_docker_image}'
-  return_code = run_command_with_updates(
-      upload_docker_image_command, 'Upload Docker Image', args, verbose=verbose
-  )
-  if return_code != 0:
-    xpk_print(
-        'Failed to upload docker image.'
-        f' You should be able to navigate to the URL {cloud_docker_image} in'
-        f' {args.project}.'
-    )
-    xpk_exit(1)
-  return return_code, cloud_docker_image
-
-
-def check_if_workload_exists(args) -> bool:
-  """Check if workload exists.
-
-  Args:
-    args: user provided arguments for running the command.
-
-  Returns:
-    returns true if workload exists, otherwise returns false.
-  """
-  columns = {
-      'Jobset': '.metadata.ownerReferences[0].name',
-  }
-
-  s = ','.join([key + ':' + value for key, value in columns.items()])
-
-  command = f"kubectl get workloads -o=custom-columns='{s}'"
-  return_code, return_msg = run_command_for_value(
-      command, 'Check if Workload Already Exists', args
-  )
-
-  if return_code != 0:
-    xpk_print(f'List Job request returned ERROR {return_code}')
-    xpk_exit(return_code)
-
-  lines = return_msg.split('\n')
-  new_workload_name = args.workload
-  for line in lines:
-    if line == new_workload_name:
-      return True
-  return False
-
-
-def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
-  """Check if workload can schedule based on the cluster resources (tpu_type and maximum VM in cluster).
-
-  Args:
-    args: user provided arguments for running the command.
-    system: system characteristics
-
-  Returns:
-    returns true if workload can schedule, otherwise returns false.
-  """
-  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
-
-  # Prevents workload creation failure for existing clusters with no ConfigMap
-  if cluster_config_map is None:
-    xpk_print(
-        'No ConfigMap exists for cluster with the name'
-        f' {resources_configmap_name}.'
-    )
-    return True
-
-  # Check for gke accelerator type:
-  missing_gke_accelerator_type = False
-  if not cluster_config_map.get(system.gke_accelerator):
-    xpk_print(
-        f'Gke Accelerator Type Check: {args.workload} is requesting'
-        f' {system.gke_accelerator} but cluster only contains'
-        f' {cluster_config_map.keys()}. '
-    )
-    missing_gke_accelerator_type = True
-  elif (
-      cluster_config_map[system.gke_accelerator]
-      == AUTOPROVISIONING_CONFIG_VALUE
-  ):
-    # Run total chip check when in autoprovisioning mode.
-    max_chips_in_cluster = int(
-        cluster_config_map[AUTOPROVISIONING_CONFIG_MAXIMUM_KEY]
-    )
-    num_chips_in_workload = get_total_chips_requested_from_args(args, system)
-
-    if num_chips_in_workload > max_chips_in_cluster:
-      xpk_print(
-          f'{args.workload} is requesting {num_chips_in_workload} chips but'
-          f' the cluster {args.cluster} supports up to {max_chips_in_cluster}.'
-          ' Resize the cluster to support more chips with'
-          ' `xpk cluster create --autoprovisioning-max-chips=X ...`'
-      )
-      return False
-    return True
-
-  # Check for device type
-  missing_device_type = False
-  device_type = system.device_type
-  if device_type not in cluster_config_map:
-    xpk_print(
-        f'Device Type Check: {args.workload} is requesting {device_type} but '
-        f'cluster only contains {cluster_config_map.keys()}. '
-    )
-    missing_device_type = True
-
-  if missing_device_type and missing_gke_accelerator_type:
-    xpk_print(
-        'Both Device Type and GKE Accelerator Type checks failed.'
-        f' XPK will not create the workload {args.workload}.'
-    )
-    return False
-  else:
-    # Check if the size of the workload will fit in the cluster.
-    max_vm_in_cluster = int(cluster_config_map[device_type])
-    if system.accelerator_type == AcceleratorType['GPU']:
-      vm_required_by_workload = args.num_nodes
-    else:
-      vm_required_by_workload = args.num_slices * system.vms_per_slice
-    if vm_required_by_workload > max_vm_in_cluster:
-      xpk_print(
-          f'{args.workload} is requesting {args.num_slices} slice/slices of'
-          f' {device_type}, which is {vm_required_by_workload} VMs, but the'
-          f' cluster only contains {max_vm_in_cluster} VMs of {device_type}.'
-          ' XPK will not create this workload.'
-      )
-      return False
-
-  return True
-
-
-def use_base_docker_image_or_docker_image(args) -> bool:
-  """Checks for correct docker image arguments.
-
-  Args:
-    args: user provided arguments for running the command.
-
-  Returns:
-    True if intended to use base docker image, False to use docker image.
-  """
-  use_base_docker_image = True
-  # Check if (base_docker_image and script_dir) or (docker_image) is set.
-  if args.docker_image is not None:
-    if args.script_dir is not default_script_dir:
-      xpk_print(
-          '`--script-dir` and `--docker-image` can not be used together. Please'
-          ' see `--help` command for more details.'
-      )
-      xpk_exit(1)
-    if args.base_docker_image is not default_docker_image:
-      xpk_print(
-          '`--base-docker-image` and `--docker-image` can not be used together.'
-          ' Please see `--help` command for more details.'
-      )
-      xpk_exit(1)
-    use_base_docker_image = False
-  return use_base_docker_image
-
-
-def setup_docker_image(args) -> tuple[int, str]:
-  """Does steps to verify docker args, check image, and build image (if asked).
-
-  Args:
-    args: user provided arguments for running the command.
-
-  Returns:
-    tuple:
-      0 if successful and 1 otherwise.
-      Name of the docker image to use.
-  """
-  use_base_docker_image = use_base_docker_image_or_docker_image(args)
-
-  docker_image = args.base_docker_image
-  if use_base_docker_image:
-    validate_docker_image_code = validate_docker_image(docker_image, args)
-    if validate_docker_image_code != 0:
-      xpk_exit(validate_docker_image_code)
-    build_docker_image_code, docker_image = build_docker_image_from_base_image(
-        args
-    )
-    if build_docker_image_code != 0:
-      xpk_exit(build_docker_image_code)
-  else:
-    docker_image = args.docker_image
-    validate_docker_image_code = validate_docker_image(args.docker_image, args)
-    if validate_docker_image_code != 0:
-      xpk_exit(validate_docker_image_code)
-
-  return 0, docker_image
-
-
-def get_main_and_sidecar_container(args, system, docker_image) -> str:
-  """Generate yaml for main and sidecar container.
-  Args:
-    args: user provided arguments for running the command.
- system: system characteristics - docker_image: docker image - - Returns: - str: - yaml for main and sidecar container - """ - resource_type = AcceleratorTypeToAcceleratorCharacteristics[ - system.accelerator_type - ].resource_type - main_container = get_main_container(args, system, docker_image, resource_type) - yaml = """- name: stacktrace-explorer - image: busybox:1.28 - args: [/bin/sh, -c, "check_signal() (while [ ! -f /shared-volume/stacktrace_signal ]; do sleep 1; done; pid=$(pidof 'tail'); kill $pid;); check_signal & while [ ! -d /tmp/debugging ]; do sleep 60; done; while [ ! -e /tmp/debugging/* ]; do sleep 60; done; tail -n+1 -f /tmp/debugging/*; exit 0;"] - volumeMounts: - - name: tpu-stack-trace - readOnly: true - mountPath: /tmp/debugging - - name: shared-data - mountPath: /shared-volume - {main_container} - """ - return yaml.format(main_container=main_container) - - -def get_main_container(args, system, docker_image, resource_type) -> str: - """Generate yaml for main container including the xpk command. - Args: - args: user provided arguments for running the command. - system: system characteristics - docker_image: docker image - resource_type: The label to describe the resource type for TPUs/GPUs/CPUs. - - Returns: - str: - yaml for main container - """ - - xpk_internal_commands = '' - gsutil_test_command = '' - if not args.use_pathways and args.debug_dump_gcs: - gsutil_test_command = ( - 'which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil' - ' is required but not installed. Aborting"; exit 24;};' - ) - xpk_internal_commands += ( - 'WORKER_ID=$HOSTNAME;' - f'gsutil -m cp -r /tmp/xla_dump/ {args.debug_dump_gcs}/$WORKER_ID;' - ) - - command = args.command - if args.enable_debug_logs: - command = ( - 'export TPU_STDERR_LOG_LEVEL=0 &&' - ' export TPU_MIN_LOG_LEVEL=0 &&' - ' export TF_CPP_MIN_LOG_LEVEL=0 &&' - ' export TPU_VMODULE=real_program_continuator=1 &&' - f' {args.command}' - ) - - gpu_workload_terminate_command = '' - if system.accelerator_type == AcceleratorType['GPU']: - gpu_workload_terminate_command = ( - 'echo Main app is done > /usr/share/workload/workload_terminated; ' - ) - - tpu_stacktrace_terminate_command = '' - if ( - not args.use_pathways - and system.accelerator_type == AcceleratorType['TPU'] - and args.deploy_stacktrace_sidecar - ): - tpu_stacktrace_terminate_command = ( - 'touch /shared-volume/stacktrace_signal; ' - ) - - yaml = """- name: {docker_name} - image: {docker_image} - {image_pull_policy} - env: {env} - ports: - {container_ports} - {jax_coordinator_port} - securityContext: - privileged: true - command: - - bash - - -c - - | - echo XPK Start: $(date); - _sigterm() (kill -SIGTERM $! 
2>/dev/null;); - trap _sigterm SIGTERM; - {gsutil_test_command} - ({command}) & PID=$!; - while kill -0 $PID 2>/dev/null; - do sleep 5; - done; - wait $PID; - EXIT_CODE=$?; - {xpk_internal_commands} - echo XPK End: $(date); - echo EXIT_CODE=$EXIT_CODE; - {tpu_stacktrace_terminate_command} - {gpu_workload_terminate_command} - exit $EXIT_CODE - resources: - limits: - {resources} -""" - volume_mounts = get_volume_mounts(args, system) - if volume_mounts != '': - yaml += """ - volumeMounts: - {volume_mounts} -""" - return yaml.format( - args=args, - system=system, - image_pull_policy=add_image_pull_policy_for_pw_or_gpu(args, system), - env=get_env_container(args, system), - container_ports=add_container_ports(args, system), - jax_coordinator_port=add_jax_coordinator_port(system), - docker_name=get_main_container_docker_image(args, system), - docker_image=docker_image, - gsutil_test_command=gsutil_test_command, - command=command, - tpu_stacktrace_terminate_command=tpu_stacktrace_terminate_command, - gpu_workload_terminate_command=gpu_workload_terminate_command, - xpk_internal_commands=xpk_internal_commands, - resources=get_main_container_resources(args, system, resource_type), - volume_mounts=volume_mounts, - ) - - -def add_image_pull_policy_for_pw_or_gpu(args, system: SystemCharacteristics): - """Add image pull policy only for Pathways containers. - Args: - args: user provided args. - system: system characteristics - - Returns: - str: - YAML stating that the image will be pulled fro GCR every time. - """ - yaml = """imagePullPolicy: Always""" - - if args.use_pathways or system.accelerator_type == AcceleratorType['GPU']: - return yaml.format(args=args) - return '' - - -def get_main_container_docker_image(args, system: SystemCharacteristics) -> str: - """Docker name for the main container. - Args: - args: user provided args. - system: system characteristics. - - Returns: - str: - Workload docker image as a YAML string - """ - - if system.accelerator_type == AcceleratorType['GPU']: - return 'gpu-image' - - return f'{args.docker_name}' - - -def get_volumes(args, system: SystemCharacteristics) -> str: - """Get volumes accessible to the containers in the pod. - Args: - args: user provided args. - system: system characteristics. - - Returns: - str: - YAML for the volumes. - """ - volumes = """- emptyDir: - medium: Memory - name: dshm-2 - """ - - if args.ramdisk_directory != '': - volumes += """ - - name: cache - csi: - driver: phase1-checkpoint.csi.storage.gke.io""" - - if ( - system.accelerator_type == AcceleratorType['TPU'] - and args.deploy_stacktrace_sidecar - ): - volumes += """ - - name: tpu-stack-trace - - name: shared-data - """ - - storages: list[Storage] = get_storages_to_mount( - setup_k8s_env(args), args.storage - ) - for storage in storages: - if storage.type == GCS_FUSE_TYPE: - volumes += f"""- name: {storage.pv} - persistentVolumeClaim: - claimName: {storage.pvc} - readOnly: {storage.readonly} - """ - return volumes - - -def get_volume_mounts(args, system: SystemCharacteristics) -> str: - """Resources for the main container. - Args: - args: user provided args. - - Returns: - str: - YAML for the volumes mounted within a Pathways container or GPU container as a YAML string. 
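The bash wrapper assembled in get_main_container above supervises the user command: it traps SIGTERM, polls the child with kill -0, and propagates the child's exit code. A rough Python rendering of the same control flow (a sketch for illustration, not code from the patch):

import signal
import subprocess
import sys

def run_like_the_wrapper(user_command: str) -> int:
  # Launch the user command, as `({command}) & PID=$!` does in bash.
  child = subprocess.Popen(['bash', '-c', user_command])
  # Forward SIGTERM to the child, mirroring the `_sigterm` trap.
  signal.signal(signal.SIGTERM, lambda signum, frame: child.terminate())
  child.wait()
  print(f'EXIT_CODE={child.returncode}')  # like `echo EXIT_CODE=$EXIT_CODE`
  return child.returncode

if __name__ == '__main__':
  sys.exit(run_like_the_wrapper('echo hello from the workload'))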
- """ - volume_mount_yaml = """- mountPath: /dev/shm - name: dshm-2 - """ - - if args.ramdisk_directory != '': - volume_mount_yaml += f""" - - mountPath: /{args.ramdisk_directory} - name: cache""" - - if args.use_pathways: - volume_mount_yaml = """- mountPath: /tmp - name: shared-tmp - """ - elif ( - system.accelerator_type == AcceleratorType['TPU'] - and args.deploy_stacktrace_sidecar - ): - volume_mount_yaml += """- name: tpu-stack-trace - mountPath: /tmp/debugging - - name: shared-data - mountPath: /shared-volume - """ - elif system.accelerator_type == AcceleratorType['GPU']: - if system.device_type == h100_device_type: - volume_mount_yaml = """- name: nvidia-install-dir-host - mountPath: /usr/local/nvidia/lib64 - - name: tcpx-nccl-plugin-volume - mountPath: /usr/local/tcpx - - name: tcpd-socket - mountPath: /tmp - - name: shared-memory - mountPath: /dev/shm - - name: workload-terminated-volume - mountPath: /usr/share/workload""" - elif ( - system.device_type == h100_mega_device_type - or system.device_type == h200_device_type - ): - volume_mount_yaml = '' - - storages: list[Storage] = get_storages_to_mount( - setup_k8s_env(args), args.storage - ) - for storage in storages: - if storage.type == GCS_FUSE_TYPE: - volume_mount_yaml += f"""- name: {storage.pv} - mountPath: {storage.mount_point} - readOnly: {storage.readonly} - """ - return volume_mount_yaml - - -def get_user_workload_container(args, system: SystemCharacteristics): - """Deploy user workload container - - Args: - args: user provided args. - system: system characteristics. - - Returns: - container: main container - debugging_dashboard_id: id of the GKE dashboard - """ - - setup_docker_image_code, docker_image = setup_docker_image(args) - if setup_docker_image_code != 0: - xpk_exit(setup_docker_image_code) - - # Determine if we deploy a sidecar and if we deploy a container. - debugging_dashboard_id = None - resource_type = AcceleratorTypeToAcceleratorCharacteristics[ - system.accelerator_type - ].resource_type - if ( - not args.use_pathways - and system.accelerator_type == AcceleratorType['TPU'] - and args.deploy_stacktrace_sidecar - ): - xpk_print( - 'Sidecar container to display stack traces for TPU workloads will also' - ' be deployed.' - ) - container = get_main_and_sidecar_container(args, system, docker_image) - # Get GKE debugging dashboard only when sidecar container is deployed for TPU workloads - debugging_dashboard_id = get_gke_debugging_dashboard(args) - else: - container = get_main_container(args, system, docker_image, resource_type) - return container, debugging_dashboard_id - - -def get_env_container(args, system: SystemCharacteristics): - """Environment configuration for the main container. - Args: - args: user provided args. - system: system characteristics. - - Returns: - str: - YAML with the env config for the main container, as a YAML string. 
- """ - pw_env_yaml = """ - - name: XCLOUD_ENVIRONMENT - value: GCP - - name: JAX_PLATFORMS - value: proxy - - name: JAX_BACKEND_TARGET - value: {proxy_address} - - name: JOBSET_NAME - valueFrom: - fieldRef: - fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']""" - if args.use_pathways: - return pw_env_yaml.format( - args=args, proxy_address=args.pathways_proxy_address - ) - - gpu_env_yaml = """ - - name: REPLICATED_JOB_NAME - valueFrom: - fieldRef: - fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name'] - - name: JOBSET_NAME - valueFrom: - fieldRef: - fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name'] - - name: JAX_COORDINATOR_ADDRESS - value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)" - - name: NNODES - value: "{args.num_nodes}" - - name: NODE_RANK - valueFrom: - fieldRef: - fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] - - name: USE_GPUDIRECT - value: {gpu_direct_name} - - name: GPUS_PER_NODE - value: "{system.chips_per_vm}" - - name: JAX_COORDINATOR_PORT - value: "6002" - - name: COMMAND - value: "{args.command}" - {args.env}""" - - if system.accelerator_type == AcceleratorType['GPU']: - gpu_direct_name = 'fastrak' - if args.device_type == h100_device_type: - gpu_direct_name = 'tcpx' - gpu_env_yaml += """ - - name: LD_LIBRARY_PATH - value: /usr/local/nvidia/lib64 -""" - elif args.device_type == h100_mega_device_type: - gpu_direct_name = 'tcpxo' - elif args.device_type == h200_device_type: - gpu_direct_name = 'rdma' - return gpu_env_yaml.format( - args=args, system=system, gpu_direct_name=gpu_direct_name - ) - - if system.accelerator_type == AcceleratorType['CPU']: - return get_cpu_env(args.num_slices, args.env, system) - - return args.env - - -def get_main_container_resources( - args, system: SystemCharacteristics, resource_type -) -> str: - """Resources for the main container. - Args: - args: user provided args. - system: system characteristics. - resource_type: TPU / GPU / CPU - - Returns: - str: - Workload resources port as a YAML string - """ - # Resources requirements for Pathways workload containers are known. - resources_yaml = """cpu: "24" - memory: 100G""" - if args.use_pathways: - return resources_yaml - - gpu_resources_yaml = """nvidia.com/gpu: {system.chips_per_vm}""" - if system.accelerator_type == AcceleratorType['GPU']: - return gpu_resources_yaml.format(system=system) - - if system.accelerator_type == AcceleratorType['CPU']: - # CPUs don't have chips, but have a subresource called vCPUs. - # system.chips_per_vm is used as a proxy for vCPUs. - # Some vCPUs get used in hosting system pods of the workloads, - # hence an offset of 0.95 is introduced. - offset_vCPUs = int(system.chips_per_vm) * 0.95 - return f'{resource_type}: {offset_vCPUs}' - - return f'{resource_type}: {system.chips_per_vm}' - - -def add_container_ports(args, system: SystemCharacteristics) -> str: - """Add slice builder and megascale container ports, - for non-pathways workloads. - - Args: - args: user provided args. - - Returns: - str: - Pathways server port as a YAML string - """ - port_yaml = """- containerPort: 8471 - - containerPort: 8080""" - if args.use_pathways: - return '' - - gpu_port_yaml = """- containerPort: 6002""" - if system.accelerator_type == AcceleratorType['GPU']: - return gpu_port_yaml - return port_yaml - - -def add_jax_coordinator_port(system) -> str: - """Add jax coordinator port only for CPUs - - Args: - system: system characteristics. 
- - Returns: - str: - jax coordinator port as a YAML string - """ - if system.accelerator_type == AcceleratorType['CPU']: - return '- containerPort: 1234' - return '' - - -def get_gke_dashboard(args, dashboard_filter): - """Get the identifier of GKE dashboard deployed in the project. - - Args: - args: user provided arguments for running the command. - - Returns: - bool: - True if 'gcloud monitoring dashboards list' returned an error or - multiple dashboards with same filter exist in the project, - False otherwise. - str: - identifier of dashboard if deployed in project, - None otherwise. - """ - command = ( - 'gcloud monitoring dashboards list' - f' --project={args.project} --filter="{dashboard_filter}"' - ' --format="value(name)" --verbosity=error' - ) - - return_code, return_value = run_command_for_value( - command, 'GKE Dashboard List', args - ) - - if return_code != 0: - xpk_print( - f'GKE Dashboard List request returned ERROR {return_code}. If there is' - ' a permissions error, please check' - ' https://github.com/google/xpk/blob/main/README.md#roles-needed-based-on-permission-errors' - ' for possible solutions.' - ) - return True, None - - if not return_value: - xpk_print( - f'No dashboard with {dashboard_filter} found in the' - f' project:{args.project}.' - ) - return False, return_value - - dashboards = return_value.strip().split('\n') - if len(dashboards) > 1: - xpk_print( - f'Multiple dashboards with same {dashboard_filter} exist in the' - f' project:{args.project}. Delete all but one dashboard deployed using' - ' https://github.com/google/cloud-tpu-monitoring-debugging.' - ) - return True, None - - if dashboards[0]: - return False, dashboards[0].strip().split('/')[-1] - - return True, None - - -def get_gke_outlier_dashboard(args): - """Get the identifier of GKE outlier dashboard deployed in the project. - - Args: - args: user provided arguments for running the command. - - Returns: - str: - identifier of outlier dashboard if deployed in project, - None otherwise. - """ - outlier_dashboard_filter = "displayName:'GKE - TPU Monitoring Dashboard'" - is_error, dashboard_id = get_gke_dashboard(args, outlier_dashboard_filter) - - # 'gcloud monitoring dashboards list' returned an error or multiple dashboards with same filter exist in the project - if is_error: - return None - - # 'gcloud monitoring dashboards list' succeeded but no dashboard for the filter exist in the project - if not is_error and not dashboard_id: - xpk_print( - 'Follow https://github.com/google/cloud-tpu-monitoring-debugging to' - ' deploy monitoring dashboard to view statistics and outlier mode of' - ' GKE metrics.' - ) - return None - - return dashboard_id - - -def get_gke_debugging_dashboard(args): - """Get the identifier of GKE debugging dashboard deployed in the project. - - Args: - args: user provided arguments for running the command. - - Returns: - str: - identifier of debugging dashboard if deployed in project, - None otherwise. 
- """ - debugging_dashboard_filter = "displayName:'GKE - TPU Logging Dashboard'" - is_error, dashboard_id = get_gke_dashboard(args, debugging_dashboard_filter) - - # 'gcloud monitoring dashboards list' returned an error or multiple dashboards with same filter exist in the project - if is_error: - return None - - # 'gcloud monitoring dashboards list' succeeded but no dashboard for the filter exist in the project - if not is_error and not dashboard_id: - xpk_print( - 'Follow https://github.com/google/cloud-tpu-monitoring-debugging to' - ' deploy debugging dashboard to view stack traces collected in Cloud' - ' Logging.' - ) - return None - - return dashboard_id - - -def create_accelerator_label(accelerator_type, system) -> str: - """Generates accelerator label. - - Args: - accelerator_type: type of accelerator. - system: system characteristics. - - Returns: - The accelerator label. - """ - if accelerator_type == AcceleratorType['CPU']: - return '' - return ( - f'{AcceleratorTypeToAcceleratorCharacteristics[accelerator_type].accelerator_label}:' - f' {system.gke_accelerator}' - ) - - -def create_machine_label( - accelerator_type, system, autoprovisioning_enabled: bool = False -) -> str: - """Generates machine label. - - Args: - accelerator_type: type of accelerator. - system: system characteristics. - autoprovisioning_enabled: describes autoprovisioning enablement. - - Returns: - The machine label. - """ - if ( - accelerator_type == AcceleratorType['TPU'] - and not autoprovisioning_enabled - ): - return ( - f'{AcceleratorTypeToAcceleratorCharacteristics[accelerator_type].machine_label}:' - f' {system.topology}' - ) - return '' - - -def calculate_process_count(num_slices, vms_per_slice) -> str: - """Calculates the total number of processes in the workload. - Args: - num_slices: Number of slices to be used in the workload. - vms_per_slice: number of VMs in each slice. - - Returns: - str: total number of processes. - """ - num_processes = int(num_slices) * int(vms_per_slice) - - return f'{num_processes}' - - -def get_cpu_env(num_slices, env_vars, system) -> str: - """Generate environment variables for CPU nodepools - Args: - num_slices: Number of slices to be used in the workload. - env_vars: Environment variables, processed from user args. - system: system characteristics - - Returns: - str: yaml containing env variables - """ - yaml = """ - - name: REPLICATED_JOB_NAME - valueFrom: - fieldRef: - fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name'] - - name: JOB_INDEX - valueFrom: - fieldRef: - fieldPath: metadata.annotations['jobset.sigs.k8s.io/job-index'] - - name: JOB_COMPLETION_INDEX - valueFrom: - fieldRef: - fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] - - name: PROCESSES_IN_JOB - value: "{processes_in_job}" - - name: JAX_PROCESS_COUNT - value: "{process_count}" - {env_vars} - - name: JAX_COORDINATOR_ADDRESS - value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)" - """ - return yaml.format( - processes_in_job=system.vms_per_slice, - process_count=calculate_process_count(num_slices, system.vms_per_slice), - env_vars=env_vars, - ) - - -def get_cpu_affinity(accelerator_type) -> str: - """Generate affinity rules for CPU nodepools, so that workload pods are - not scheduled on the default pool machines. 
- Args: - accelerator_type: TPU / GPU / CPU - - Returns: - str: yaml containing affinity constraints - """ - yaml = """affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: cloud.google.com/gke-nodepool - operator: NotIn - values: - - default-pool -""" - if accelerator_type == AcceleratorType['CPU']: - return yaml - return '' - - -def get_gpu_scheduler( - args, system: SystemCharacteristics, autoprovisioning_args: str -) -> tuple[str, int]: - """Get gpu scheduler configuration. - - Args: - args: user provided arguments for running the command. - system: system characteristics. - autoprovisioning_args: a string of arguments for Autoprovisioning. - - Returns: - str: yaml containing gpu scheduler configuration - int of 0 if successful and 1 otherwise. - """ - gpu_scheduler = '' - return_code = 0 - - if args.scheduler == 'gke.io/topology-aware-auto': - gpu_scheduler = f"""schedulingGates: - - name: "{args.scheduler}-{args.workload}" - """ - elif args.scheduler == 'default-scheduler': - gpu_scheduler_yaml = """schedulerName: {scheduler_name} - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: cloud.google.com/gke-accelerator - operator: Exists - - key: cloud.google.com/gke-nodepool - operator: In - values: [{node_pool_name}] - nodeSelector: - {accelerator_label} - {machine_label} - {autoprovisioning_args} - """ - gpu_scheduler = gpu_scheduler_yaml.format( - scheduler_name=args.scheduler, - accelerator_label=create_accelerator_label( - system.accelerator_type, system - ), - machine_label=create_machine_label(system.accelerator_type, system), - node_pool_name=f'{args.cluster}-np-0', - autoprovisioning_args=autoprovisioning_args, - ) - else: - return_code = 1 - xpk_print( - '--scheduler needs to be set as either `default-scheduler`' - ' or `gke.io/topology-aware-auto` in order to schedule the' - ' workloads on GPUs.' - ) - - return gpu_scheduler, return_code - - -def get_gpu_volume(system: SystemCharacteristics) -> str: - """Get gpu volume based on user provided arguments. - - Args: - system: system characteristics. - - Returns: - str: yaml containing gpu volume - """ - gpu_volume = '' - if system.device_type == h100_device_type: - gpu_volume = """- name: nvidia-install-dir-host - hostPath: - path: /home/kubernetes/bin/nvidia/lib64 - - name: tcpd-socket - hostPath: - path: /run/tcpx - - name: shared-memory - emptyDir: - medium: "Memory" - sizeLimit: 200Gi - - name: workload-terminated-volume - emptyDir: - - name: tcpx-nccl-plugin-volume - emptyDir:""" - elif system.device_type == h100_mega_device_type: - gpu_volume = """- name: nvidia-install-dir-host - hostPath: - path: /home/kubernetes/bin/nvidia/lib64 - - name: shared-memory - emptyDir: - medium: "Memory" - sizeLimit: 1Gi - - name: workload-terminated-volume - emptyDir:""" - return gpu_volume - - -def get_gpu_rxdm_image(system: SystemCharacteristics) -> str: - """Get config of rxdm based on user provided arguments. - - Args: - system: system characteristics. 
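For the topology-aware scheduler, get_gpu_scheduler above emits a schedulingGates stanza keyed on the scheduler and workload names. A sketch of the rendered fragment with a hypothetical workload name:

scheduler = 'gke.io/topology-aware-auto'
workload = 'my-workload'  # hypothetical
gpu_scheduler = f"""schedulingGates:
- name: "{scheduler}-{workload}"
"""
print(gpu_scheduler)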
- - Returns: - str: yaml containing the rxdm name and image - """ - gpu_rxdm_image = '' - if system.device_type == h100_device_type: - gpu_rxdm_image = """- name: tcpd-daemon - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.9""" - elif system.device_type == h100_mega_device_type: - gpu_rxdm_image = """- name: fastrak-daemon - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.9""" - return gpu_rxdm_image - - -def get_gpu_rxdm_cmd(system: SystemCharacteristics) -> str: - """Get rxdm command based on user provided arguments. - - Args: - system: system characteristics. - - Returns: - str: command of running rxdm container - """ - gpu_rxdm_cmd = '' - if system.device_type == h100_device_type: - gpu_rxdm_cmd = ( - '/tcpgpudmarxd/build/app/tcpgpudmarxd --gpu_nic_preset a3vm' - ' --gpu_shmem_type fd --setup_param "--verbose 128 2 0"' - ) - elif system.device_type == h100_mega_device_type: - gpu_rxdm_cmd = ( - 'set -ex; chmod 755 /fts/entrypoint_rxdm_container.sh;' - ' /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid=' - ' --alsologtostderr' - ) - return gpu_rxdm_cmd - - -def get_gpu_tcp_volume(system: SystemCharacteristics) -> str: - """Get gpu tcp volume based on user provided arguments. - - Args: - system: system characteristics. - - Returns: - str: yaml containing gpu tcp volume - """ - gpu_tcp_volume = '' - if system.device_type == h100_device_type: - gpu_tcp_volume = """- name: tcpd-socket - mountPath: /tmp""" - return gpu_tcp_volume - - -def wait_for_job_completion(args) -> int: - """Function to wait for job completion. - - Args: - args: user provided arguments for running the command. - - Returns: - return_code: 0 if successful, 124 if timeout, 125 if unsuccessful job, 1 otherwise - """ - # Check that the workload exists - args.workload = args.wait_for_job_completion - workload_exists = check_if_workload_exists(args) - if not workload_exists: - xpk_print(f'Workload named {args.workload} does not exist.') - return 1 - - # Get the full workload name - get_workload_name_cmd = f'kubectl get workloads | grep jobset-{args.workload}' - return_code, return_value = run_command_for_value( - get_workload_name_cmd, 'Get full workload name', args - ) - if return_code != 0: - xpk_print(f'Get full workload name request returned ERROR {return_code}') - return return_code - full_workload_name = return_value.split(' ')[0] - - # Call kubectl wait on the workload using the full workload name - timeout_val = args.timeout if args.timeout is not None else -1 - timeout_msg = ( - f'{timeout_val}s' if timeout_val != -1 else 'max timeout (1 week)' - ) - wait_cmd = ( - "kubectl wait --for jsonpath='.status.conditions[-1].type'=Finished" - f' workload {full_workload_name} --timeout={timeout_val}s' - ) - return_code, return_value = run_command_for_value( - wait_cmd, - f'Wait for workload to finish with timeout of {timeout_msg}', - args, - print_timer=True, - ) - if return_code != 0: - if 'timed out' in return_value: - xpk_print( - f'Timed out waiting for your workload after {timeout_msg}, see your' - ' workload here:' - # pylint: disable=line-too-long - f' https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}' - ) - return 124 - else: - xpk_print(f'{return_value}') - xpk_print(f'Wait for workload returned ERROR {return_code}') - return return_code - xpk_print( - 'Finished waiting for your workload, see your workload here:' - # pylint: disable=line-too-long 
- f' https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}' - ) - status_cmd = ( - f'kubectl get jobset {args.workload} -o' - " jsonpath='{.status.conditions[-1].type}'" - ) - return_code, return_value = run_command_for_value( - status_cmd, 'Get jobset status', args - ) - if return_code != 0: - xpk_print(f'Get workload status request returned ERROR {return_code}') - return return_code - xpk_print(f'Your workload finished with status: {return_value}') - if return_value != 'Completed': - xpk_print('Your workload did not complete successfully') - return 125 - return 0 diff --git a/src/xpk/core/docker_container.py b/src/xpk/core/docker_container.py new file mode 100644 index 000000000..a6dede979 --- /dev/null +++ b/src/xpk/core/docker_container.py @@ -0,0 +1,225 @@ +""" +Copyright 2025 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from ..utils.console import xpk_exit, xpk_print +from .docker_image import setup_docker_image +from .docker_resources import ( + add_container_ports, + add_image_pull_policy_for_pw_or_gpu, + add_jax_coordinator_port, + get_env_container, + get_main_container_resources, + get_volume_mounts, +) +from .monitoring import get_gke_debugging_dashboard +from .system_characteristics import ( + AcceleratorType, + AcceleratorTypeToAcceleratorCharacteristics, + SystemCharacteristics, +) + + +def get_main_and_sidecar_container(args, system, docker_image) -> str: + """Generate yaml for main and sidecar container. + Args: + args: user provided arguments for running the command. + system: system characteristics + docker_image: docker image + + Returns: + str: + yaml for main and sidecar container + """ + resource_type = AcceleratorTypeToAcceleratorCharacteristics[ + system.accelerator_type + ].resource_type + main_container = get_main_container(args, system, docker_image, resource_type) + yaml = """- name: stacktrace-explorer + image: busybox:1.28 + args: [/bin/sh, -c, "check_signal() (while [ ! -f /shared-volume/stacktrace_signal ]; do sleep 1; done; pid=$(pidof 'tail'); kill $pid;); check_signal & while [ ! -d /tmp/debugging ]; do sleep 60; done; while [ ! -e /tmp/debugging/* ]; do sleep 60; done; tail -n+1 -f /tmp/debugging/*; exit 0;"] + volumeMounts: + - name: tpu-stack-trace + readOnly: true + mountPath: /tmp/debugging + - name: shared-data + mountPath: /shared-volume + {main_container} + """ + return yaml.format(main_container=main_container) + + +def get_main_container(args, system, docker_image, resource_type) -> str: + """Generate yaml for main container including the xpk command. + Args: + args: user provided arguments for running the command. + system: system characteristics + docker_image: docker image + resource_type: The label to describe the resource type for TPUs/GPUs/CPUs. 
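The sidecar wired up in get_main_and_sidecar_container above coordinates with the main container through two shared volumes: the main container writes stack traces under /tmp/debugging and touches /shared-volume/stacktrace_signal when it exits, at which point the sidecar kills its `tail` and exits 0. A sketch of the signal half of that handshake in Python (paths as in the YAML; the busybox sidecar actually does this in shell):

import pathlib
import time

SIGNAL = pathlib.Path('/shared-volume/stacktrace_signal')

def wait_for_main_container(poll_seconds: float = 1.0) -> None:
  # Equivalent of the sidecar's check_signal loop.
  while not SIGNAL.exists():
    time.sleep(poll_seconds)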
+ + Returns: + str: + yaml for main container + """ + + xpk_internal_commands = '' + gsutil_test_command = '' + if not args.use_pathways and args.debug_dump_gcs: + gsutil_test_command = ( + 'which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil' + ' is required but not installed. Aborting"; exit 24;};' + ) + xpk_internal_commands += ( + 'WORKER_ID=$HOSTNAME;' + f'gsutil -m cp -r /tmp/xla_dump/ {args.debug_dump_gcs}/$WORKER_ID;' + ) + + command = args.command + if args.enable_debug_logs: + command = ( + 'export TPU_STDERR_LOG_LEVEL=0 &&' + ' export TPU_MIN_LOG_LEVEL=0 &&' + ' export TF_CPP_MIN_LOG_LEVEL=0 &&' + ' export TPU_VMODULE=real_program_continuator=1 &&' + f' {args.command}' + ) + + gpu_workload_terminate_command = '' + if system.accelerator_type == AcceleratorType['GPU']: + gpu_workload_terminate_command = ( + 'echo Main app is done > /usr/share/workload/workload_terminated; ' + ) + + tpu_stacktrace_terminate_command = '' + if ( + not args.use_pathways + and system.accelerator_type == AcceleratorType['TPU'] + and args.deploy_stacktrace_sidecar + ): + tpu_stacktrace_terminate_command = ( + 'touch /shared-volume/stacktrace_signal; ' + ) + + yaml = """- name: {docker_name} + image: {docker_image} + {image_pull_policy} + env: {env} + ports: + {container_ports} + {jax_coordinator_port} + securityContext: + privileged: true + command: + - bash + - -c + - | + echo XPK Start: $(date); + _sigterm() (kill -SIGTERM $! 2>/dev/null;); + trap _sigterm SIGTERM; + {gsutil_test_command} + ({command}) & PID=$!; + while kill -0 $PID 2>/dev/null; + do sleep 5; + done; + wait $PID; + EXIT_CODE=$?; + {xpk_internal_commands} + echo XPK End: $(date); + echo EXIT_CODE=$EXIT_CODE; + {tpu_stacktrace_terminate_command} + {gpu_workload_terminate_command} + exit $EXIT_CODE + resources: + limits: + {resources} +""" + volume_mounts = get_volume_mounts(args, system) + if volume_mounts != '': + yaml += """ + volumeMounts: + {volume_mounts} +""" + return yaml.format( + args=args, + system=system, + image_pull_policy=add_image_pull_policy_for_pw_or_gpu(args, system), + env=get_env_container(args, system), + container_ports=add_container_ports(args, system), + jax_coordinator_port=add_jax_coordinator_port(system), + docker_name=get_main_container_docker_image(args, system), + docker_image=docker_image, + gsutil_test_command=gsutil_test_command, + command=command, + tpu_stacktrace_terminate_command=tpu_stacktrace_terminate_command, + gpu_workload_terminate_command=gpu_workload_terminate_command, + xpk_internal_commands=xpk_internal_commands, + resources=get_main_container_resources(args, system, resource_type), + volume_mounts=volume_mounts, + ) + + +def get_user_workload_container(args, system: SystemCharacteristics): + """Deploy user workload container + + Args: + args: user provided args. + system: system characteristics. + + Returns: + container: main container + debugging_dashboard_id: id of the GKE dashboard + """ + + setup_docker_image_code, docker_image = setup_docker_image(args) + if setup_docker_image_code != 0: + xpk_exit(setup_docker_image_code) + + # Determine if we deploy a sidecar and if we deploy a container. + debugging_dashboard_id = None + resource_type = AcceleratorTypeToAcceleratorCharacteristics[ + system.accelerator_type + ].resource_type + if ( + not args.use_pathways + and system.accelerator_type == AcceleratorType['TPU'] + and args.deploy_stacktrace_sidecar + ): + xpk_print( + 'Sidecar container to display stack traces for TPU workloads will also' + ' be deployed.' 
+    )
+    container = get_main_and_sidecar_container(args, system, docker_image)
+    # Get GKE debugging dashboard only when sidecar container is deployed for TPU workloads
+    debugging_dashboard_id = get_gke_debugging_dashboard(args)
+  else:
+    container = get_main_container(args, system, docker_image, resource_type)
+  return container, debugging_dashboard_id
+
+
+def get_main_container_docker_image(args, system: SystemCharacteristics) -> str:
+  """Docker name for the main container.
+  Args:
+    args: user provided args.
+    system: system characteristics.
+
+  Returns:
+    str:
+      Workload docker image as a YAML string
+  """
+
+  if system.accelerator_type == AcceleratorType['GPU']:
+    return 'gpu-image'
+
+  return f'{args.docker_name}'
diff --git a/src/xpk/core/docker_image.py b/src/xpk/core/docker_image.py
new file mode 100644
index 000000000..7425b0fd6
--- /dev/null
+++ b/src/xpk/core/docker_image.py
@@ -0,0 +1,210 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import datetime
+import os
+import random
+import string
+
+from ..utils.console import xpk_exit, xpk_print
+from ..utils.file import write_tmp_file
+from .commands import run_command_with_updates
+
+DEFAULT_DOCKER_IMAGE = 'python:3.10'
+DEFAULT_SCRIPT_DIR = os.getcwd()
+PLATFORM = 'linux/amd64'
+
+
+def validate_docker_image(docker_image, args) -> int:
+  """Validates that the user provided docker image exists in your project.
+
+  Args:
+    docker_image: The docker image to verify.
+    args: user provided arguments for running the command.
+
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+
+  project = args.project
+
+  if not any(repo in docker_image for repo in ['gcr.io', 'docker.pkg.dev']):
+    return 0
+
+  command = (
+      f'gcloud container images describe {docker_image} --project {project}'
+  )
+  return_code = run_command_with_updates(
+      command, 'Validate Docker Image', args, verbose=False
+  )
+  if return_code != 0:
+    xpk_print(
+        'Failed to validate your docker image, check that the docker image'
+        f' exists. You may be able to find the {docker_image} in {project}.'
+        ' If the docker image exists, the service account of this'
+        ' project may be missing the permissions to access the docker image.'
+    )
+    return return_code
+  else:
+    return 0
+
+
+def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
+  """Adds script dir to the base docker image and uploads the image.
+
+  Args:
+    args: user provided arguments for running the command.
+
+  Returns:
+    Tuple of:
+      0 if successful and 1 otherwise.
+      Name of the Docker image created.
+  """
+
+  # Pick a name for the docker image.
+  docker_image_prefix = os.getenv('USER', 'unknown')
+  docker_name = f'{docker_image_prefix}-runner'
+
+  script_dir_dockerfile = """FROM {base_docker_image}
+
+  # Set the working directory in the container
+  WORKDIR /app
+
+  # Copy all files from local workspace into docker container
+  COPY . .
+ + WORKDIR /app + """ + + docker_file = script_dir_dockerfile.format( + base_docker_image=args.base_docker_image, + ) + tmp = write_tmp_file(docker_file) + docker_build_command = ( + f'docker buildx build --platform={PLATFORM} -f {str(tmp.file.name)} -t' + f' {docker_name} {args.script_dir}' + ) + xpk_print(f'Building {args.script_dir} into docker image.') + return_code = run_command_with_updates( + docker_build_command, + 'Building script_dir into docker image', + args, + verbose=verbose, + ) + if return_code != 0: + xpk_print( + 'Failed to add script_dir to docker image, check the base docker image.' + f' You should be able to navigate to the URL {args.base_docker_image}' + f' in {args.project}.' + ) + xpk_exit(1) + + # Pick a randomly generated `tag_length` character docker tag. + tag_length = 4 + tag_random_prefix = ''.join( + random.choices(string.ascii_lowercase, k=tag_length) + ) + tag_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + tag_name = f'{tag_random_prefix}-{tag_datetime}' + cloud_docker_image = f'gcr.io/{args.project}/{docker_name}:{tag_name}' + xpk_print(f'Adding Docker Image: {cloud_docker_image} to {args.project}') + + # Tag the docker image. + tag_docker_image_command = f'docker tag {docker_name} {cloud_docker_image}' + return_code = run_command_with_updates( + tag_docker_image_command, 'Tag Docker Image', args, verbose=verbose + ) + if return_code != 0: + xpk_print( + f'Failed to tag docker image with tag: {tag_name}.' + f' You should be able to navigate to the URL {cloud_docker_image} in' + f' {args.project}.' + ) + xpk_exit(1) + + # Upload image to Artifact Registry. + upload_docker_image_command = f'docker push {cloud_docker_image}' + return_code = run_command_with_updates( + upload_docker_image_command, 'Upload Docker Image', args, verbose=verbose + ) + if return_code != 0: + xpk_print( + 'Failed to upload docker image.' + f' You should be able to navigate to the URL {cloud_docker_image} in' + f' {args.project}.' + ) + xpk_exit(1) + return return_code, cloud_docker_image + + +def setup_docker_image(args) -> tuple[int, str]: + """Does steps to verify docker args, check image, and build image (if asked). + + Args: + args: user provided arguments for running the command. + + Returns: + tuple: + 0 if successful and 1 otherwise. + Name of the docker image to use. + """ + use_base_docker_image = use_base_docker_image_or_docker_image(args) + + docker_image = args.base_docker_image + if use_base_docker_image: + validate_docker_image_code = validate_docker_image(docker_image, args) + if validate_docker_image_code != 0: + xpk_exit(validate_docker_image_code) + build_docker_image_code, docker_image = build_docker_image_from_base_image( + args + ) + if build_docker_image_code != 0: + xpk_exit(build_docker_image_code) + else: + docker_image = args.docker_image + validate_docker_image_code = validate_docker_image(args.docker_image, args) + if validate_docker_image_code != 0: + xpk_exit(validate_docker_image_code) + + return 0, docker_image + + +def use_base_docker_image_or_docker_image(args) -> bool: + """Checks for correct docker image arguments. + + Args: + args: user provided arguments for running the command. + + Returns: + True if intended to use base docker image, False to use docker image. + """ + use_base_docker_image = True + # Check if (base_docker_image and script_dir) or (docker_image) is set. 
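build_docker_image_from_base_image above derives the pushed image name from the user name, a random four-letter prefix, and a timestamp. A sketch of the naming scheme with a hypothetical user and project:

import datetime
import random
import string

docker_name = 'alice-runner'  # f'{USER}-runner', hypothetical user
tag_random_prefix = ''.join(random.choices(string.ascii_lowercase, k=4))
tag_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
tag_name = f'{tag_random_prefix}-{tag_datetime}'
cloud_docker_image = f'gcr.io/my-project/{docker_name}:{tag_name}'
# e.g. gcr.io/my-project/alice-runner:qzrw-2025-01-30-12-00-00
print(cloud_docker_image)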
+  if args.docker_image is not None:
+    if args.script_dir is not DEFAULT_SCRIPT_DIR:
+      xpk_print(
+          '`--script-dir` and `--docker-image` cannot be used together. Please'
+          ' see the `--help` command for more details.'
+      )
+      xpk_exit(1)
+    if args.base_docker_image is not DEFAULT_DOCKER_IMAGE:
+      xpk_print(
+          '`--base-docker-image` and `--docker-image` cannot be used together.'
+          ' Please see the `--help` command for more details.'
+      )
+      xpk_exit(1)
+    use_base_docker_image = False
+  return use_base_docker_image
diff --git a/src/xpk/core/docker_resources.py b/src/xpk/core/docker_resources.py
new file mode 100644
index 000000000..11b67b233
--- /dev/null
+++ b/src/xpk/core/docker_resources.py
@@ -0,0 +1,339 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
+from .cluster import setup_k8s_env
+from .storage import GCS_FUSE_TYPE, Storage, get_storages_to_mount
+from .system_characteristics import AcceleratorType, SystemCharacteristics
+
+
+def get_main_container_resources(
+    args, system: SystemCharacteristics, resource_type
+) -> str:
+  """Resources for the main container.
+  Args:
+    args: user provided args.
+    system: system characteristics.
+    resource_type: TPU / GPU / CPU
+
+  Returns:
+    str:
+      Workload resource limits as a YAML string
+  """
+  # Resources requirements for Pathways workload containers are known.
+  resources_yaml = """cpu: "24"
+                memory: 100G"""
+  if args.use_pathways:
+    return resources_yaml
+
+  gpu_resources_yaml = """nvidia.com/gpu: {system.chips_per_vm}"""
+  if system.accelerator_type == AcceleratorType['GPU']:
+    return gpu_resources_yaml.format(system=system)
+
+  if system.accelerator_type == AcceleratorType['CPU']:
+    # CPUs don't have chips, but have a subresource called vCPUs.
+    # system.chips_per_vm is used as a proxy for vCPUs.
+    # Some vCPUs get used in hosting system pods of the workloads,
+    # hence an offset of 0.95 is introduced.
+    offset_vCPUs = int(system.chips_per_vm) * 0.95
+    return f'{resource_type}: {offset_vCPUs}'
+
+  return f'{resource_type}: {system.chips_per_vm}'
+
+
+def get_env_container(args, system: SystemCharacteristics) -> str:
+  """Environment configuration for the main container.
+  Args:
+    args: user provided args.
+    system: system characteristics.
+
+  Returns:
+    str:
+      YAML with the env config for the main container, as a YAML string.
+ """ + pw_env_yaml = """ + - name: XCLOUD_ENVIRONMENT + value: GCP + - name: JAX_PLATFORMS + value: proxy + - name: JAX_BACKEND_TARGET + value: {proxy_address} + - name: JOBSET_NAME + valueFrom: + fieldRef: + fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']""" + if args.use_pathways: + return pw_env_yaml.format( + args=args, proxy_address=args.pathways_proxy_address + ) + + gpu_env_yaml = """ + - name: REPLICATED_JOB_NAME + valueFrom: + fieldRef: + fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name'] + - name: JOBSET_NAME + valueFrom: + fieldRef: + fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name'] + - name: JAX_COORDINATOR_ADDRESS + value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)" + - name: NNODES + value: "{args.num_nodes}" + - name: NODE_RANK + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: USE_GPUDIRECT + value: {gpu_direct_name} + - name: GPUS_PER_NODE + value: "{system.chips_per_vm}" + - name: JAX_COORDINATOR_PORT + value: "6002" + - name: COMMAND + value: "{args.command}" + {args.env}""" + + if system.accelerator_type == AcceleratorType['GPU']: + gpu_direct_name = 'fastrak' + if args.device_type == H100_DEVICE_TYPE: + gpu_direct_name = 'tcpx' + gpu_env_yaml += """ + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 +""" + elif args.device_type == H100_MEGA_DEVICE_TYPE: + gpu_direct_name = 'tcpxo' + elif args.device_type == H200_DEVICE_TYPE: + gpu_direct_name = 'rdma' + return gpu_env_yaml.format( + args=args, system=system, gpu_direct_name=gpu_direct_name + ) + + if system.accelerator_type == AcceleratorType['CPU']: + return get_cpu_env(args.num_slices, args.env, system) + + return args.env # pytype: disable=bad-return-type + + +def get_cpu_env(num_slices, env_vars, system) -> str: + """Generate environment variables for CPU nodepools + Args: + num_slices: Number of slices to be used in the workload. + env_vars: Environment variables, processed from user args. + system: system characteristics + + Returns: + str: yaml containing env variables + """ + yaml = """ + - name: REPLICATED_JOB_NAME + valueFrom: + fieldRef: + fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name'] + - name: JOB_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['jobset.sigs.k8s.io/job-index'] + - name: JOB_COMPLETION_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: PROCESSES_IN_JOB + value: "{processes_in_job}" + - name: JAX_PROCESS_COUNT + value: "{process_count}" + {env_vars} + - name: JAX_COORDINATOR_ADDRESS + value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)" + """ + return yaml.format( + processes_in_job=system.vms_per_slice, + process_count=calculate_process_count(num_slices, system.vms_per_slice), + env_vars=env_vars, + ) + + +def get_volumes(args, system: SystemCharacteristics) -> str: + """Get volumes accessible to the containers in the pod. + Args: + args: user provided args. + system: system characteristics. + + Returns: + str: + YAML for the volumes. 
+ """ + volumes = """- emptyDir: + medium: Memory + name: dshm-2 + """ + + if args.ramdisk_directory != '': + volumes += """ + - name: cache + csi: + driver: phase1-checkpoint.csi.storage.gke.io""" + + if ( + system.accelerator_type == AcceleratorType['TPU'] + and args.deploy_stacktrace_sidecar + ): + volumes += """ + - name: tpu-stack-trace + - name: shared-data + """ + + storages: list[Storage] = get_storages_to_mount( + setup_k8s_env(args), args.storage + ) + for storage in storages: + if storage.type == GCS_FUSE_TYPE: + volumes += f"""- name: {storage.pv} + persistentVolumeClaim: + claimName: {storage.pvc} + readOnly: {storage.readonly} + """ + return volumes + + +def get_volume_mounts(args, system: SystemCharacteristics) -> str: + """Resources for the main container. + Args: + args: user provided args. + + Returns: + str: + YAML for the volumes mounted within a Pathways container or GPU container as a YAML string. + """ + volume_mount_yaml = """- mountPath: /dev/shm + name: dshm-2 + """ + + if args.ramdisk_directory != '': + volume_mount_yaml += f""" + - mountPath: /{args.ramdisk_directory} + name: cache""" + + if args.use_pathways: + volume_mount_yaml = """- mountPath: /tmp + name: shared-tmp + """ + elif ( + system.accelerator_type == AcceleratorType['TPU'] + and args.deploy_stacktrace_sidecar + ): + volume_mount_yaml += """- name: tpu-stack-trace + mountPath: /tmp/debugging + - name: shared-data + mountPath: /shared-volume + """ + elif system.accelerator_type == AcceleratorType['GPU']: + if system.device_type == H100_DEVICE_TYPE: + volume_mount_yaml = """- name: nvidia-install-dir-host + mountPath: /usr/local/nvidia/lib64 + - name: tcpx-nccl-plugin-volume + mountPath: /usr/local/tcpx + - name: tcpd-socket + mountPath: /tmp + - name: shared-memory + mountPath: /dev/shm + - name: workload-terminated-volume + mountPath: /usr/share/workload""" + elif ( + system.device_type == H100_MEGA_DEVICE_TYPE + or system.device_type == H200_DEVICE_TYPE + ): + volume_mount_yaml = '' + + storages: list[Storage] = get_storages_to_mount( + setup_k8s_env(args), args.storage + ) + for storage in storages: + if storage.type == GCS_FUSE_TYPE: + volume_mount_yaml += f"""- name: {storage.pv} + mountPath: {storage.mount_point} + readOnly: {storage.readonly} + """ + return volume_mount_yaml + + +def calculate_process_count(num_slices, vms_per_slice) -> str: + """Calculates the total number of processes in the workload. + Args: + num_slices: Number of slices to be used in the workload. + vms_per_slice: number of VMs in each slice. + + Returns: + str: total number of processes. + """ + num_processes = int(num_slices) * int(vms_per_slice) + + return f'{num_processes}' + + +def add_container_ports(args, system: SystemCharacteristics) -> str: + """Add slice builder and megascale container ports, + for non-pathways workloads. + + Args: + args: user provided args. + + Returns: + str: + Pathways server port as a YAML string + """ + port_yaml = """- containerPort: 8471 + - containerPort: 8080""" + if args.use_pathways: + return '' + + gpu_port_yaml = """- containerPort: 6002""" + if system.accelerator_type == AcceleratorType['GPU']: + return gpu_port_yaml + return port_yaml + + +def add_jax_coordinator_port(system) -> str: + """Add jax coordinator port only for CPUs + + Args: + system: system characteristics. 
+
+  Returns:
+    str:
+      jax coordinator port as a YAML string
+  """
+  if system.accelerator_type == AcceleratorType['CPU']:
+    return '- containerPort: 1234'
+  return ''
+
+
+def add_image_pull_policy_for_pw_or_gpu(args, system: SystemCharacteristics):
+  """Add image pull policy for Pathways and GPU containers.
+  Args:
+    args: user provided args.
+    system: system characteristics
+
+  Returns:
+    str:
+      YAML stating that the image will be pulled from GCR every time.
+  """
+  yaml = """imagePullPolicy: Always"""
+
+  if args.use_pathways or system.accelerator_type == AcceleratorType['GPU']:
+    return yaml.format(args=args)
+  return ''
diff --git a/src/xpk/core/gcloud_context.py b/src/xpk/core/gcloud_context.py
new file mode 100644
index 000000000..c1e386b85
--- /dev/null
+++ b/src/xpk/core/gcloud_context.py
@@ -0,0 +1,196 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import subprocess
+import sys
+from dataclasses import dataclass
+
+from ..utils.console import xpk_print
+from .commands import run_command_for_value
+
+
+def get_project():
+  """Get GCE project from `gcloud config get project`.
+
+  Returns:
+    The project name.
+  """
+  completed_command = subprocess.run(
+      ['gcloud', 'config', 'get', 'project'], check=True, capture_output=True
+  )
+  project_outputs = completed_command.stdout.decode().strip().split('\n')
+  if len(project_outputs) < 1 or project_outputs[-1] == '':
+    sys.exit(
+        'You must specify the project in the project flag or set it with'
+        " 'gcloud config set project <project>'"
+    )
+  return project_outputs[
+      -1
+  ]  # The project name lives on the last line of the output
+
+
+def get_zone():
+  """Get GCE zone from `gcloud config get compute/zone`.
+
+  Returns:
+    The zone name.
+  """
+  completed_command = subprocess.run(
+      ['gcloud', 'config', 'get', 'compute/zone'],
+      check=True,
+      capture_output=True,
+  )
+  zone_outputs = completed_command.stdout.decode().strip().split('\n')
+  if len(zone_outputs) < 1 or zone_outputs[-1] == '':
+    sys.exit(
+        "You must specify the zone in the zone flag or set it with 'gcloud"
+        " config set compute/zone <zone>'"
+    )
+  return zone_outputs[-1]  # The zone name lives on the last line of the output
+
+
+def add_zone_and_project(args):
+  """Obtains the zone and project names from gcloud configs if not defined.
+
+  Args:
+    args: user provided arguments for running the command.
+  """
+  if not args.project:
+    args.project = get_project()
+  if not args.zone:
+    args.zone = get_zone()
+  xpk_print(f'Working on {args.project} and {args.zone}')
+
+
+def zone_to_region(zone) -> str:
+  """Helper function converts zone name to region name.
+
+  Args:
+    zone: zone name.
+
+  Returns:
+    The region name.
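gcloud_context.py centralizes how xpk resolves its GCP coordinates: add_zone_and_project falls back to `gcloud config get` for missing flags, and zone_to_region simply drops the zone suffix. A sketch of the region derivation:

def zone_to_region(zone: str) -> str:
  # Keep the first two dash-separated terms: 'us-central2-b' -> 'us-central2'.
  zone_terms = zone.split('-')
  return zone_terms[0] + '-' + zone_terms[1]

assert zone_to_region('us-central2-b') == 'us-central2'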
+ """ + zone_terms = zone.split('-') + return zone_terms[0] + '-' + zone_terms[1] # pytype: disable=bad-return-type + + +@dataclass +class GkeServerConfig: + """Stores the valid gke versions based on gcloud recommendations.""" + + default_rapid_gke_version: str + valid_versions: set[str] + + +def get_gke_server_config(args) -> tuple[int, GkeServerConfig | None]: + """Determine the GKE versions supported by gcloud currently. + + Args: + args: user provided arguments for running the command. + + Returns: + Tuple of + int: 0 if successful and 1 otherwise. + GkeServerConfig: stores valid gke version to use in node pool and cluster. + """ + base_command = ( + 'gcloud container get-server-config' + f' --project={args.project} --region={zone_to_region(args.zone)}' + ) + default_rapid_gke_version_cmd = ( + base_command + + ' --flatten="channels" --filter="channels.channel=RAPID"' + ' --format="value(channels.defaultVersion)"' + ) + valid_versions_cmd = ( + base_command + + ' --flatten="channels" --filter="channels.channel=RAPID"' + ' --format="value(channels.validVersions)"' + ) + base_command_description = 'Determine server supported GKE versions for ' + + server_config_commands_and_descriptions = [ + ( + default_rapid_gke_version_cmd, + base_command_description + 'default rapid gke version', + ), + ( + valid_versions_cmd, + base_command_description + 'valid versions', + ), + ] + command_outputs = [] + + for command, command_description in server_config_commands_and_descriptions: + return_code, cmd_output = run_command_for_value( + command, + command_description, + args, + hide_error=True, + ) + if return_code != 0: + xpk_print(f'Unable to get server config for {command_description}.') + return return_code, None + command_outputs.append(cmd_output) + + return 0, GkeServerConfig( + default_rapid_gke_version=command_outputs[0].strip(), + valid_versions=set(command_outputs[1].split(';')), + ) + + +def get_gke_control_plane_version( + args, gke_server_config: GkeServerConfig +) -> tuple[int, str | None]: + """Determine gke control plane version for cluster creation. + + Args: + args: user provided arguments for running the command. + gke_server_config: holds valid gke versions and recommended default version. + + Returns: + Tuple of + int: 0 if successful and 1 otherwise. + str: gke control plane version to use. + """ + + # Override with user provide gke version if specified. + if args.gke_version is not None: + master_gke_version = args.gke_version + else: + master_gke_version = gke_server_config.default_rapid_gke_version + + is_valid_version = master_gke_version in gke_server_config.valid_versions + + if not is_valid_version: + xpk_print( + f'Planned GKE Version: {master_gke_version}\n Valid Versions:' + f'\n{gke_server_config.valid_versions}\nRecommended / Default GKE' + f' Version: {gke_server_config.default_rapid_gke_version}' + ) + xpk_print( + f'Error: Planned GKE Version {master_gke_version} is not valid.' 
+        f' Checks failed: Is Version Valid: {is_valid_version}'
+    )
+    xpk_print(
+        'Please select a gke version from the above list using --gke-version=x'
+        ' argument or rely on the default gke version:'
+        f' {gke_server_config.default_rapid_gke_version}'
+    )
+    return 1, None
+
+  return 0, master_gke_version
diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py
index 3191c7166..978e67aaa 100644
--- a/src/xpk/core/kueue.py
+++ b/src/xpk/core/kueue.py
@@ -15,18 +15,24 @@
 """
 
 from argparse import Namespace
-from packaging.version import Version
+
 import packaging
+from packaging.version import Version
+
+from ..utils.console import xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
-from ..utils.console import xpk_print, xpk_exit
-from .commands import run_command_with_updates, run_command_with_updates_retry, run_command_for_value
-from .core import (
-    AutoprovisioningConfig,
+from .commands import (
+    run_command_for_value,
+    run_command_with_updates,
+    run_command_with_updates_retry,
+)
+from .pathways import add_pw_resource_flavors, add_pw_resources_to_kueue
+from .resources import AutoprovisioningConfig
+from .scheduling import (
     create_accelerator_label,
     create_machine_label,
     get_total_chips_requested_from_args,
 )
-from .pathways import add_pw_resource_flavors, add_pw_resources_to_kueue
 from .system_characteristics import (
     AcceleratorTypeToAcceleratorCharacteristics,
     SystemCharacteristics,
diff --git a/src/xpk/core/monitoring.py b/src/xpk/core/monitoring.py
new file mode 100644
index 000000000..a1a791824
--- /dev/null
+++ b/src/xpk/core/monitoring.py
@@ -0,0 +1,134 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from ..utils.console import xpk_print
+from .commands import run_command_for_value
+
+
+def get_gke_dashboard(args, dashboard_filter) -> tuple[bool, str | None]:
+  """Get the identifier of GKE dashboard deployed in the project.
+
+  Args:
+    args: user provided arguments for running the command.
+
+  Returns:
+    bool:
+      True if 'gcloud monitoring dashboards list' returned an error or
+      multiple dashboards with the same filter exist in the project,
+      False otherwise.
+    str:
+      identifier of dashboard if deployed in project,
+      None otherwise.
+  """
+  command = (
+      'gcloud monitoring dashboards list'
+      f' --project={args.project} --filter="{dashboard_filter}"'
+      ' --format="value(name)" --verbosity=error'
+  )
+
+  return_code, return_value = run_command_for_value(
+      command, 'GKE Dashboard List', args
+  )
+
+  if return_code != 0:
+    xpk_print(
+        f'GKE Dashboard List request returned ERROR {return_code}. If there is'
+        ' a permissions error, please check'
+        ' https://github.com/google/xpk/blob/main/README.md#roles-needed-based-on-permission-errors'
+        ' for possible solutions.'
+    )
+    return True, None
+
+  if not return_value:
+    xpk_print(
+        f'No dashboard with {dashboard_filter} found in the'
+        f' project:{args.project}.'
+ ) + return False, return_value + + dashboards = return_value.strip().split('\n') + if len(dashboards) > 1: + xpk_print( + f'Multiple dashboards with same {dashboard_filter} exist in the' + f' project:{args.project}. Delete all but one dashboard deployed using' + ' https://github.com/google/cloud-tpu-monitoring-debugging.' + ) + return True, None + + if dashboards[0]: + return False, dashboards[0].strip().split('/')[-1] + + return True, None + + +def get_gke_outlier_dashboard(args) -> str | None: + """Get the identifier of GKE outlier dashboard deployed in the project. + + Args: + args: user provided arguments for running the command. + + Returns: + str: + identifier of outlier dashboard if deployed in project, + None otherwise. + """ + outlier_dashboard_filter = "displayName:'GKE - TPU Monitoring Dashboard'" + is_error, dashboard_id = get_gke_dashboard(args, outlier_dashboard_filter) + + # 'gcloud monitoring dashboards list' returned an error or multiple dashboards with same filter exist in the project + if is_error: + return None + + # 'gcloud monitoring dashboards list' succeeded but no dashboard for the filter exist in the project + if not is_error and not dashboard_id: + xpk_print( + 'Follow https://github.com/google/cloud-tpu-monitoring-debugging to' + ' deploy monitoring dashboard to view statistics and outlier mode of' + ' GKE metrics.' + ) + return None + + return str(dashboard_id) + + +def get_gke_debugging_dashboard(args) -> str | None: + """Get the identifier of GKE debugging dashboard deployed in the project. + + Args: + args: user provided arguments for running the command. + + Returns: + str: + identifier of debugging dashboard if deployed in project, + None otherwise. + """ + debugging_dashboard_filter = "displayName:'GKE - TPU Logging Dashboard'" + is_error, dashboard_id = get_gke_dashboard(args, debugging_dashboard_filter) + + # 'gcloud monitoring dashboards list' returned an error or multiple dashboards with same filter exist in the project + if is_error: + return None + + # 'gcloud monitoring dashboards list' succeeded but no dashboard for the filter exist in the project + if not is_error and not dashboard_id: + xpk_print( + 'Follow https://github.com/google/cloud-tpu-monitoring-debugging to' + ' deploy debugging dashboard to view stack traces collected in Cloud' + ' Logging.' + ) + return None + + return str(dashboard_id) diff --git a/src/xpk/core/nap.py b/src/xpk/core/nap.py index 666dbc036..ff35fd3c7 100644 --- a/src/xpk/core/nap.py +++ b/src/xpk/core/nap.py @@ -14,29 +14,31 @@ limitations under the License. 
""" -from ..core.core import ( +from ..utils.console import xpk_print +from ..utils.file import write_tmp_file +from ..utils.objects import get_value_from_map +from .capacity import ( AUTOPROVISIONING_CONFIG_VALUE, CAPACITY_TYPE_CONFIG_KEY, - CLUSTER_METADATA_CONFIGMAP, - CLUSTER_RESOURCES_CONFIGMAP, RESERVATION_CONFIG_KEY, - AutoprovisioningConfig, CapacityType, - get_all_nodepools_programmatic, get_capacity_node_selectors_from_capacity_type, get_capacity_type, - get_cluster_configmap, - get_total_chips_requested_from_args, verify_reservation_exists, - zone_to_region, ) -from ..utils.objects import get_value_from_map -from ..utils.file import write_tmp_file -from ..utils.console import xpk_print from .commands import run_command_with_updates, run_commands +from .gcloud_context import zone_to_region +from .nodepool import get_all_nodepools_programmatic +from .resources import ( + CLUSTER_METADATA_CONFIGMAP, + CLUSTER_RESOURCES_CONFIGMAP, + AutoprovisioningConfig, + get_cluster_configmap, +) +from .scheduling import get_total_chips_requested_from_args from .system_characteristics import AcceleratorType, SystemCharacteristics -autoprovisioning_config_file = """ +AUTOPROVISIONING_CONFIG_FILE = """ management: autoRepair: true autoUpgrade: true @@ -44,8 +46,7 @@ {zones} {resource_limits} """ - -autoprovisioning_resource_limits = """ +AUTOPROVISIONING_RESOURCE_LIMITS = """ resourceLimits: - resourceType: 'cpu' {cpu_limits} @@ -53,8 +54,7 @@ {memory_limits} {custom_resource_type} """ - -autoprovisioning_custom_resource_type = """ +AUTOPROVISIONING_CUSTOM_RESOURCE_TYPE = """ - resourceType: {resource_type} minimum: {minimum} maximum: {maximum} @@ -218,19 +218,19 @@ def create_autoprovisioning_config( ' small, rescaling will not work well.' ) - custom_resource_string = autoprovisioning_custom_resource_type.format( + custom_resource_string = AUTOPROVISIONING_CUSTOM_RESOURCE_TYPE.format( resource_type=system.gke_accelerator, minimum=minimum, maximum=maximum, ) - resource_limits = autoprovisioning_resource_limits.format( + resource_limits = AUTOPROVISIONING_RESOURCE_LIMITS.format( cpu_limits=cpu_limits, memory_limits=memory_limits, custom_resource_type=custom_resource_string, ) - yml_string = autoprovisioning_config_file.format( + yml_string = AUTOPROVISIONING_CONFIG_FILE.format( resource_limits=resource_limits, zones=f'- {args.zone}', ) @@ -347,3 +347,15 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]: return node_selector_args, return_code return node_selector_args, return_code + + +def get_cluster_provisioner(args) -> str: + metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}' + cluster_config_map = get_cluster_configmap(args, metadata_configmap_name) + cluster_provisioner = 'gcloud' + if not cluster_config_map is None: + provisioner = cluster_config_map.get('provisioner') + if not provisioner is None: + cluster_provisioner = provisioner + xpk_print(f'Cluster provisioner: {cluster_provisioner}') + return cluster_provisioner diff --git a/src/xpk/core/network.py b/src/xpk/core/network.py new file mode 100644 index 000000000..1d69d01f8 --- /dev/null +++ b/src/xpk/core/network.py @@ -0,0 +1,367 @@ +""" +Copyright 2025 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from ..utils.console import xpk_print +from ..utils.file import write_tmp_file +from .capacity import H100_DEVICE_TYPE +from .commands import run_command_for_value, run_command_with_updates +from .gcloud_context import zone_to_region +from .system_characteristics import SystemCharacteristics + +# cluster_network_yaml: the config when creating the network for a3 cluster +CLUSTER_NETWORK_YAML = """ +apiVersion: networking.gke.io/v1 +kind: Network +metadata: + name: vpc1 +spec: + parametersRef: + group: networking.gke.io + kind: GKENetworkParamSet + name: vpc1 + type: Device +--- +apiVersion: networking.gke.io/v1 +kind: Network +metadata: + name: vpc2 +spec: + parametersRef: + group: networking.gke.io + kind: GKENetworkParamSet + name: vpc2 + type: Device +--- +apiVersion: networking.gke.io/v1 +kind: Network +metadata: + name: vpc3 +spec: + parametersRef: + group: networking.gke.io + kind: GKENetworkParamSet + name: vpc3 + type: Device +--- +apiVersion: networking.gke.io/v1 +kind: Network +metadata: + name: vpc4 +spec: + parametersRef: + group: networking.gke.io + kind: GKENetworkParamSet + name: vpc4 + type: Device +--- +apiVersion: networking.gke.io/v1 +kind: GKENetworkParamSet +metadata: + name: vpc1 +spec: + vpc: {cluster_name}-net-1 + vpcSubnet: {cluster_name}-sub-1 + deviceMode: NetDevice +--- +apiVersion: networking.gke.io/v1 +kind: GKENetworkParamSet +metadata: + name: vpc2 +spec: + vpc: {cluster_name}-net-2 + vpcSubnet: {cluster_name}-sub-2 + deviceMode: NetDevice +--- +apiVersion: networking.gke.io/v1 +kind: GKENetworkParamSet +metadata: + name: vpc3 +spec: + vpc: {cluster_name}-net-3 + vpcSubnet: {cluster_name}-sub-3 + deviceMode: NetDevice +--- +apiVersion: networking.gke.io/v1 +kind: GKENetworkParamSet +metadata: + name: vpc4 +spec: + vpc: {cluster_name}-net-4 + vpcSubnet: {cluster_name}-sub-4 + deviceMode: NetDevice +""" + + +def create_cluster_network(args, index) -> int: + """Create one GKE Cluster network. + + Args: + args: user provided arguments for running the command. + index: index number for the network to be created. + + Returns: + 0 if successful and 1 otherwise. + """ + existing_network_names, return_code = get_all_networks_programmatic(args) + if return_code > 0: + xpk_print('Listing all networks failed!') + return return_code + + network_name = f'{args.cluster}-net-{index}' + if network_name not in existing_network_names: + command = ( + f'gcloud compute --project={args.project}' + f' networks create {network_name}' + ' --subnet-mode=custom --mtu=8244' + ) + return_code = run_command_with_updates( + command, 'Create Cluster Network', args, verbose=False + ) + + if return_code != 0: + xpk_print(f'Create Cluster Network request returned ERROR {return_code}') + return 1 + else: + xpk_print(f'Reusing existing network {network_name}') + + return 0 + + +def create_cluster_subnet(args, index) -> int: + """Create one GKE Cluster subnet. + + Args: + args: user provided arguments for running the command. + index: index number for the subnet to be created. + + Returns: + 0 if successful and 1 otherwise. 
+ """ + existing_subnet_names, return_code = get_all_subnets_programmatic(args) + if return_code > 0: + xpk_print('Listing all subnets failed!') + return return_code + subnet_name = f'{args.cluster}-{zone_to_region(args.zone)}-sub-{index}' + if subnet_name not in existing_subnet_names: + command = ( + f'gcloud compute --project={args.project}' + f' networks subnets create {subnet_name}' + f' --network={args.cluster}-net-{index}' + f' --region={zone_to_region(args.zone)} --range=192.168.{index}.0/24' + ) + return_code = run_command_with_updates( + command, 'Create Cluster Subnet', args, verbose=False + ) + + if return_code != 0: + xpk_print(f'Create Cluster Subnet request returned ERROR {return_code}') + return 1 + else: + xpk_print(f'Reusing existing subnet {subnet_name}') + + return 0 + + +def create_cluster_firewall_rule(args, index) -> int: + """Create one GKE Cluster firewall rule. + + Args: + args: user provided arguments for running the command. + index: index number for the firewall rule to be created. + + Returns: + 0 if successful and 1 otherwise. + """ + existing_firewall_rules_names, return_code = ( + get_all_firewall_rules_programmatic(args) + ) + if return_code > 0: + xpk_print('Listing all firewall rules failed!') + return return_code + firewall_rule_name = f'{args.cluster}-internal-{index}' + if firewall_rule_name not in existing_firewall_rules_names: + command = ( + f'gcloud compute --project={args.project} firewall-rules create' + f' {firewall_rule_name} --network={args.cluster}-net-{index} --action=ALLOW' + ' --rules=tcp:0-65535,udp:0-65535,icmp --source-ranges=192.168.0.0/16' + ) + return_code = run_command_with_updates( + command, 'Create Cluster Firewall Rule', args, verbose=False + ) + + if return_code != 0: + xpk_print( + f'Create Cluster Firewall Rule request returned ERROR {return_code}' + ) + return 1 + else: + xpk_print(f'Reusing existing firewall rule {firewall_rule_name}') + return 0 + + +def create_cluster_network_config(args) -> int: + """Run the Create GKE Cluster Network Config request. + + Args: + args: user provided arguments for running the command. + + Returns: + 0 if successful and 1 otherwise. + """ + yml_string = CLUSTER_NETWORK_YAML.format(cluster_name=args.cluster) + tmp = write_tmp_file(yml_string) + command = f'kubectl apply -f {str(tmp.file.name)}' + + return_code = run_command_with_updates( + command, 'GKE Cluster Create Network Config', args + ) + if return_code != 0: + xpk_print( + f'GKE Cluster Create ConfigMap request returned ERROR {return_code}' + ) + return 1 + + return 0 + + +def set_up_cluster_network_for_gpu(args, system: SystemCharacteristics) -> int: + """Set up GKE Cluster networks, subnets and firewall rules for A3/A3+. + Note: there are 4 NICs for GPU-GPU bw and 1 NIC for host in an A3 node, + and there are 8 NICs for GPU-GPU bw and 1 NIC for host in an A3+ node. + + Args: + args: user provided arguments for running the command. + system: system characteristics. + + Returns: + 0 if successful and 1 otherwise. + """ + num_networks = 5 if system.device_type == H100_DEVICE_TYPE else 9 + for i in range(1, num_networks): + return_code = create_cluster_network(args, i) + if return_code != 0: + return 1 + return_code = create_cluster_subnet(args, i) + if return_code != 0: + return 1 + return_code = create_cluster_firewall_rule(args, i) + if return_code != 0: + return 1 + return 0 + + +def delete_cluster_subnets(args) -> int: + """Delete GKE Cluster subnets. + + Args: + args: user provided arguments for running the command. 
+ + Returns: + 0 if successful and 1 otherwise. + """ + existing_subnet_names, return_code = get_all_subnets_programmatic(args) + if return_code > 0: + xpk_print('Listing all subnets failed!') + return return_code + + for subnet_name in existing_subnet_names: + command = ( + f'gcloud compute networks subnets delete {subnet_name}' + f' --region={zone_to_region(args.zone)} --project={args.project} --quiet' + ) + + return_code = run_command_with_updates( + command, 'Delete Cluster Subnet', args, verbose=False + ) + + if return_code != 0: + xpk_print(f'Delete Cluster Subnet request returned ERROR {return_code}') + return 1 + else: + xpk_print(f'Deleted existing subnet {subnet_name}') + + return 0 + + +def get_all_networks_programmatic(args) -> tuple[list[str], int]: + """Gets all the networks associated with project . + + Args: + args: user provided arguments for running the command. + + Returns: + List of networks and 0 if successful and 1 otherwise. + """ + command = 'gcloud compute networks list --format="csv[no-heading](name)"' + return_code, raw_network_output = run_command_for_value( + command, 'Get All Networks', args + ) + if return_code != 0: + xpk_print(f'Get All Networks returned ERROR {return_code}') + return [], 1 + + return raw_network_output.splitlines(), 0 + + +def get_all_subnets_programmatic(args) -> tuple[list[str], int]: + """Gets all the subnets associated with the project. + + Args: + args: user provided arguments for running the command. + + Returns: + List of subnets and 0 if successful and 1 otherwise. + """ + subnet_name_filter = f'{args.cluster}-{zone_to_region(args.zone)}-sub-*' + + command = ( + 'gcloud compute networks subnets list' + f' --filter=name~"{subnet_name_filter}" --project={args.project}' + ) + return_code, raw_subnets_output = run_command_for_value( + command, 'Get All Subnets', args + ) + if return_code != 0: + xpk_print(f'Get All Subnets returned ERROR {return_code}') + return [], 1 + + all_outputs = raw_subnets_output.splitlines() + all_networks = [ + all_outputs[i].split(' ')[0] for i in range(1, len(all_outputs)) + ] + return all_networks, 0 + + +def get_all_firewall_rules_programmatic(args) -> tuple[list[str], int]: + """Gets all the firewall rules associated with the project. + + Args: + args: user provided arguments for running the command. + + Returns: + List of firewall rules and 0 if successful and 1 otherwise. + """ + command = ( + 'gcloud compute firewall-rules list --format="csv[no-heading](name)"' + ) + return_code, raw_subnets_output = run_command_for_value( + command, 'Get All Firewall Rules', args + ) + if return_code != 0: + xpk_print(f'Get All Firewall Rules returned ERROR {return_code}') + return [], 1 + + return raw_subnets_output.splitlines(), 0 diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py new file mode 100644 index 000000000..b1ebddeb1 --- /dev/null +++ b/src/xpk/core/nodepool.py @@ -0,0 +1,581 @@ +""" +Copyright 2025 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from ..utils.console import get_user_input, xpk_print +from .capacity import ( + AUTOPROVISIONING_CONFIG_VALUE, + H100_MEGA_DEVICE_TYPE, + CapacityType, + get_capacity_arguments_from_capacity_type, + get_capacity_type, + print_reservations, +) +from .commands import run_command_for_value, run_commands +from .gcloud_context import GkeServerConfig, zone_to_region +from .resources import ( + CLUSTER_CONFIGMAP_YAML, + CLUSTER_RESOURCES_CONFIGMAP, + check_cluster_resources, + create_or_update_cluster_configmap, +) +from .system_characteristics import AcceleratorType + +CLOUD_PLATFORM_AUTH_SCOPE_URL = ( + '"https://www.googleapis.com/auth/cloud-platform"' +) + + +def run_gke_node_pool_create_command( + args, system, gke_node_pool_version +) -> int: + """Run the Create GKE Node Pool request. + + Args: + args: user provided arguments for running the command. + system: System characteristics based on device type/topology. + gke_node_pool_version: GKE version to use to create node pools. + + Returns: + 0 if successful and 1 otherwise. + """ + device_type = args.tpu_type if args.tpu_type else args.device_type + xpk_print( + f'Creating {args.num_slices} node pool or pools of {device_type}\n' + f'We assume that the underlying system is: {system}' + ) + existing_node_pool_names, return_code = get_all_nodepools_programmatic(args) + if return_code > 0: + xpk_print('Listing all node pools failed!') + return return_code + + capacity_type, return_code = get_capacity_type(args) + if return_code > 0: + xpk_print('Parsing capacity type failed!') + return return_code + if capacity_type == CapacityType.UNKNOWN: + return_code = print_reservations(args) + xpk_print( + 'ERROR: User needs to provide the capacity type. Please specify one of' + ' the following `--reservation=$RESERVATION_NAME`, `--on-demand`' + ' or `--spot`. See the above list of reservations to choose from.' + ) + if return_code > 0: + xpk_print('Listing all reservations failed!') + return_code = 1 + capacity_args, return_code = get_capacity_arguments_from_capacity_type( + args, capacity_type + ) + if return_code > 0: + xpk_print('Parsing capacity arguments failed!') + return return_code + + if system.accelerator_type == AcceleratorType['GPU']: + xpk_print( + f'Creating 1 node pool with {args.num_nodes} nodes of' + f' {system.device_type}\nUnderlyingly, we assume that means: {system}' + ) + desired_node_pool_names = [f'{args.cluster}-np-0'] + else: + xpk_print( + f'Creating {args.num_slices} node pool or pools of' + f' {system.device_type}\nUnderlyingly, we assume that means: {system}' + ) + desired_node_pool_names = [ + f'{args.cluster}-np-{slice_num}' for slice_num in range(args.num_slices) + ] + + node_pools_to_remain = [] + delete_commands = [] + delete_task_names = [] + node_pools_to_update_WI = [] + update_WI_commands = [] + update_WI_task_names = [] + if existing_node_pool_names: + return_code, existing_node_pool_zone = get_nodepool_zone( + args, existing_node_pool_names[0] + ) + if return_code != 0: + return 1 + + if existing_node_pool_zone and existing_node_pool_zone != args.zone: + xpk_print( + f'Cluster {args.cluster} already has nodepools in zone:' + f' {existing_node_pool_zone}. Use the same zone to update nodepools' + ' in the cluster.' 
+      )
+      return 1
+
+    node_pools_to_delete = get_node_pools_to_delete(
+        args, system, existing_node_pool_names, desired_node_pool_names
+    )
+    for node_pool_name in existing_node_pool_names:
+      if node_pool_name.find(f'{args.cluster}-np-') != 0:
+        continue
+
+      if node_pool_name in node_pools_to_delete:
+        command = (
+            'gcloud beta container node-pools delete'
+            f' {node_pool_name} --cluster={args.cluster}'
+            f' --zone={zone_to_region(args.zone)}'
+            f' --project={args.project} --quiet'
+        )
+        task = f'NodepoolDelete-{node_pool_name}'
+        delete_commands.append(command)
+        delete_task_names.append(task)
+      else:
+        node_pools_to_remain.append(node_pool_name)
+
+    # Workload Identity for existing nodepools
+    if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
+      for node_pool_name in existing_node_pool_names:
+        if node_pool_name not in node_pools_to_delete:
+          # Check if workload identity is not already enabled:
+          return_code, existing_node_pool_metadata_mode = (
+              get_nodepool_workload_metadata_mode(args, node_pool_name)
+          )
+          if return_code != 0:
+            return 1
+
+          if (
+              existing_node_pool_zone
+              and existing_node_pool_metadata_mode != 'GKE_METADATA'
+          ):
+            command = (
+                'gcloud container node-pools update'
+                f' {node_pool_name} --cluster={args.cluster}'
+                f' --zone={zone_to_region(args.zone)}'
+                f' --project={args.project} --quiet'
+                ' --workload-metadata=GKE_METADATA'
+            )
+            task = (
+                'Update nodepool with Workload Identity enabled'
+                f' {node_pool_name}'
+            )
+            update_WI_commands.append(command)
+            update_WI_task_names.append(task)
+            node_pools_to_update_WI.append(node_pool_name)
+
+  # Deletion of nodepools should happen before attempting to create new nodepools for the case
+  # when cluster is getting updated from 'x' device_type/gke_accelerator to 'y' device_type/gke_accelerator.
+  # In that case, '{args.cluster}-np-i' nodepool will be re-created for 'y' device_type/gke_accelerator.
+  if delete_commands:
+    will_delete = True
+    if node_pools_to_delete and not args.force:
+      will_delete = get_user_input(
+          f'Planning to delete {len(node_pools_to_delete)} node pools including'
+          f' {node_pools_to_delete}. \nDo you wish to delete: y (yes) / n'
+          ' (no):\n'
+      )
+    if not will_delete:
+      xpk_print(
+          'You have requested to not delete the existing nodepools in the'
+          ' cluster. There will be no change to the cluster.'
+      )
+      return 1
+
+    for i, command in enumerate(delete_commands):
+      xpk_print(
+          f'To complete {delete_task_names[i]} we are executing {command}'
+      )
+    max_return_code = run_commands(
+        delete_commands,
+        'Delete Nodepools',
+        delete_task_names,
+        dry_run=args.dry_run,
+    )
+    if max_return_code != 0:
+      xpk_print(f'Delete Nodepools returned ERROR {max_return_code}')
+      return 1
+
+  # Enable Workload Identity on existing Nodepools
+  if update_WI_commands:
+    will_update_WI = True
+    if node_pools_to_update_WI and not args.force:
+      will_update_WI = get_user_input(
+          'Planning to enable Workload Identity Federation on'
+          f' {len(node_pools_to_update_WI)} existing node pools including'
+          f' {node_pools_to_update_WI}. This immediately enables Workload'
+          ' Identity Federation for GKE for any workloads running in the node'
+          ' pool. Also, xpk does not support disabling Workload Identity on'
+          ' clusters that have it enabled already.\nDo you wish to update: y'
+          ' (yes) / n (no):\n'
+      )
+    if will_update_WI:
+      for i, command in enumerate(update_WI_commands):
+        xpk_print(
+            f'To complete {update_WI_task_names[i]} we are executing {command}'
+        )
+      max_return_code = run_commands(
+          update_WI_commands,
+          'Enable Workload Identity on existing Nodepools',
+          update_WI_task_names,
+          dry_run=args.dry_run,
+      )
+      if max_return_code != 0:
+        xpk_print(
+            'Enable Workload Identity on existing Nodepools returned ERROR'
+            f' {max_return_code}'
+        )
+        return 1
+
+  # Update the {args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP} ConfigMap to 'y': '0'
+  # and remove 'x' from the ConfigMap when cluster is getting updated from
+  # 'x' device_type/gke_accelerator to 'y' device_type/gke_accelerator.
+  if not node_pools_to_remain:
+    if args.enable_autoprovisioning:
+      resources_data = (
+          f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}'
+      )
+    else:
+      resources_data = f'{device_type}: "0"'
+    resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
+    resources_yml = CLUSTER_CONFIGMAP_YAML.format(
+        args=args, name=resources_configmap_name, data=resources_data
+    )
+    configmap_yml = {}
+    configmap_yml[resources_configmap_name] = resources_yml
+    return_code = create_or_update_cluster_configmap(configmap_yml)
+    if return_code != 0:
+      return 1
+
+  create_commands = []
+  create_task_names = []
+  for node_pool_name in desired_node_pool_names:
+    if node_pool_name in node_pools_to_remain:
+      continue
+    command = (
+        'gcloud beta container node-pools create'
+        f' {node_pool_name}'
+        f' --region={zone_to_region(args.zone)}'
+        f' --cluster={args.cluster}'
+        f' --project={args.project} --node-locations={args.zone}'
+        f' --machine-type={system.gce_machine_type}'
+        f' --host-maintenance-interval={args.host_maintenance_interval}'
+        f' {capacity_args}'
+        ' --enable-gvnic'
+        f' {args.custom_nodepool_arguments}'
+    )
+    if system.accelerator_type == AcceleratorType['TPU']:
+      command += f' --node-version={gke_node_pool_version}'
+      command += f' --num-nodes={system.vms_per_slice}'
+      command += ' --placement-type=COMPACT --max-pods-per-node 15'
+      command += (
+          f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
+      )
+      command += f' --tpu-topology={system.topology}'
+      command += f' {args.custom_tpu_nodepool_arguments}'
+    elif system.accelerator_type == AcceleratorType['GPU']:
+      subnet_prefix = f'{args.cluster}-{zone_to_region(args.zone)}'
+      command += f' --num-nodes={args.num_nodes}'
+      command += (
+          ' --accelerator'
+          f' type={system.gke_accelerator},count={str(system.chips_per_vm)},gpu-driver-version=latest'
+          ' --no-enable-autoupgrade '
+          f' --scopes={CLOUD_PLATFORM_AUTH_SCOPE_URL} --additional-node-network'
+          f' network={args.cluster}-net-1,subnetwork={subnet_prefix}-sub-1'
+          ' --additional-node-network'
+          f' network={args.cluster}-net-2,subnetwork={subnet_prefix}-sub-2'
+          ' --additional-node-network'
+          f' network={args.cluster}-net-3,subnetwork={subnet_prefix}-sub-3'
+          ' --additional-node-network'
+          f' network={args.cluster}-net-4,subnetwork={subnet_prefix}-sub-4'
+      )
+      if device_type == H100_MEGA_DEVICE_TYPE:
+        command += (
+            ' --additional-node-network'
+            f' network={args.cluster}-net-5,subnetwork={subnet_prefix}-sub-5'
+            ' --additional-node-network'
+            f' network={args.cluster}-net-6,subnetwork={subnet_prefix}-sub-6'
+            ' --additional-node-network'
+            f' network={args.cluster}-net-7,subnetwork={subnet_prefix}-sub-7'
+            ' --additional-node-network'
+            f' network={args.cluster}-net-8,subnetwork={subnet_prefix}-sub-8'
+            ' --max-pods-per-node=32'
+        )
+    elif system.accelerator_type == AcceleratorType['CPU']:
+      command += f' --num-nodes={system.vms_per_slice}'
+      command += (
+          f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
+      )
+
+    if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
+      command += ' --workload-metadata=GKE_METADATA'
+
+    task = f'NodepoolCreate-{node_pool_name}'
+    create_commands.append(command)
+    create_task_names.append(task)
+
+  desired_pw_cpu_node_pools = ['cpu-user-np', 'cpu-rm-np', 'cpu-proxy-np']
+  if args.enable_pathways:
+    # Pathways needs CPU nodepools in addition to TPU nodepools
+    for node_pool_name in desired_pw_cpu_node_pools:
+      if node_pool_name in existing_node_pool_names:
+        continue
+      command = (
+          'gcloud beta container node-pools create'
+          f' {node_pool_name} --node-version={gke_node_pool_version} --cluster={args.cluster} --project={args.project} --node-locations={args.zone} --region={zone_to_region(args.zone)} --num-nodes=1'
+          f' --machine-type={args.pathways_gce_machine_type} --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL} --enable-autoscaling'
+          ' --min-nodes=1 --max-nodes=20'
+      )
+      task = f'NodepoolCreate-{node_pool_name}'
+      create_commands.append(command)
+      create_task_names.append(task)
+
+  for i, command in enumerate(create_commands):
+    xpk_print(f'To complete {create_task_names[i]} we are executing {command}')
+  max_return_code = run_commands(
+      create_commands,
+      'Create Nodepools',
+      create_task_names,
+      dry_run=args.dry_run,
+  )
+  if max_return_code != 0:
+    xpk_print(f'Create Nodepools returned ERROR {max_return_code}')
+    return 1
+
+  xpk_print('Create or delete node pool request complete.')
+  return 0
+
+
+def get_node_pools_to_delete(
+    args, system, existing_node_pool_names, desired_node_pool_names
+) -> list:
+  """Get list of nodepools to delete from the cluster.
+
+  Args:
+    args: user provided arguments for running the command.
+    system: system characteristics.
+    existing_node_pool_names: names of nodepools that already exist in the cluster.
+    desired_node_pool_names: names of nodepools that should exist in the cluster.
+
+  Returns:
+    List of nodepool names to delete.
+  """
+  node_pools_to_delete = []
+  check_resource, is_requested_resource_in_cluster = check_cluster_resources(
+      args, system
+  )
+  for existing_node_pool_name in existing_node_pool_names:
+    # Deletion logic would leave behind any Pathways CPU nodepools.
+    if existing_node_pool_name.find(f'{args.cluster}-np-') != 0:
+      continue
+
+    # Nodepools will be deleted in two scenarios:
+    # Scenario 1: Cluster exists with 3 nodepools of 'x' device_type/gke_accelerator and now we are updating
+    #             the cluster to 2 nodepools of 'x' device_type/gke_accelerator. In this case, we will delete
+    #             '{args.cluster}-np-2' from the cluster.
+    # Scenario 2: Cluster exists with 2 nodepools of 'x' device_type/gke_accelerator and now we are updating
+    #             the cluster to 2 nodepools of 'y' device_type/gke_accelerator. In this case, we will delete
+    #             '{args.cluster}-np-0' and '{args.cluster}-np-1' from the cluster.
+    if existing_node_pool_name not in desired_node_pool_names or (
+        check_resource and not is_requested_resource_in_cluster
+    ):
+      node_pools_to_delete.append(existing_node_pool_name)
+
+  return node_pools_to_delete
+
+
+def get_all_nodepools_programmatic(args) -> tuple[list[str], int]:
+  """Gets all the nodepools associated with the cluster / project / region.
+
+  Args:
+    args: user provided arguments for running the command.
+
+  Returns:
+    List of nodepools and 0 if successful and 1 otherwise.
+  """
+  command = (
+      'gcloud beta container node-pools list'
+      ' --cluster'
+      f' {args.cluster} --project={args.project} --region={zone_to_region(args.zone)}'
+      ' --format="csv[no-heading](name)"'
+  )
+  return_code, raw_nodepool_output = run_command_for_value(
+      command, 'Get All Node Pools', args
+  )
+  if return_code != 0:
+    xpk_print(f'Get All Node Pools returned ERROR {return_code}')
+    return [], 1
+
+  return raw_nodepool_output.splitlines(), 0
+
+
+def get_nodepool_zone(args, nodepool_name) -> tuple[int, str | None]:
+  """Return zone in which nodepool exists in the cluster.
+
+  Args:
+    args: user provided arguments for running the command.
+    nodepool_name: name of nodepool.
+
+  Returns:
+    Tuple of int, str where
+    int is the return code - 0 if successful, 1 otherwise.
+    str is the zone of nodepool.
+  """
+  command = (
+      f'gcloud beta container node-pools describe {nodepool_name}'
+      f' --cluster {args.cluster} --project={args.project}'
+      f' --region={zone_to_region(args.zone)} --format="value(locations)"'
+  )
+  return_code, nodepool_zone = run_command_for_value(
+      command, 'Get Node Pool Zone', args
+  )
+  if return_code != 0:
+    xpk_print(f'Get Node Pool Zone returned ERROR {return_code}')
+    return 1, None
+
+  return 0, nodepool_zone.strip()
+
+
+def get_gke_node_pool_version(
+    args, gke_server_config: GkeServerConfig
+) -> tuple[int, str | None]:
+  """Determine the gke node pool version for the node pool.
+
+  Args:
+    args: user provided arguments for running the command.
+    gke_server_config: holds valid gke versions and recommended default version.
+
+  Returns:
+    Tuple of
+    int: 0 if successful and 1 otherwise.
+    str: gke node pool version to use.
+  """
+
+  # By default use the current gke master version for creating node pools.
+  command_description = 'Determine current gke master version'
+  command = (
+      f'gcloud beta container clusters describe {args.cluster}'
+      f' --region {zone_to_region(args.zone)} --project {args.project}'
+      ' --format="value(currentMasterVersion)"'
+  )
+
+  return_code, current_gke_master_version = run_command_for_value(
+      command, command_description, args
+  )
+  if return_code != 0:
+    xpk_print(
+        f'Unable to get server config for command: {command_description}.'
+    )
+    return return_code, None
+
+  # Override with the user-provided gke version if specified.
+  if args.gke_version is not None:
+    node_pool_gke_version = args.gke_version
+  else:
+    master_gke_version = current_gke_master_version.strip()
+    node_pool_gke_version = ''
+    # Select minimum version which is >= master_gke_version and has the same minor version.
+    # If this does not exist select maximum version which is < master_gke_version.
+    for version in gke_server_config.valid_versions:
+      if (
+          (node_pool_gke_version == '' or node_pool_gke_version < version)
+          and version < master_gke_version
+      ) or (
+          (node_pool_gke_version == '' or node_pool_gke_version > version)
+          and master_gke_version <= version
+          and master_gke_version.split('.')[:2] == version.split('.')[:2]
+      ):
+        node_pool_gke_version = version
+
+  is_supported_node_pool_version = (
+      node_pool_gke_version in gke_server_config.valid_versions
+  )
+  # In rare cases, user's provided gke version may be invalid, but gke will return an error if so.
+  # An example scenario is if the user provided gke version is greater than the master version.
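+  # Illustrative sketch of the intended selection above, with hypothetical
+  # version strings: given master 1.29.5-gke.100, prefer the smallest valid
+  # 1.29.x version at or above the master (e.g. 1.29.7-gke.10); otherwise
+  # fall back to the largest valid version below the master (e.g.
+  # 1.29.4-gke.50). Note that the comparisons above are lexicographic string
+  # comparisons, not semantic version comparisons.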
+  if not is_supported_node_pool_version:
+    xpk_print(
+        f'Planned node pool version {node_pool_gke_version} is not among the'
+        ' valid versions'
+        f' {gke_server_config.valid_versions}\nPlease adjust the gke version'
+        ' using --gke-version=x or remove the arg and depend on xpk default of'
+        f' {current_gke_master_version}'
+    )
+    return 1, None
+  return 0, node_pool_gke_version
+
+
+def upgrade_gke_nodepools_version(args, default_rapid_gke_version) -> int:
+  """Upgrade nodepools in the cluster to default rapid gke version. Recreates the nodes.
+
+  Args:
+    args: user provided arguments for running the command.
+    default_rapid_gke_version: Rapid default version for the upgrade.
+
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  existing_node_pool_names, return_code = get_all_nodepools_programmatic(args)
+  if return_code != 0:
+    xpk_print('Listing all node pools failed!')
+    return return_code
+
+  # Batch execution to upgrade node pools simultaneously
+  commands = []
+  task_names = []
+  for node_pool_name in existing_node_pool_names:
+    commands.append(
+        'gcloud container clusters upgrade'
+        f' {args.cluster} --project={args.project}'
+        f' --region={zone_to_region(args.zone)}'
+        f' --cluster-version={default_rapid_gke_version}'
+        f' --node-pool={node_pool_name}'
+        ' --quiet'
+    )
+    task_names.append(f'Upgrading node pool {node_pool_name}.')
+
+  for i, command in enumerate(commands):
+    xpk_print(f'To complete {task_names[i]} we are executing {command}')
+  max_return_code = run_commands(
+      commands, 'Update GKE node pools to default RAPID GKE version', task_names
+  )
+  if max_return_code != 0:
+    xpk_print(
+        'GKE node pools update to default RAPID GKE version returned ERROR:'
+        f' {max_return_code}'
+    )
+    return int(max_return_code)
+  return 0
+
+
+def get_nodepool_workload_metadata_mode(
+    args, nodepool_name
+) -> tuple[int, str | None]:
+  """Return Workload Identity metadata mode of the nodepool.
+  Args:
+    args: user provided arguments for running the command.
+    nodepool_name: name of nodepool.
+  Returns:
+    Tuple of int, str where
+    int is the return code - 0 if successful, 1 otherwise.
+    str is the workload metadata mode of nodepool.
+  """
+  command = (
+      f'gcloud beta container node-pools describe {nodepool_name}'
+      f' --cluster {args.cluster} --project={args.project}'
+      f' --region={zone_to_region(args.zone)} --format="value(config.workloadMetadataConfig.mode)"'
+  )
+  return_code, nodepool_WI_mode = run_command_for_value(
+      command, 'Get Node Pool Workload Identity Metadata Mode', args
+  )
+  if return_code != 0:
+    xpk_print(
+        'Get Node Pool Workload Identity Metadata Mode returned ERROR'
+        f' {return_code}'
+    )
+    return 1, None
+
+  return 0, nodepool_WI_mode.strip()
diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py
index 8ef54bece..b9501bb04 100644
--- a/src/xpk/core/pathways.py
+++ b/src/xpk/core/pathways.py
@@ -14,14 +14,11 @@
 limitations under the License.
""" +from ..core.docker_container import get_user_workload_container +from ..core.gcloud_context import zone_to_region +from ..core.nodepool import get_all_nodepools_programmatic from ..utils.console import xpk_exit, xpk_print -from .core import ( - GCS_FUSE_ANNOTATION, - AcceleratorType, - get_all_nodepools_programmatic, - get_user_workload_container, - zone_to_region, -) +from .config import GCS_FUSE_ANNOTATION, AcceleratorType from .storage import XPK_SA, Storage, get_storage_volumes_yaml from .system_characteristics import SystemCharacteristics diff --git a/src/xpk/core/resources.py b/src/xpk/core/resources.py new file mode 100644 index 000000000..925784087 --- /dev/null +++ b/src/xpk/core/resources.py @@ -0,0 +1,216 @@ +""" +Copyright 2025 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from dataclasses import dataclass + +from ..utils.console import xpk_print +from ..utils.file import write_tmp_file +from .capacity import ( + AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, + AUTOPROVISIONING_CONFIG_MINIMUM_KEY, + AUTOPROVISIONING_CONFIG_VALUE, + CAPACITY_TYPE_CONFIG_KEY, + RESERVATION_CONFIG_KEY, + CapacityType, + get_capacity_type, +) +from .commands import run_command_for_value, run_commands +from .config import XPK_CURRENT_VERSION +from .system_characteristics import AcceleratorType + +CLUSTER_RESOURCES_CONFIGMAP = 'resources-configmap' +CLUSTER_METADATA_CONFIGMAP = 'metadata-configmap' + +CLUSTER_CONFIGMAP_YAML = """kind: ConfigMap +apiVersion: v1 +metadata: + name: {name} +data: + {data} +""" + + +@dataclass +class AutoprovisioningConfig: + config_filename: str + minimum_chips: int + maximum_chips: int + + +def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None: + """Run the Get GKE Cluster ConfigMap request. + + Args: + args: user provided arguments for running the command. + configmap_name: name of the configmap. + + Returns: + key:value pairs stored in cluster ConfigMap. + """ + command = ( + 'kubectl get configmap' + f' {configmap_name} -o=custom-columns="ConfigData:data" --no-headers=true' + ) + + return_code, return_value = run_command_for_value( + command, 'GKE Cluster Get ConfigMap', args + ) + if return_code != 0: + xpk_print(f'GKE Cluster Get ConfigMap request returned ERROR {return_code}') + return None + + config_map = {} + return_value = return_value.strip() + + if return_value: + # Format of ConfigMap: map[key1:value1 key2:value2] + return_value = return_value[return_value.index('map') :] + configs = return_value[4:-1].split(' ') + + for config in configs: + key, value = config.strip().split(':') + config_map[key] = value + return config_map + + +def create_cluster_configmaps( + args, + system, + tensorboard_config: dict, + autoprovisioning_config: AutoprovisioningConfig | None, +) -> int: + """Run the Create GKE Cluster ConfigMap request. + + Args: + args: user provided arguments for running the command. + system: system characteristics. 
+    tensorboard_config: map that contains Vertex Tensorboard name, id and location.
+    autoprovisioning_config: Config used in autoprovisioning.
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  configmap_yml = {}
+
+  # ConfigMap to store resources available in the cluster.
+  device_type = system.device_type
+  if system.accelerator_type == AcceleratorType['GPU']:
+    resources_data = f'{device_type}: "{int(args.num_nodes)}"'
+  elif (
+      not args.enable_pathways
+      and args.enable_autoprovisioning
+      and autoprovisioning_config
+  ):
+    # Currently autoprovisioning is not supported with Pathways.
+    # Auto provisioning will have variable topologies for a gke accelerator type.
+    resources_data = (
+        f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}'
+    )
+    resources_data += (
+        f'\n  {AUTOPROVISIONING_CONFIG_MINIMUM_KEY}:'
+        f' "{autoprovisioning_config.minimum_chips}"'
+    )
+    resources_data += (
+        f'\n  {AUTOPROVISIONING_CONFIG_MAXIMUM_KEY}:'
+        f' "{autoprovisioning_config.maximum_chips}"'
+    )
+  else:
+    resources_data = (
+        f'{device_type}: "{int(args.num_slices) * system.vms_per_slice}"'
+    )
+  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
+  resources_yml = CLUSTER_CONFIGMAP_YAML.format(
+      args=args, name=resources_configmap_name, data=resources_data
+  )
+  configmap_yml[resources_configmap_name] = resources_yml
+
+  # ConfigMap to store cluster metadata.
+  # XPK Version.
+  metadata = f'xpk_version: {XPK_CURRENT_VERSION}'
+  # Vertex Tensorboard information.
+  for key, value in tensorboard_config.items():
+    metadata += f'\n  {key}: "{value}"'
+  # Capacity Type.
+  capacity_type, return_code = get_capacity_type(args)
+  if return_code != 0:
+    xpk_print('Unable to determine capacity type.')
+    return return_code
+  metadata += f'\n  {CAPACITY_TYPE_CONFIG_KEY}: {capacity_type.name}'
+  # Reservation ID if applicable.
+  if capacity_type == CapacityType.RESERVATION:
+    metadata += f'\n  {RESERVATION_CONFIG_KEY}: {args.reservation}'
+  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
+  metadata_yml = CLUSTER_CONFIGMAP_YAML.format(
+      args=args, name=metadata_configmap_name, data=metadata
+  )
+  configmap_yml[metadata_configmap_name] = metadata_yml
+  return create_or_update_cluster_configmap(configmap_yml)
+
+
+def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
+  """
+  Args:
+    configmap_yml: dict containing ConfigMap name and yml string.
+
+  Returns:
+    0 if successful, 1 otherwise.
+  """
+  commands = []
+  task_names = []
+  for configmap_name, yml_string in configmap_yml.items():
+    tmp = write_tmp_file(yml_string)
+    command = f'kubectl apply -f {str(tmp.file.name)}'
+    commands.append(command)
+    task_name = f'ConfigMap CreateOrUpdate-{configmap_name}'
+    task_names.append(task_name)
+
+  return_code = run_commands(
+      commands, 'GKE Cluster CreateOrUpdate ConfigMap(s)', task_names
+  )
+  if return_code != 0:
+    xpk_print(
+        'GKE Cluster Create/Update ConfigMap(s) request returned ERROR'
+        f' {return_code}'
+    )
+    return 1
+  return 0
+
+
+def check_cluster_resources(args, system) -> tuple[bool, bool]:
+  """Check if cluster has resources of a specified device_type/gke_accelerator.
+  This check will be skipped if the {args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP} ConfigMap doesn't exist for the cluster.
+
+  Args:
+    args: user provided arguments for running the command.
+    system: system characteristics.
+
+  Returns:
+    Tuple of bool, bool
+    True if resources in the cluster should be checked, False otherwise.
+    True if device_type/gke_accelerator exists in the cluster, False otherwise.
+  """
+  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
+  resources_config_map = get_cluster_configmap(args, resources_configmap_name)
+  if resources_config_map is None:
+    xpk_print(
+        f'No ConfigMap exists for cluster named {resources_configmap_name}.'
+        ' Cluster resources check will be skipped.'
+    )
+    return False, False
+  if system.device_type in resources_config_map:
+    return True, True
+  elif system.gke_accelerator in resources_config_map:
+    return True, True
+  return True, False
diff --git a/src/xpk/core/scheduling.py b/src/xpk/core/scheduling.py
new file mode 100644
index 000000000..fe9774dd9
--- /dev/null
+++ b/src/xpk/core/scheduling.py
@@ -0,0 +1,253 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from ..utils.console import xpk_print
+from .capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
+from .resources import CLUSTER_RESOURCES_CONFIGMAP, get_cluster_configmap
+from .system_characteristics import (
+    AcceleratorType,
+    AcceleratorTypeToAcceleratorCharacteristics,
+    SystemCharacteristics,
+)
+
+
+def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
+  """Check if workload can schedule based on the cluster resources (tpu_type and maximum VM in cluster).
+
+  Args:
+    args: user provided arguments for running the command.
+    system: system characteristics.
+
+  Returns:
+    returns true if workload can schedule, otherwise returns false.
+  """
+  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
+  cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
+
+  # Prevents workload creation failure for existing clusters with no ConfigMap.
+  if cluster_config_map is None:
+    xpk_print(
+        'No ConfigMap exists for cluster with the name'
+        f' {resources_configmap_name}.'
+    )
+    return True
+
+  # Check for gke accelerator type:
+  missing_gke_accelerator_type = False
+  if not cluster_config_map.get(system.gke_accelerator):
+    xpk_print(
+        f'Gke Accelerator Type Check: {args.workload} is requesting'
+        f' {system.gke_accelerator} but cluster only contains'
+        f' {cluster_config_map.keys()}. '
+    )
+    missing_gke_accelerator_type = True
+  elif (
+      cluster_config_map[system.gke_accelerator]
+      == AUTOPROVISIONING_CONFIG_VALUE
+  ):
+    # Run total chip check when in autoprovisioning mode.
+    max_chips_in_cluster = int(
+        cluster_config_map[AUTOPROVISIONING_CONFIG_MAXIMUM_KEY]
+    )
+    num_chips_in_workload = get_total_chips_requested_from_args(args, system)
+
+    if num_chips_in_workload > max_chips_in_cluster:
+      xpk_print(
+          f'{args.workload} is requesting {num_chips_in_workload} chips but'
+          f' the cluster {args.cluster} supports up to {max_chips_in_cluster}.'
+ ' Resize the cluster to support more chips with' + ' `xpk cluster create --autoprovisioning-max-chips=X ...`' + ) + return False + return True + + # Check for device type + missing_device_type = False + device_type = system.device_type + if device_type not in cluster_config_map: + xpk_print( + f'Device Type Check: {args.workload} is requesting {device_type} but ' + f'cluster only contains {cluster_config_map.keys()}. ' + ) + missing_device_type = True + + if missing_device_type and missing_gke_accelerator_type: + xpk_print( + 'Both Device Type and GKE Accelerator Type checks failed.' + f' XPK will not create the workload {args.workload}.' + ) + return False + else: + # Check if the size of the workload will fit in the cluster. + max_vm_in_cluster = int(cluster_config_map[device_type]) + if system.accelerator_type == AcceleratorType['GPU']: + vm_required_by_workload = args.num_nodes + else: + vm_required_by_workload = args.num_slices * system.vms_per_slice + if vm_required_by_workload > max_vm_in_cluster: + xpk_print( + f'{args.workload} is requesting {args.num_slices} slice/slices of' + f' {device_type}, which is {vm_required_by_workload} VMs, but the' + f' cluster only contains {max_vm_in_cluster} VMs of {device_type}.' + ' XPK will not create this workload.' + ) + return False + + return True + + +def get_total_chips_requested_from_args( + args, system: SystemCharacteristics +) -> int: + """Return the total chips requested based on user args. + + Args: + args: user provided arguments for running the command. + system: system characteristics. + + Returns: + num of chips for the current request. + """ + if system.accelerator_type == AcceleratorType['GPU']: + num_chips = system.vms_per_slice * system.chips_per_vm * args.num_nodes + else: + num_chips = system.vms_per_slice * system.chips_per_vm * args.num_slices + + return int(num_chips) + + +def get_cpu_affinity(accelerator_type) -> str: + """Generate affinity rules for CPU nodepools, so that workload pods are + not scheduled on the default pool machines. + Args: + accelerator_type: TPU / GPU / CPU + + Returns: + str: yaml containing affinity constraints + """ + yaml = """affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: NotIn + values: + - default-pool +""" + if accelerator_type == AcceleratorType['CPU']: + return yaml + return '' + + +def get_gpu_scheduler( + args, system: SystemCharacteristics, autoprovisioning_args: str +) -> tuple[str, int]: + """Get gpu scheduler configuration. + + Args: + args: user provided arguments for running the command. + system: system characteristics. + autoprovisioning_args: a string of arguments for Autoprovisioning. + + Returns: + str: yaml containing gpu scheduler configuration + int of 0 if successful and 1 otherwise. 
+ """ + gpu_scheduler = '' + return_code = 0 + + if args.scheduler == 'gke.io/topology-aware-auto': + gpu_scheduler = f"""schedulingGates: + - name: "{args.scheduler}-{args.workload}" + """ + elif args.scheduler == 'default-scheduler': + gpu_scheduler_yaml = """schedulerName: {scheduler_name} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-accelerator + operator: Exists + - key: cloud.google.com/gke-nodepool + operator: In + values: [{node_pool_name}] + nodeSelector: + {accelerator_label} + {machine_label} + {autoprovisioning_args} + """ + gpu_scheduler = gpu_scheduler_yaml.format( + scheduler_name=args.scheduler, + accelerator_label=create_accelerator_label( + system.accelerator_type, system + ), + machine_label=create_machine_label(system.accelerator_type, system), + node_pool_name=f'{args.cluster}-np-0', + autoprovisioning_args=autoprovisioning_args, + ) + else: + return_code = 1 + xpk_print( + '--scheduler needs to be set as either `default-scheduler`' + ' or `gke.io/topology-aware-auto` in order to schedule the' + ' workloads on GPUs.' + ) + + return gpu_scheduler, return_code + + +def create_accelerator_label(accelerator_type, system) -> str: + """Generates accelerator label. + + Args: + accelerator_type: type of accelerator. + system: system characteristics. + + Returns: + The accelerator label. + """ + if accelerator_type == AcceleratorType['CPU']: + return '' + return ( + f'{AcceleratorTypeToAcceleratorCharacteristics[accelerator_type].accelerator_label}:' + f' {system.gke_accelerator}' + ) + + +def create_machine_label( + accelerator_type, system, autoprovisioning_enabled: bool = False +) -> str: + """Generates machine label. + + Args: + accelerator_type: type of accelerator. + system: system characteristics. + autoprovisioning_enabled: describes autoprovisioning enablement. + + Returns: + The machine label. + """ + if ( + accelerator_type == AcceleratorType['TPU'] + and not autoprovisioning_enabled + ): + return ( + f'{AcceleratorTypeToAcceleratorCharacteristics[accelerator_type].machine_label}:' + f' {system.topology}' + ) + return '' diff --git a/src/xpk/core/tests/integration/test_gcluster_a3ultra.py b/src/xpk/core/tests/integration/test_gcluster_a3ultra.py index 9687bea61..57a873214 100644 --- a/src/xpk/core/tests/integration/test_gcluster_a3ultra.py +++ b/src/xpk/core/tests/integration/test_gcluster_a3ultra.py @@ -14,14 +14,16 @@ limitations under the License. """ +import os +import shutil + +import pytest + from xpk.commands.cluster_gcluster import get_unique_name +from xpk.core.blueprint.blueprint_generator import BlueprintGenerator +from xpk.core.capacity import CapacityType from xpk.core.docker_manager import DockerManager from xpk.core.gcluster_manager import GclusterManager -from xpk.core.blueprint.blueprint_generator import BlueprintGenerator -from xpk.core.core import CapacityType -import pytest -import os -import shutil ctk_gcloud_cfg = os.getenv("GCLOUD_CFG_PATH") project_id = os.getenv("PROJECT_ID") diff --git a/src/xpk/core/tests/unit/test_blueprint.py b/src/xpk/core/tests/unit/test_blueprint.py index 7c408b762..46b3fb176 100644 --- a/src/xpk/core/tests/unit/test_blueprint.py +++ b/src/xpk/core/tests/unit/test_blueprint.py @@ -14,12 +14,14 @@ limitations under the License. 
""" +import os import shutil -from xpk.core.blueprint.blueprint_generator import BlueprintGenerator -from xpk.core.blueprint.blueprint_definitions import Blueprint -from xpk.core.core import CapacityType + import ruamel.yaml -import os + +from xpk.core.blueprint.blueprint_definitions import Blueprint +from xpk.core.blueprint.blueprint_generator import BlueprintGenerator +from xpk.core.capacity import CapacityType yaml = ruamel.yaml.YAML() diff --git a/src/xpk/core/vertex.py b/src/xpk/core/vertex.py new file mode 100644 index 000000000..6507e1856 --- /dev/null +++ b/src/xpk/core/vertex.py @@ -0,0 +1,105 @@ +""" +Copyright 2025 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from ..utils.console import xpk_print +from .resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap + +DEFAULT_VERTEX_TENSORBOARD_NAME = 'tb-instance' + + +def create_vertex_tensorboard(args) -> dict: + """Creates a Tensorboard instance in Vertex AI. + + Args: + args: user provided arguments. + + Returns: + dict containing Tensorboard instance name, id and location. + """ + from cloud_accelerator_diagnostics import ( # pylint: disable=import-outside-toplevel + tensorboard, + ) + + tensorboard_config = {} + tensorboard_name = args.tensorboard_name + if tensorboard_name is None: + tensorboard_name = f'{args.cluster}-{DEFAULT_VERTEX_TENSORBOARD_NAME}' + instance_id = tensorboard.create_instance( # pylint: disable=used-before-assignment + project=args.project, + location=args.tensorboard_region, + tensorboard_name=tensorboard_name, + ) + if instance_id: + xpk_print( + f'Tensorboard instance {tensorboard_name} is successfully created.' + ) + tensorboard_config['tensorboard_region'] = args.tensorboard_region + tensorboard_config['tensorboard_name'] = tensorboard_name + tensorboard_config['tensorboard_id'] = instance_id + return tensorboard_config + + +def create_vertex_experiment(args) -> dict | None: + """Creates an Experiment in Vertex AI. + + Args: + args: user provided arguments. + + Returns: + map containing Vertex Tensorboard configurations. + """ + from cloud_accelerator_diagnostics import ( # pylint: disable=import-outside-toplevel + tensorboard, + ) + + metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}' + cluster_config_map = get_cluster_configmap(args, metadata_configmap_name) + + if cluster_config_map is None or 'tensorboard_name' not in cluster_config_map: + xpk_print( + 'No Vertex Tensorboard instance has been created in cluster create. Run' + ' `xpk cluster create --create-vertex-tensorboard` before running `xpk' + ' workload create --use-vertex-tensorboard` to create a Vertex' + ' Tensorboard instance. Alternatively, use `xpk cluster create-pathways' + ' --create-vertex-tensorboard` before running `xpk workload' + ' create-pathways --use-vertex-tensorboard`.' 
+ ) + return None + + tensorboard_config = {} + tensorboard_config['tensorboard_project'] = args.project + tensorboard_config['tensorboard_region'] = cluster_config_map[ + 'tensorboard_region' + ] + tensorboard_config['tensorboard_name'] = cluster_config_map[ + 'tensorboard_name' + ] + experiment_name = args.experiment_name + if experiment_name is None: + experiment_name = f'{args.cluster}-{args.workload}' + tensorboard_config['experiment_name'] = experiment_name + + _, tensorboard_url = tensorboard.create_experiment( + project=args.project, + location=tensorboard_config['tensorboard_region'], + experiment_name=experiment_name, + tensorboard_name=tensorboard_config['tensorboard_name'], + ) + if tensorboard_url is None: + return None + + xpk_print(f'You can view Vertex Tensorboard at: {tensorboard_url}') + return tensorboard_config diff --git a/src/xpk/core/workload.py b/src/xpk/core/workload.py index b23d482e7..f2a46c9db 100644 --- a/src/xpk/core/workload.py +++ b/src/xpk/core/workload.py @@ -14,7 +14,11 @@ limitations under the License. """ +from ..utils.console import xpk_exit, xpk_print +from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE from .commands import run_command_for_value +from .gcloud_context import zone_to_region +from .system_characteristics import SystemCharacteristics def workload_list_awk_command(filter_key) -> str: @@ -131,3 +135,208 @@ def get_workload_list(args) -> tuple[int, str]: return_code, return_value = run_command_for_value(command, task, args) return return_code, return_value + + +def check_if_workload_exists(args) -> bool: + """Check if workload exists. + + Args: + args: user provided arguments for running the command. + + Returns: + returns true if workload exist, otherwise returns false. + """ + columns = { + 'Jobset': '.metadata.ownerReferences[0].name', + } + + s = ','.join([key + ':' + value for key, value in columns.items()]) + + command = f"kubectl get workloads -o=custom-columns='{s}'" + return_code, return_msg = run_command_for_value( + command, 'Check if Workload Already Exists', args + ) + + if return_code != 0: + xpk_print(f'List Job request returned ERROR {return_code}') + xpk_exit(return_code) + + lines = return_msg.split('\n') + new_workload_name = args.workload + for line in lines: + if line == new_workload_name: + return True + return False + + +def wait_for_job_completion(args) -> int: + """Function to wait for job completion. + + Args: + args: user provided arguments for running the command. 
+
+
+def wait_for_job_completion(args) -> int:
+  """Wait for a job to complete.
+
+  Args:
+    args: user provided arguments for running the command.
+
+  Returns:
+    return_code: 0 on success, 124 on timeout, 125 if the job finished
+    unsuccessfully, 1 otherwise.
+  """
+  # Check that the workload exists
+  args.workload = args.wait_for_job_completion
+  workload_exists = check_if_workload_exists(args)
+  if not workload_exists:
+    xpk_print(f'Workload named {args.workload} does not exist.')
+    return 1
+
+  # Get the full workload name
+  get_workload_name_cmd = f'kubectl get workloads | grep jobset-{args.workload}'
+  return_code, return_value = run_command_for_value(
+      get_workload_name_cmd, 'Get full workload name', args
+  )
+  if return_code != 0:
+    xpk_print(f'Get full workload name request returned ERROR {return_code}')
+    return return_code
+  full_workload_name = return_value.split(' ')[0]
+
+  # Call kubectl wait on the workload using the full workload name
+  timeout_val = args.timeout if args.timeout is not None else -1
+  timeout_msg = (
+      f'{timeout_val}s' if timeout_val != -1 else 'max timeout (1 week)'
+  )
+  wait_cmd = (
+      "kubectl wait --for jsonpath='.status.conditions[-1].type'=Finished"
+      f' workload {full_workload_name} --timeout={timeout_val}s'
+  )
+  return_code, return_value = run_command_for_value(
+      wait_cmd,
+      f'Wait for workload to finish with timeout of {timeout_msg}',
+      args,
+      print_timer=True,
+  )
+  if return_code != 0:
+    if 'timed out' in return_value:
+      xpk_print(
+          f'Timed out waiting for your workload after {timeout_msg}, see your'
+          ' workload here:'
+          # pylint: disable=line-too-long
+          f' https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
+      )
+      return 124
+    else:
+      xpk_print(f'{return_value}')
+      xpk_print(f'Wait for workload returned ERROR {return_code}')
+      return return_code
+  xpk_print(
+      'Finished waiting for your workload, see your workload here:'
+      # pylint: disable=line-too-long
+      f' https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
+  )
+  status_cmd = (
+      f'kubectl get jobset {args.workload} -o'
+      " jsonpath='{.status.conditions[-1].type}'"
+  )
+  return_code, return_value = run_command_for_value(
+      status_cmd, 'Get jobset status', args
+  )
+  if return_code != 0:
+    xpk_print(f'Get workload status request returned ERROR {return_code}')
+    return return_code
+  xpk_print(f'Your workload finished with status: {return_value}')
+  if return_value != 'Completed':
+    xpk_print('Your workload did not complete successfully')
+    return 125
+  return 0
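
A caller-side sketch of how the documented exit codes might be dispatched on (illustrative only; args is a prepared Namespace as in the function above):

    rc = wait_for_job_completion(args)
    if rc == 0:
      print('workload completed successfully')
    elif rc == 124:
      print('timed out waiting for the workload')
    elif rc == 125:
      print('workload finished, but not with status Completed')
    else:
      print(f'error while waiting (exit code {rc})')
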
+
+
+def get_gpu_volume(system: SystemCharacteristics) -> str:
+  """Get the GPU volume YAML based on system characteristics.
+
+  Args:
+    system: system characteristics.
+
+  Returns:
+    str: YAML fragment containing the GPU volumes.
+  """
+  gpu_volume = ''
+  if system.device_type == H100_DEVICE_TYPE:
+    gpu_volume = """- name: nvidia-install-dir-host
+                hostPath:
+                  path: /home/kubernetes/bin/nvidia/lib64
+              - name: tcpd-socket
+                hostPath:
+                  path: /run/tcpx
+              - name: shared-memory
+                emptyDir:
+                  medium: "Memory"
+                  sizeLimit: 200Gi
+              - name: workload-terminated-volume
+                emptyDir:
+              - name: tcpx-nccl-plugin-volume
+                emptyDir:"""
+  elif system.device_type == H100_MEGA_DEVICE_TYPE:
+    gpu_volume = """- name: nvidia-install-dir-host
+                hostPath:
+                  path: /home/kubernetes/bin/nvidia/lib64
+              - name: shared-memory
+                emptyDir:
+                  medium: "Memory"
+                  sizeLimit: 1Gi
+              - name: workload-terminated-volume
+                emptyDir:"""
+  return gpu_volume
+
+
+def get_gpu_rxdm_image(system: SystemCharacteristics) -> str:
+  """Get the rxdm container name and image based on system characteristics.
+
+  Args:
+    system: system characteristics.
+
+  Returns:
+    str: YAML fragment containing the rxdm name and image.
+  """
+  gpu_rxdm_image = ''
+  if system.device_type == H100_DEVICE_TYPE:
+    gpu_rxdm_image = """- name: tcpd-daemon
+        image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.9"""
+  elif system.device_type == H100_MEGA_DEVICE_TYPE:
+    gpu_rxdm_image = """- name: fastrak-daemon
+        image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.9"""
+  return gpu_rxdm_image
+
+
+def get_gpu_rxdm_cmd(system: SystemCharacteristics) -> str:
+  """Get the rxdm command based on system characteristics.
+
+  Args:
+    system: system characteristics.
+
+  Returns:
+    str: command for running the rxdm container.
+  """
+  gpu_rxdm_cmd = ''
+  if system.device_type == H100_DEVICE_TYPE:
+    gpu_rxdm_cmd = (
+        '/tcpgpudmarxd/build/app/tcpgpudmarxd --gpu_nic_preset a3vm'
+        ' --gpu_shmem_type fd --setup_param "--verbose 128 2 0"'
+    )
+  elif system.device_type == H100_MEGA_DEVICE_TYPE:
+    gpu_rxdm_cmd = (
+        'set -ex; chmod 755 /fts/entrypoint_rxdm_container.sh;'
+        ' /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid='
+        ' --alsologtostderr'
+    )
+  return gpu_rxdm_cmd
+
+
+def get_gpu_tcp_volume(system: SystemCharacteristics) -> str:
+  """Get the GPU TCP volume mount based on system characteristics.
+
+  Args:
+    system: system characteristics.
+
+  Returns:
+    str: YAML fragment containing the GPU TCP volume.
+  """
+  gpu_tcp_volume = ''
+  if system.device_type == H100_DEVICE_TYPE:
+    gpu_tcp_volume = """- name: tcpd-socket
+                mountPath: /tmp"""
+  return gpu_tcp_volume
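
The four get_gpu_* helpers share one contract: they key off system.device_type and return an empty string for any other device type, so callers can splice the result into a manifest unconditionally. A minimal sketch, assuming system already describes an H100 machine:

    # Empty fragments are skipped, so non-GPU systems need no special-casing.
    for helper in (
        get_gpu_volume,
        get_gpu_rxdm_image,
        get_gpu_rxdm_cmd,
        get_gpu_tcp_volume,
    ):
      fragment = helper(system)
      if fragment:
        print(f'{helper.__name__}:\n{fragment}')
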
+ help=f"""Pair of (key, value) to be set in config. Allowed keys are: {DEFAULT_KEYS}. Command usage: `xpk config set key value`""", type=str, nargs=2, ) config_get_parser.add_argument( 'get_config_key', - help=f"""Get key value from config. Allowed keys are: {default_keys} . + help=f"""Get key value from config. Allowed keys are: {DEFAULT_KEYS} . Command usage: `xpk config get key`""", type=str, nargs=1, diff --git a/src/xpk/parser/workload.py b/src/xpk/parser/workload.py index 1c6b1acd1..54fe298da 100644 --- a/src/xpk/parser/workload.py +++ b/src/xpk/parser/workload.py @@ -20,9 +20,9 @@ workload_delete, workload_list, ) -from ..core.core import default_docker_image, default_script_dir -from .validators import directory_path_type, name_type +from ..core.docker_image import DEFAULT_DOCKER_IMAGE, DEFAULT_SCRIPT_DIR from .common import add_shared_arguments +from .validators import directory_path_type, name_type def set_workload_parsers(workload_parser): @@ -638,10 +638,10 @@ def add_shared_workload_base_docker_image_arguments(args_parsers): custom_parser.add_argument( '--base-docker-image', type=str, - default=default_docker_image, + default=DEFAULT_DOCKER_IMAGE, help=( 'The base docker-image to use, default' - f' {default_docker_image}. If using a custom docker image it' + f' {DEFAULT_DOCKER_IMAGE}. If using a custom docker image it' ' is typically addressed as gcr.io/${PROJECT}/${NAME}:latest.' ' This docker image will be used as a base image by default and' ' the `--script-dir` by default will be added to the image.' @@ -650,7 +650,7 @@ def add_shared_workload_base_docker_image_arguments(args_parsers): custom_parser.add_argument( '--script-dir', type=directory_path_type, - default=default_script_dir, + default=DEFAULT_SCRIPT_DIR, help=( 'The local location of the directory to copy to the docker image' ' and run the main command from. Defaults to current working'