From 7b06620a3c98b4cd06ded45da6cc1d40bf3b2cb6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 16 Aug 2024 00:19:27 +0000 Subject: [PATCH 1/3] Fix None issue when no provision timeout is provided --- sky/provision/gcp/instance_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sky/provision/gcp/instance_utils.py b/sky/provision/gcp/instance_utils.py index 933df5e08a1..9a8fb04765a 100644 --- a/sky/provision/gcp/instance_utils.py +++ b/sky/provision/gcp/instance_utils.py @@ -1081,6 +1081,9 @@ def create_instances( run_duration=managed_instance_group_config['run_duration']) cls.wait_for_operation(operation, project_id, zone=zone) + provision_timeout = managed_instance_group_config.get('provision_timeout') + if provision_timeout is None: + provision_timeout = constants.DEFAULT_MANAGED_INSTANCE_GROUP_PROVISION_TIMEOUT # This will block the provisioning until the nodes are ready, which # makes the failover not effective. We rely on the request timeout set # by user to trigger failover. 
@@ -1088,9 +1091,7 @@ def create_instances( project_id, zone, managed_instance_group_name, - timeout=managed_instance_group_config.get( - 'provision_timeout', - constants.DEFAULT_MANAGED_INSTANCE_GROUP_PROVISION_TIMEOUT)) + timeout=provision_timeout) pending_running_instance_names = cls._add_labels_and_find_head( cluster_name, project_id, zone, labels, potential_head_instances) From 7e034f3e002167a4cd0ca4ee2f711e2db57f871e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 16 Aug 2024 00:50:21 +0000 Subject: [PATCH 2/3] raise instead of print --- sky/provision/gcp/mig_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sky/provision/gcp/mig_utils.py b/sky/provision/gcp/mig_utils.py index 9e33f5171e2..3a7d0db1805 100644 --- a/sky/provision/gcp/mig_utils.py +++ b/sky/provision/gcp/mig_utils.py @@ -207,3 +207,4 @@ def wait_for_managed_group_to_be_stable(project_id: str, zone: str, except subprocess.CalledProcessError as e: stderr = e.stderr.decode('ascii') logger.info(stderr) + raise From b205166fad6194b3c21f427d8036d029d7af69ff Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 16 Aug 2024 07:22:34 +0000 Subject: [PATCH 3/3] Change to check provision_timeout is none --- sky/provision/gcp/instance_utils.py | 7 +++---- sky/templates/gcp-ray.yml.j2 | 2 ++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/sky/provision/gcp/instance_utils.py b/sky/provision/gcp/instance_utils.py index 9a8fb04765a..933df5e08a1 100644 --- a/sky/provision/gcp/instance_utils.py +++ b/sky/provision/gcp/instance_utils.py @@ -1081,9 +1081,6 @@ def create_instances( run_duration=managed_instance_group_config['run_duration']) cls.wait_for_operation(operation, project_id, zone=zone) - provision_timeout = managed_instance_group_config.get('provision_timeout') - if provision_timeout is None: - provision_timeout = constants.DEFAULT_MANAGED_INSTANCE_GROUP_PROVISION_TIMEOUT # This will block the provisioning until the nodes are ready, which # makes the failover not effective. 
We rely on the request timeout set # by user to trigger failover. @@ -1091,7 +1088,9 @@ def create_instances( project_id, zone, managed_instance_group_name, - timeout=provision_timeout) + timeout=managed_instance_group_config.get( + 'provision_timeout', + constants.DEFAULT_MANAGED_INSTANCE_GROUP_PROVISION_TIMEOUT)) pending_running_instance_names = cls._add_labels_and_find_head( cluster_name, project_id, zone, labels, potential_head_instances) diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index d986adbf6df..d7e787953d9 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -88,7 +88,9 @@ available_node_types: {%- if gcp_use_managed_instance_group %} managed-instance-group: run_duration: {{ run_duration }} + {%- if provision_timeout is defined and provision_timeout is not none %} provision_timeout: {{ provision_timeout }} + {%- endif %} {%- endif %} {%- if specific_reservations %} reservationAffinity: