diff --git a/docs/source/examples/auto-failover.rst b/docs/source/examples/auto-failover.rst index 99ee5703738..c23f6273697 100644 --- a/docs/source/examples/auto-failover.rst +++ b/docs/source/examples/auto-failover.rst @@ -60,18 +60,22 @@ provisioner handles such a request: .. code-block:: console $ sky launch -c gpu --gpus V100 - ... # optimizer output - I 02-11 21:17:43 cloud_vm_ray_backend.py:1034] Creating a new cluster: "gpu" [1x GCP(n1-highmem-8, {'V100': 1.0})]. - I 02-11 21:17:43 cloud_vm_ray_backend.py:1034] Tip: to reuse an existing cluster, specify --cluster-name (-c) in the CLI or use sky.launch(.., cluster_name=..) in the Python API. Run `sky status` to see existing clusters. - I 02-11 21:17:43 cloud_vm_ray_backend.py:614] To view detailed progress: tail -n100 -f sky_logs/sky-2022-02-11-21-17-43-171661/provision.log - I 02-11 21:17:43 cloud_vm_ray_backend.py:624] - I 02-11 21:17:43 cloud_vm_ray_backend.py:624] Launching on GCP us-central1 (us-central1-a) - W 02-11 21:17:56 cloud_vm_ray_backend.py:358] Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-a (message: The zone 'projects/intercloud-320520/zones/us-central1-a' does not have enough resources available to fulfill the request. Try a different zone, or try again later.) + + ... + Creating a new cluster: "gpu" [1x GCP(n1-highmem-8, {'V100': 1.0})]. + Tip: to reuse an existing cluster, specify --cluster-name (-c) in the CLI or use sky.launch(.., cluster_name=..) in the Python API. Run `sky status` to see existing clusters. + To view detailed progress: tail -n100 -f sky_logs/sky-2022-02-11-21-17-43-171661/provision.log + + Launching on GCP us-central1 (us-central1-a) + Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-a (message: The zone 'projects/intercloud-320520/zones/us-central1-a' does not have enough resources available to fulfill the request. Try a different zone, or try again later.) + ... 
+ + Launching on GCP us-central1 (us-central1-f) + Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-f (message: The zone 'projects/intercloud-320520/zones/us-central1-f' does not have enough resources available to fulfill the request. Try a different zone, or try again later.) + ... + + Launching on GCP us-west1 (us-west1-a) ... - I 02-11 21:18:24 cloud_vm_ray_backend.py:624] Launching on GCP us-central1 (us-central1-f) - W 02-11 21:18:38 cloud_vm_ray_backend.py:358] Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-f (message: The zone 'projects/intercloud-320520/zones/us-central1-f' does not have enough resources available to fulfill the request. Try a different zone, or try again later.) - I 02-11 21:18:38 cloud_vm_ray_backend.py:624] - I 02-11 21:18:38 cloud_vm_ray_backend.py:624] Launching on GCP us-west1 (us-west1-a) Successfully connected to 35.230.120.87. GCP was chosen as the best cloud to run the task. There was no capacity in any of the regions in US Central, so the auto-failover provisioner moved to US West instead, allowing for our instance to be successfully provisioned. @@ -88,21 +92,24 @@ AWS, where it succeeded after two regions: .. code-block:: console $ sky launch -c v100-8 --gpus V100:8 - ... # optimizer output - I 02-23 16:39:59 cloud_vm_ray_backend.py:1010] Creating a new cluster: "v100-8" [1x GCP(n1-highmem-8, {'V100': 8.0})]. - I 02-23 16:39:59 cloud_vm_ray_backend.py:1010] Tip: to reuse an existing cluster, specify --cluster-name (-c) in the CLI or use sky.launch(.., cluster_name=..) in the Python API. Run `sky status` to see existing clusters. 
- I 02-23 16:39:59 cloud_vm_ray_backend.py:658] To view detailed progress: tail -n100 -f sky_logs/sky-2022-02-23-16-39-58-577551/provision.log - I 02-23 16:39:59 cloud_vm_ray_backend.py:668] - I 02-23 16:39:59 cloud_vm_ray_backend.py:668] Launching on GCP us-central1 (us-central1-a) - W 02-23 16:40:17 cloud_vm_ray_backend.py:403] Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-a (message: The zone 'projects/intercloud-320520/zones/us-central1-a' does not have enough resources available to fulfill the request. Try a different zone, or try again later.) + ... - I 02-23 16:42:15 cloud_vm_ray_backend.py:668] Launching on AWS us-east-2 (us-east-2a,us-east-2b,us-east-2c) - W 02-23 16:42:26 cloud_vm_ray_backend.py:477] Got error(s) in all zones of us-east-2: - W 02-23 16:42:26 cloud_vm_ray_backend.py:479] create_instances: Attempt failed with An error occurred (InsufficientInstanceCapacity) when calling the RunInstances operation (reached max retries: 0): We currently do not have sufficient p3.16xlarge capacity in the Availability Zone you requested (us-east-2a). Our system will be working on provisioning additional capacity. You can currently get p3.16xlarge capacity by not specifying an Availability Zone in your request or choosing us-east-2b., retrying. + Creating a new cluster: "v100-8" [1x GCP(n1-highmem-8, {'V100': 8.0})]. + Tip: to reuse an existing cluster, specify --cluster-name (-c) in the CLI or use sky.launch(.., cluster_name=..) in the Python API. Run `sky status` to see existing clusters. + To view detailed progress: tail -n100 -f sky_logs/sky-2022-02-23-16-39-58-577551/provision.log + + Launching on GCP us-central1 (us-central1-a) + Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-a (message: The zone 'projects/intercloud-320520/zones/us-central1-a' does not have enough resources available to fulfill the request. Try a different zone, or try again later.) + ... 
+ + Launching on AWS us-east-2 (us-east-2a,us-east-2b,us-east-2c) + Got error(s) in all zones of us-east-2: + create_instances: Attempt failed with An error occurred (InsufficientInstanceCapacity) when calling the RunInstances operation (reached max retries: 0): We currently do not have sufficient p3.16xlarge capacity in the Availability Zone you requested (us-east-2a). Our system will be working on provisioning additional capacity. You can currently get p3.16xlarge capacity by not specifying an Availability Zone in your request or choosing us-east-2b., retrying. ... - I 02-23 16:42:26 cloud_vm_ray_backend.py:668] - I 02-23 16:42:26 cloud_vm_ray_backend.py:668] Launching on AWS us-west-2 (us-west-2a,us-west-2b,us-west-2c,us-west-2d) - I 02-23 16:47:04 cloud_vm_ray_backend.py:740] Successfully provisioned or found existing VM. Setup completed. + + Launching on AWS us-west-2 (us-west-2a,us-west-2b,us-west-2c,us-west-2d) + ... + Successfully provisioned or found existing VM. Setup completed. Multiple Candidate GPUs @@ -125,13 +132,13 @@ A10, L4, and A10g GPUs, using :code:`sky launch task.yaml`. $ sky launch task.yaml ... 
- I 11-19 08:07:45 optimizer.py:910] ----------------------------------------------------------------------------------------------------- - I 11-19 08:07:45 optimizer.py:910] CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN - I 11-19 08:07:45 optimizer.py:910] ----------------------------------------------------------------------------------------------------- - I 11-19 08:07:45 optimizer.py:910] Azure Standard_NV6ads_A10_v5 6 55 A10:1 eastus 0.45 ✔ - I 11-19 08:07:45 optimizer.py:910] GCP g2-standard-4 4 16 L4:1 us-east4-a 0.70 - I 11-19 08:07:45 optimizer.py:910] AWS g5.xlarge 4 16 A10G:1 us-east-1 1.01 - I 11-19 08:07:45 optimizer.py:910] ----------------------------------------------------------------------------------------------------- + ----------------------------------------------------------------------------------------------------- + CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN + ----------------------------------------------------------------------------------------------------- + Azure Standard_NV6ads_A10_v5 6 55 A10:1 eastus 0.45 ✔ + GCP g2-standard-4 4 16 L4:1 us-east4-a 0.70 + AWS g5.xlarge 4 16 A10G:1 us-east-1 1.01 + ----------------------------------------------------------------------------------------------------- @@ -212,15 +219,15 @@ This will generate the following output: $ sky launch -c mycluster task.yaml ... 
- I 12-20 23:55:56 optimizer.py:717] - I 12-20 23:55:56 optimizer.py:840] Considered resources (1 node): - I 12-20 23:55:56 optimizer.py:910] --------------------------------------------------------------------------------------------- - I 12-20 23:55:56 optimizer.py:910] CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN - I 12-20 23:55:56 optimizer.py:910] --------------------------------------------------------------------------------------------- - I 12-20 23:55:56 optimizer.py:910] GCP g2-standard-96 96 384 L4:8 us-east4-a 7.98 ✔ - I 12-20 23:55:56 optimizer.py:910] AWS g5.48xlarge 192 768 A10G:8 us-east-1 16.29 - I 12-20 23:55:56 optimizer.py:910] GCP a2-highgpu-8g 96 680 A100:8 us-east1-b 29.39 - I 12-20 23:55:56 optimizer.py:910] AWS p4d.24xlarge 96 1152 A100:8 us-east-1 32.77 - I 12-20 23:55:56 optimizer.py:910] --------------------------------------------------------------------------------------------- - I 12-20 23:55:56 optimizer.py:910] + + Considered resources (1 node): + --------------------------------------------------------------------------------------------- + CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN + --------------------------------------------------------------------------------------------- + GCP g2-standard-96 96 384 L4:8 us-east4-a 7.98 ✔ + AWS g5.48xlarge 192 768 A10G:8 us-east-1 16.29 + GCP a2-highgpu-8g 96 680 A100:8 us-east1-b 29.39 + AWS p4d.24xlarge 96 1152 A100:8 us-east-1 32.77 + --------------------------------------------------------------------------------------------- + Launching a new cluster 'mycluster'. Proceed? [Y/n]: diff --git a/sky/execution.py b/sky/execution.py index 792ca5fffc0..a2419c9ed2f 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -344,7 +344,7 @@ def _execute( # # Disable the usage collection for this status command. 
env = dict(os.environ, - **{env_options.Options.DISABLE_LOGGING.value: '1'}) + **{str(env_options.Options.DISABLE_LOGGING): '1'}) subprocess_utils.run( 'sky status --no-show-managed-jobs --no-show-services', env=env) print() diff --git a/sky/optimizer.py b/sky/optimizer.py index 4326329579d..a4ce4f39f83 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -965,10 +965,10 @@ def _print_candidates(node_to_candidate_map: _TaskToPerCloudCandidates): f'Multiple {cloud} instances satisfy ' f'{acc_name}:{int(acc_count)}. ' f'The cheapest {candidate_list[0]!r} is considered ' - f'among:\n{instance_list}.\n') + f'among:\n{instance_list}.') if is_multi_instances: logger.info( - f'To list more details, run \'sky show-gpus {acc_name}\'.') + f'To list more details, run: sky show-gpus {acc_name}\n') @staticmethod def _optimize_dag( @@ -1101,8 +1101,7 @@ def ordinal_number(n): Optimizer.print_optimized_plan(graph, topo_order, best_plan, total_time, total_cost, node_to_cost_map, minimize_cost) - if not env_options.Options.MINIMIZE_LOGGING.get(): - Optimizer._print_candidates(local_node_to_candidate_map) + Optimizer._print_candidates(local_node_to_candidate_map) return best_plan diff --git a/sky/sky_logging.py b/sky/sky_logging.py index c8a243c72cf..232fc6dd9d5 100644 --- a/sky/sky_logging.py +++ b/sky/sky_logging.py @@ -10,10 +10,11 @@ from sky.utils import env_options from sky.utils import rich_utils -# If the SKYPILOT_MINIMIZE_LOGGING environment variable is set to True, -# remove logging prefixes and unnecessary information in optimizer -_FORMAT = (None if env_options.Options.MINIMIZE_LOGGING.get() else - '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s') +# UX: Should we show logging prefixes and some extra information in optimizer? 
+_show_logging_prefix = (env_options.Options.SHOW_DEBUG_INFO.get() or + not env_options.Options.MINIMIZE_LOGGING.get()) +_FORMAT = ('%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s' + if _show_logging_prefix else None) _DATE_FORMAT = '%m-%d %H:%M:%S' diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 118f9a2b718..9bf12752174 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -380,7 +380,7 @@ def shared_controller_vars_to_fill( 'local_user_config_path': local_user_config_path, } env_vars: Dict[str, str] = { - env.value: '1' for env in env_options.Options if env.get() + str(env): str(int(env.get())) for env in env_options.Options } env_vars.update({ # Should not use $USER here, as that env var can be empty when @@ -388,7 +388,7 @@ def shared_controller_vars_to_fill( constants.USER_ENV_VAR: getpass.getuser(), constants.USER_ID_ENV_VAR: common_utils.get_user_hash(), # Skip cloud identity check to avoid the overhead. - env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.value: '1', + str(env_options.Options.SKIP_CLOUD_IDENTITY_CHECK): '1', }) if skypilot_config.loaded(): # Only set the SKYPILOT_CONFIG env var if the user has a config file. 
diff --git a/sky/utils/env_options.py b/sky/utils/env_options.py index 166bf42ce80..48855e6cbf6 100644 --- a/sky/utils/env_options.py +++ b/sky/utils/env_options.py @@ -5,17 +5,27 @@ class Options(enum.Enum): """Environment variables for SkyPilot.""" - IS_DEVELOPER = 'SKYPILOT_DEV' - SHOW_DEBUG_INFO = 'SKYPILOT_DEBUG' - DISABLE_LOGGING = 'SKYPILOT_DISABLE_USAGE_COLLECTION' - MINIMIZE_LOGGING = 'SKYPILOT_MINIMIZE_LOGGING' + + # (env var name, default value) + IS_DEVELOPER = ('SKYPILOT_DEV', False) + SHOW_DEBUG_INFO = ('SKYPILOT_DEBUG', False) + DISABLE_LOGGING = ('SKYPILOT_DISABLE_USAGE_COLLECTION', False) + MINIMIZE_LOGGING = ('SKYPILOT_MINIMIZE_LOGGING', True) # Internal: this is used to skip the cloud user identity check, which is # used to protect cluster operations in a multi-identity scenario. # Currently, this is only used in the job and serve controller, as there # will not be multiple identities, and skipping the check can increase # robustness. - SKIP_CLOUD_IDENTITY_CHECK = 'SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK' + SKIP_CLOUD_IDENTITY_CHECK = ('SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK', False) + + def __init__(self, env_var: str, default: bool) -> None: + self.env_var = env_var + self.default = default + + def __str__(self) -> str: + return self.env_var - def get(self): + def get(self) -> bool: """Check if an environment variable is set to True.""" - return os.getenv(self.value, 'False').lower() in ('true', '1') + return os.getenv(self.env_var, str(self.default)).lower() in ('true', '1')