Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Provisioner] Support docker in Lambda Cloud and TPU #4115

Merged
merged 6 commits into from
Oct 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,6 @@ def make_deploy_resources_variables(
runcmd:
- sed -i 's/#Banner none/Banner none/' /etc/ssh/sshd_config
- echo '\\nif [ ! -f "/tmp/__restarted" ]; then\\n sudo systemctl restart ssh\\n sleep 2\\n touch /tmp/__restarted\\nfi' >> /home/skypilot:ssh_user/.bashrc
- usermod -aG docker skypilot:ssh_user
write_files:
- path: /etc/apt/apt.conf.d/20auto-upgrades
content: |
Expand Down
3 changes: 3 additions & 0 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,9 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
'runtime_version']
resources_vars['tpu_node_name'] = r.accelerator_args.get(
'tpu_name')
# TPU VMs require privileged mode for docker containers to
# access TPU devices.
resources_vars['docker_run_options'] = ['--privileged']
else:
# Convert to GCP names:
# https://cloud.google.com/compute/docs/gpus
Expand Down
14 changes: 9 additions & 5 deletions sky/clouds/lambda_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,6 @@ class Lambda(clouds.Cloud):
_CLOUD_UNSUPPORTED_FEATURES = {
clouds.CloudImplementationFeatures.STOP: 'Lambda cloud does not support stopping VMs.',
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: f'Migrating disk is currently not supported on {_REPR}.',
clouds.CloudImplementationFeatures.DOCKER_IMAGE: (
f'Docker image is currently not supported on {_REPR}. '
'You can try running docker command inside the `run` section in task.yaml.'
),
clouds.CloudImplementationFeatures.SPOT_INSTANCE: f'Spot instances are not supported in {_REPR}.',
clouds.CloudImplementationFeatures.IMAGE_ID: f'Specifying image ID is not supported in {_REPR}.',
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: f'Custom disk tiers are not supported in {_REPR}.',
Expand Down Expand Up @@ -173,12 +169,20 @@ def make_deploy_resources_variables(
else:
custom_resources = None

return {
resources_vars = {
'instance_type': resources.instance_type,
'custom_resources': custom_resources,
'region': region.name,
}

if acc_dict is not None:
# Lambda cloud's docker runtime information does not contain
# 'nvidia-container-runtime', so no GPU option is added to
# the docker run command. We patch this by adding it here.
resources_vars['docker_run_options'] = ['--gpus all']

return resources_vars

def _get_feasible_launchable_resources(
self, resources: 'resources_lib.Resources'
) -> 'resources_utils.FeasibleResources':
Expand Down
19 changes: 12 additions & 7 deletions sky/provision/docker_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,12 +253,13 @@ def initialize(self) -> str:
# issue with nvidia container toolkit:
# https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
self._run(
'[ -f /etc/docker/daemon.json ] || '
'{ which jq || sudo apt update && sudo apt install -y jq; } && '
'{ [ -f /etc/docker/daemon.json ] || '
'echo "{}" | sudo tee /etc/docker/daemon.json;'
'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' '
'/etc/docker/daemon.json > /tmp/daemon.json;'
'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
'sudo systemctl restart docker')
'sudo systemctl restart docker; } || true')
user_docker_run_options = self.docker_config.get('run_options', [])
start_command = docker_start_cmds(
specific_image,
Expand Down Expand Up @@ -335,7 +336,11 @@ def initialize(self) -> str:

def _check_docker_installed(self):
no_exist = 'NoExist'
# SkyPilot: Add the current user to the docker group first (if needed),
# before checking if docker is installed to avoid permission issues.
cleaned_output = self._run(
'id -nG $USER | grep -qw docker || '
'sudo usermod -aG docker $USER > /dev/null 2>&1;'
f'command -v {self.docker_cmd} || echo {no_exist!r}')
if no_exist in cleaned_output or 'docker' not in cleaned_output:
logger.error(
Expand Down Expand Up @@ -424,8 +429,8 @@ def _auto_configure_shm(self, run_options: List[str]) -> List[str]:
def _check_container_exited(self) -> bool:
if self.initialized:
return True
output = (self._run(check_docker_running_cmd(self.container_name,
self.docker_cmd),
wait_for_docker_daemon=True))
return 'false' in output.lower(
) and 'no such object' not in output.lower()
output = self._run(check_docker_running_cmd(self.container_name,
self.docker_cmd),
wait_for_docker_daemon=True)
return ('false' in output.lower() and
'no such object' not in output.lower())
2 changes: 2 additions & 0 deletions sky/provision/paperspace/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ def set_sky_key_script(self, public_key: str) -> None:
'apt-get update \n'
'apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin \n' # pylint: disable=line-too-long
'fi \n'
# TODO(tian): Maybe remove this as well since we are now adding
# users to docker group in the DockerInitializer. Need to test.
'usermod -aG docker paperspace \n'
f'echo "{public_key}" >> /home/paperspace/.ssh/authorized_keys \n')
try:
Expand Down
23 changes: 11 additions & 12 deletions sky/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -842,12 +842,6 @@ def _try_validate_image_id(self) -> None:

if self.extract_docker_image() is not None:
# TODO(tian): validate the docker image exists / of reasonable size
if self.accelerators is not None:
for acc in self.accelerators.keys():
if acc.lower().startswith('tpu'):
with ux_utils.print_exception_no_traceback():
raise ValueError(
'Docker image is not supported for TPU VM.')
if self.cloud is not None:
self.cloud.check_features_are_supported(
self, {clouds.CloudImplementationFeatures.DOCKER_IMAGE})
Expand Down Expand Up @@ -1032,25 +1026,30 @@ def make_deploy_variables(self, cluster_name: resources_utils.ClusterName,
self.accelerators is not None):
initial_setup_commands = [constants.DISABLE_GPU_ECC_COMMAND]

docker_image = self.extract_docker_image()

# Cloud specific variables
cloud_specific_variables = self.cloud.make_deploy_resources_variables(
self, cluster_name, region, zones, dryrun)

# Docker run options
docker_run_options = skypilot_config.get_nested(
('docker', 'run_options'),
default_value=[],
override_configs=self.cluster_config_overrides)
if isinstance(docker_run_options, str):
docker_run_options = [docker_run_options]
# Special accelerator runtimes might require additional docker run
# options, e.g., for TPU, we need --privileged.
if 'docker_run_options' in cloud_specific_variables:
docker_run_options.extend(
cloud_specific_variables['docker_run_options'])
if docker_run_options and isinstance(self.cloud, clouds.Kubernetes):
logger.warning(
f'{colorama.Style.DIM}Docker run options are specified, '
'but ignored for Kubernetes: '
f'{" ".join(docker_run_options)}'
f'{colorama.Style.RESET_ALL}')

docker_image = self.extract_docker_image()

# Cloud specific variables
cloud_specific_variables = self.cloud.make_deploy_resources_variables(
self, cluster_name, region, zones, dryrun)
return dict(
cloud_specific_variables,
**{
Expand Down
20 changes: 20 additions & 0 deletions sky/templates/lambda-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,26 @@ max_workers: {{num_nodes - 1}}
upscaling_speed: {{num_nodes - 1}}
idle_timeout_minutes: 60

{%- if docker_image is not none %}
docker:
image: {{docker_image}}
container_name: {{docker_container_name}}
run_options:
- --ulimit nofile=1048576:1048576
{%- for run_option in docker_run_options %}
- {{run_option}}
{%- endfor %}
{%- if docker_login_config is not none %}
docker_login_config:
username: |-
{{docker_login_config.username}}
password: |-
{{docker_login_config.password}}
server: |-
{{docker_login_config.server}}
{%- endif %}
{%- endif %}

provider:
type: external
module: sky.provision.lambda
Expand Down
4 changes: 3 additions & 1 deletion sky/utils/command_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,8 +502,10 @@ def close_cached_connection(self) -> None:
if self.ssh_control_name is not None:
control_path = _ssh_control_path(self.ssh_control_name)
if control_path is not None:
# Suppress the `Exit request sent.` output for this command,
# which would interrupt the CLI spinner.
cmd = (f'ssh -O exit -S {control_path}/%C '
f'{self.ssh_user}@{self.ip}')
f'{self.ssh_user}@{self.ip} > /dev/null 2>&1')
logger.debug(f'Closing cached connection {control_path!r} with '
f'cmd: {cmd}')
log_lib.run_with_log(cmd,
Expand Down
Loading