diff --git a/src/dstack/_internal/core/backends/lambdalabs/compute.py b/src/dstack/_internal/core/backends/lambdalabs/compute.py index 9f800790f0..ca49c1c077 100644 --- a/src/dstack/_internal/core/backends/lambdalabs/compute.py +++ b/src/dstack/_internal/core/backends/lambdalabs/compute.py @@ -179,13 +179,18 @@ def _setup_instance( ssh_private_key: str, ): setup_commands = ( - "mkdir /home/ubuntu/.dstack && " - "sudo apt-get update && " - "sudo apt-get install -y --no-install-recommends nvidia-container-toolkit && " - "sudo nvidia-ctk runtime configure --runtime=docker && " - "sudo pkill -SIGHUP dockerd" + "mkdir /home/ubuntu/.dstack", + "sudo apt-get update", + "sudo apt-get install -y --no-install-recommends nvidia-container-toolkit", + "sudo install -d -m 0755 /etc/docker", + # Workaround for https://github.com/NVIDIA/nvidia-container-toolkit/issues/48 + """echo '{"exec-opts":["native.cgroupdriver=cgroupfs"]}' | sudo tee /etc/docker/daemon.json""", + "sudo nvidia-ctk runtime configure --runtime=docker", + "sudo systemctl restart docker.service", # `systemctl reload` (`kill -HUP`) won't work + ) + _run_ssh_command( + hostname=hostname, ssh_private_key=ssh_private_key, command=" && ".join(setup_commands) ) - _run_ssh_command(hostname=hostname, ssh_private_key=ssh_private_key, command=setup_commands) def _launch_runner(