Skip to content

Commit 66ffdd1

Browse files
authored
Fix NVIDIA container toolkit bug in all backends (#2877)
Use cgroupfs as a Docker cgroup driver on all backends by default to work around an NVIDIA container toolkit bug where the container loses access to the GPU. The patch to `/etc/docker/daemon.json` is automatically applied in all VM-based backends if `/etc/docker/daemon.json` exists, has the NVIDIA runtime, does not explicitly set another cgroup driver, and `jq` is installed. This is not the case in Nebius, Lambda, and CUDO, so they still need custom code to apply the workaround - either installing `jq` or just writing a hardcoded `/etc/docker/daemon.json` that is known to work on this backend.
1 parent f5c8fcf commit 66ffdd1

File tree

3 files changed

+44
-11
lines changed

3 files changed

+44
-11
lines changed

src/dstack/_internal/core/backends/base/compute.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -559,7 +559,8 @@ def get_shim_commands(
559559
backend_shim_env: Optional[Dict[str, str]] = None,
560560
arch: Optional[str] = None,
561561
) -> List[str]:
562-
commands = get_shim_pre_start_commands(
562+
commands = get_setup_cloud_instance_commands()
563+
commands += get_shim_pre_start_commands(
563564
base_path=base_path,
564565
bin_path=bin_path,
565566
arch=arch,
@@ -641,6 +642,23 @@ def get_dstack_shim_download_url(arch: Optional[str] = None) -> str:
641642
return url_template.format(version=version, arch=arch)
642643

643644

645+
def get_setup_cloud_instance_commands() -> list[str]:
646+
return [
647+
# Workaround for https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
648+
# Attempts to patch /etc/docker/daemon.json while keeping any custom settings it may have.
649+
(
650+
"/bin/sh -c '" # wrap in /bin/sh to avoid interfering with other cloud init commands
651+
" grep -q nvidia /etc/docker/daemon.json"
652+
" && ! grep -q native.cgroupdriver /etc/docker/daemon.json"
653+
" && jq '\\''.\"exec-opts\" = ((.\"exec-opts\" // []) + [\"native.cgroupdriver=cgroupfs\"])'\\'' /etc/docker/daemon.json > /tmp/daemon.json"
654+
" && sudo mv /tmp/daemon.json /etc/docker/daemon.json"
655+
" && sudo service docker restart"
656+
" || true"
657+
"'"
658+
),
659+
]
660+
661+
644662
def get_shim_pre_start_commands(
645663
base_path: Optional[PathLike] = None,
646664
bin_path: Optional[PathLike] = None,

src/dstack/_internal/core/backends/cudo/compute.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -65,12 +65,13 @@ def create_instance(
6565
public_keys = instance_config.get_public_keys()
6666
memory_size = round(instance_offer.instance.resources.memory_mib / 1024)
6767
disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
68-
commands = get_shim_commands(authorized_keys=public_keys)
6968
gpus_no = len(instance_offer.instance.resources.gpus)
70-
shim_commands = " ".join([" && ".join(commands)])
71-
startup_script = (
72-
shim_commands if gpus_no > 0 else f"{install_docker_script()} && {shim_commands}"
73-
)
69+
if gpus_no > 0:
70+
# we'll need jq for patching /etc/docker/daemon.json, see get_shim_commands()
71+
commands = install_jq_commands()
72+
else:
73+
commands = install_docker_commands()
74+
commands += get_shim_commands(authorized_keys=public_keys)
7475

7576
try:
7677
resp_data = self.api_client.create_virtual_machine(
@@ -85,7 +86,7 @@ def create_instance(
8586
memory_gib=memory_size,
8687
vcpus=instance_offer.instance.resources.cpus,
8788
vm_id=vm_id,
88-
start_script=startup_script,
89+
start_script=" && ".join(commands),
8990
password=None,
9091
customSshKeys=public_keys,
9192
)
@@ -151,6 +152,19 @@ def _get_image_id(cuda: bool) -> str:
151152
return image_name
152153

153154

154-
def install_docker_script():
155-
commands = 'export DEBIAN_FRONTEND="noninteractive" && mkdir -p /etc/apt/keyrings && curl --max-time 60 -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null && apt-get update && apt-get --assume-yes install docker-ce docker-ce-cli containerd.io docker-compose-plugin'
156-
return commands
155+
def install_jq_commands():
156+
return [
157+
"export DEBIAN_FRONTEND=noninteractive",
158+
"apt-get --assume-yes install jq",
159+
]
160+
161+
162+
def install_docker_commands():
163+
return [
164+
"export DEBIAN_FRONTEND=noninteractive",
165+
"mkdir -p /etc/apt/keyrings",
166+
"curl --max-time 60 -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg",
167+
'echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null',
168+
"apt-get update",
169+
"apt-get --assume-yes install docker-ce docker-ce-cli containerd.io docker-compose-plugin",
170+
]

src/dstack/_internal/core/backends/lambdalabs/compute.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import hashlib
2+
import shlex
23
import subprocess
34
import tempfile
45
from threading import Thread
@@ -98,7 +99,7 @@ def update_provisioning_data(
9899
arch=provisioning_data.instance_type.resources.cpu_arch,
99100
)
100101
# shim is assumed to be run under root
101-
launch_command = "sudo sh -c '" + "&& ".join(commands) + "'"
102+
launch_command = "sudo sh -c " + shlex.quote(" && ".join(commands))
102103
thread = Thread(
103104
target=_start_runner,
104105
kwargs={

0 commit comments

Comments
 (0)