Skip to content

Commit b29c55e

Browse files
Update NVIDIA driver in dstack OS images (#3099)
* Update NVIDIA driver in dstack OS images

  Update the driver to support NVIDIA B200.

  - Update from the 535 to the 570 family.
  - Update to Ubuntu 24.04, since Ubuntu 22.04 does not have the gcc version required for building the 570 driver.
  - Switch from proprietary to open kernel modules.
  - Since pre-Turing GPUs aren't supported by NVIDIA open kernel modules, conditionally choose between old and new dstack OS images based on the GPU name.
  - Adjust handling of `apt` race conditions - the existing hack did not work on OCI's Ubuntu 24.04.
  - Install `ufw` when building the image - it is missing in OCI's Ubuntu 24.04.

* Fix tests

* [Feature]: GCP A4 instances #3088
  Bumped `base_image` to `0.11rc2`

* [Feature]: GCP A4 instances #3088
  Updated tests

---------

Co-authored-by: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com>
Co-authored-by: peterschmidt85 <andrey.cheptsov@gmail.com>
1 parent 30ff069 commit b29c55e

File tree

26 files changed

+153
-74
lines changed

26 files changed

+153
-74
lines changed

scripts/packer/aws-image-cuda.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"aws_secret_key": "{{env `AWS_SECRET_ACCESS_KEY`}}",
55
"region": "eu-west-1",
66
"ssh_username": "ubuntu",
7-
"base_ami": "ami-0cffefff2d52e0a23",
7+
"base_ami": "ami-0bc691261a82b32bc",
88
"instance_type": "c5.large",
99
"subnet_id": "subnet-c39cb6a5",
1010
"docker_version": "",

scripts/packer/aws-image.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"aws_secret_key": "{{env `AWS_SECRET_ACCESS_KEY`}}",
55
"region": "eu-west-1",
66
"ssh_username": "ubuntu",
7-
"base_ami": "ami-0cffefff2d52e0a23",
7+
"base_ami": "ami-0bc691261a82b32bc",
88
"instance_type": "c5.large",
99
"subnet_id": "subnet-c39cb6a5",
1010
"docker_version": "",

scripts/packer/azure-image-cuda.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@
2323
"managed_image_name": "{{user `build_prefix`}}dstack-cuda-{{user `image_version` | clean_resource_name}}",
2424
"os_type": "Linux",
2525
"image_publisher": "canonical",
26-
"image_offer": "0001-com-ubuntu-server-jammy",
27-
"image_sku": "22_04-lts-gen2",
26+
"image_offer": "ubuntu-24_04-lts",
27+
"image_sku": "server",
2828
"azure_tags": {
2929
"Name": "DSTACK-CUDA"
3030
},

scripts/packer/azure-image-grid.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222
"managed_image_name": "{{user `build_prefix`}}dstack-grid-{{user `image_version` | clean_resource_name}}",
2323
"os_type": "Linux",
2424
"image_publisher": "canonical",
25-
"image_offer": "0001-com-ubuntu-server-jammy",
26-
"image_sku": "22_04-lts-gen2",
25+
"image_offer": "ubuntu-24_04-lts",
26+
"image_sku": "server",
2727
"azure_tags": {
2828
"Name": "DSTACK-GRID"
2929
},

scripts/packer/azure-image.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222
"managed_image_name": "{{user `build_prefix`}}dstack-{{user `image_version` | clean_resource_name}}",
2323
"os_type": "Linux",
2424
"image_publisher": "canonical",
25-
"image_offer": "0001-com-ubuntu-server-jammy",
26-
"image_sku": "22_04-lts-gen2",
25+
"image_offer": "ubuntu-24_04-lts",
26+
"image_sku": "server",
2727
"azure_tags": {
2828
"Name": "DSTACK"
2929
},

scripts/packer/gcp-image-cuda.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
{
1111
"type": "googlecompute",
1212
"project_id": "dstack",
13-
"source_image": "ubuntu-2204-jammy-v20230714",
13+
"source_image": "ubuntu-2404-noble-amd64-v20250828",
1414
"image_name": "{{user `build_prefix`}}dstack-cuda-{{user `image_version` | clean_resource_name}}",
1515
"instance_name": "{{user `build_prefix`}}dstack-cuda-{{user `image_version` | clean_resource_name}}",
1616
"ssh_username": "ubuntu",

scripts/packer/gcp-image.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
{
1010
"type": "googlecompute",
1111
"project_id": "dstack",
12-
"source_image": "ubuntu-2204-jammy-v20230714",
12+
"source_image": "ubuntu-2404-noble-amd64-v20250828",
1313
"image_name": "{{user `build_prefix`}}dstack-{{user `image_version` | clean_resource_name}}",
1414
"instance_name": "{{user `build_prefix`}}dstack-{{user `image_version` | clean_resource_name}}",
1515
"ssh_username": "ubuntu",

scripts/packer/oci-image-cuda.json

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
"compartment_ocid": "{{user `oci_compartment_ocid`}}",
1717
"subnet_ocid": "{{user `oci_subnet_ocid`}}",
1818
"shape": "VM.Standard2.1",
19-
"base_image_ocid": "ocid1.image.oc1.eu-frankfurt-1.aaaaaaaaxroekfbow3kdrdjlwao6tsxxfcb23xmqrdjtjcay2ow52eijvzqa",
19+
"base_image_ocid": "ocid1.image.oc1.eu-frankfurt-1.aaaaaaaahelib4o7g4fsjgck2lhxjmzonvbniwcmjjn2im4cxlksjgyzw5gq",
2020
"image_name": "{{user `build_prefix`}}dstack-cuda-{{user `image_version`}}",
2121
"instance_name": "packer-{{user `build_prefix`}}dstack-cuda-{{user `image_version`}}",
2222
"ssh_username": "ubuntu"
@@ -27,10 +27,6 @@
2727
"type": "shell",
2828
"inline": ["cloud-init status --long --wait"]
2929
},
30-
{
31-
"type": "shell",
32-
"script": "provisioners/wait-for-dpkg-lock.sh"
33-
},
3430
{
3531
"type": "shell",
3632
"scripts": [

scripts/packer/oci-image.json

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
"compartment_ocid": "{{user `oci_compartment_ocid`}}",
1616
"subnet_ocid": "{{user `oci_subnet_ocid`}}",
1717
"shape": "VM.Standard2.1",
18-
"base_image_ocid": "ocid1.image.oc1.eu-frankfurt-1.aaaaaaaaxroekfbow3kdrdjlwao6tsxxfcb23xmqrdjtjcay2ow52eijvzqa",
18+
"base_image_ocid": "ocid1.image.oc1.eu-frankfurt-1.aaaaaaaahelib4o7g4fsjgck2lhxjmzonvbniwcmjjn2im4cxlksjgyzw5gq",
1919
"image_name": "{{user `build_prefix`}}dstack-{{user `image_version`}}",
2020
"instance_name": "packer-{{user `build_prefix`}}dstack-{{user `image_version`}}",
2121
"ssh_username": "ubuntu"
@@ -26,10 +26,6 @@
2626
"type": "shell",
2727
"inline": ["cloud-init status --long --wait"]
2828
},
29-
{
30-
"type": "shell",
31-
"script": "provisioners/wait-for-dpkg-lock.sh"
32-
},
3329
{
3430
"type": "shell",
3531
"scripts": [

scripts/packer/provisioners/cuda.sh

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,13 @@ ARCH=$(uname -m)
1010
CUDA_DISTRO=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.//g')
1111

1212
# based on https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#ubuntu-lts
13-
wget https://developer.download.nvidia.com/compute/cuda/repos/$CUDA_DISTRO/$ARCH/cuda-keyring_1.0-1_all.deb
14-
sudo dpkg -i cuda-keyring_1.0-1_all.deb
15-
rm cuda-keyring_1.0-1_all.deb
13+
wget https://developer.download.nvidia.com/compute/cuda/repos/$CUDA_DISTRO/$ARCH/cuda-keyring_1.1-1_all.deb
14+
sudo dpkg -i cuda-keyring_1.1-1_all.deb
15+
rm cuda-keyring_1.1-1_all.deb
1616

1717
sudo apt-get update
18-
# Pinned dkms due to https://bugs.launchpad.net/ubuntu/+source/dkms/+bug/2112114
1918
sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
20-
dkms=2.8.7-2ubuntu2.2 \
21-
cuda-drivers-$CUDA_DRIVERS_VERSION \
19+
nvidia-driver-$CUDA_DRIVERS_VERSION-server-open \
2220
nvidia-fabricmanager-$CUDA_DRIVERS_VERSION \
2321
datacenter-gpu-manager-4-core datacenter-gpu-manager-4-proprietary datacenter-gpu-manager-exporter
2422
sudo systemctl enable nvidia-fabricmanager

0 commit comments

Comments
 (0)