Skip to content

Commit 2bf9460

Browse files
fix(gpu): e2e-validated hardening for CUDA prebake
Incorporates fixes surfaced by multi-SKU AgentBaker e2e validation of the prebake feature, and integrates with main's logs_to_events-wrapped GPU install:

- cse_config.sh: only request install-skip-build when the prebake marker's driver_kind matches THIS node's NVIDIA_GPU_DRIVER_TYPE. A CUDA-prebaked marker on a shared VHD must NOT short-circuit a GRID node's install: the GRID image may not support install-skip-build and would fail to stage its userspace files (observed as CSE exit 84 in e2e). Pass the action through the timed installGPUDriverImage wrapper added on main.
- cse_install_ubuntu.sh (cleanUpPrebakedGPUDriver): drop the slow per-version 'dkms remove --all' (~35s on the non-GPU provisioning critical path) in favor of removing the DKMS source tree + built module; also remove the driver userspace BINARIES (nvidia-smi etc.) so a non-GPU node is genuinely driver-free instead of leaving nvidia-smi on PATH erroring on missing libs.
- install-dependencies.sh: install gcc/make/libc6-dev before the build-only bake -- the standard non-GPU VHD builder ships gcc/make but not libc6-dev, so nvidia-installer cannot compile the module without it. The boot-time fallback recompile already gets these via installDeps, so it stays intact.
- specs: cover the driver_kind guard (match -> skip-build, mismatch -> full install) and the faster binaries+marker teardown.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 5283813 commit 2bf9460

5 files changed

Lines changed: 57 additions & 42 deletions

File tree

parts/linux/cloud-init/artifacts/cse_config.sh

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1221,11 +1221,16 @@ configGPUDrivers() {
12211221
mkdir -p /opt/{actions,gpu}
12221222
# When the kernel module was pre-built into the VHD (build-only at image-bake time),
12231223
# a marker is present. Ask aks-gpu to skip the ~100s DKMS recompile and run only the
1224-
# device-dependent steps. aks-gpu independently re-validates the marker (kernel +
1225-
# driver_version + driver_kind) and falls back to a full build on any mismatch, so this
1226-
# is safe even after a kernel upgrade or on a shared VHD with a different driver kind.
1224+
# device-dependent steps -- but ONLY when the marker's driver_kind matches THIS node's
1225+
# driver (NVIDIA_GPU_DRIVER_TYPE). A CUDA-prebaked marker on a GRID node (or vice-versa)
1226+
# must request a full "install": the other driver image may not even support
1227+
# install-skip-build and would fail to stage its userspace files (e.g. /opt/gpu/config.sh).
1228+
# aks-gpu still independently re-validates the marker (kernel + driver_version +
1229+
# driver_kind) and falls back to a full build on any remaining mismatch (e.g. kernel drift).
12271230
GPU_INSTALL_ACTION="install"
1228-
if [ -f "${GPU_DKMS_MARKER_FILE:-/opt/azure/aks-gpu/dkms-marker}" ]; then
1231+
GPU_DKMS_MARKER="${GPU_DKMS_MARKER_FILE:-/opt/azure/aks-gpu/dkms-marker}"
1232+
if [ -f "$GPU_DKMS_MARKER" ] && \
1233+
[ "$(sed -n 's/^driver_kind=//p' "$GPU_DKMS_MARKER" | head -n1)" = "$NVIDIA_GPU_DRIVER_TYPE" ]; then
12291234
GPU_INSTALL_ACTION="install-skip-build"
12301235
fi
12311236
# The driver image is normally pre-pulled into the VHD; only hit the registry when it is

parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -240,17 +240,23 @@ cleanUpPrebakedGPUDriver() {
240240
fi
241241
echo "Removing pre-baked NVIDIA driver inherited from shared VHD on non-GPU node"
242242

243-
# Deregister the nvidia DKMS module so future kernel upgrades stop rebuilding it. Parsing stops
244-
# at the first ',', ':' or space so "nvidia/<ver>: added" and "nvidia/<ver>, <kernel>: installed"
245-
# both yield just <ver>.
246-
if command -v dkms >/dev/null 2>&1; then
247-
dkms status 2>/dev/null | sed -n 's#^nvidia/\([^,: ]*\).*#\1#p' | sort -u | while read -r nvidiaVersion; do
248-
[ -n "${nvidiaVersion}" ] && dkms remove "nvidia/${nvidiaVersion}" --all || true
249-
done
250-
fi
243+
# Deregister the nvidia DKMS module so future kernel upgrades stop rebuilding it, WITHOUT the
244+
# slow `dkms remove --all` (it dominated CSE duration on the non-GPU provisioning path, ~35s).
245+
# Removing the DKMS source tree deregisters it (dkms autoinstall iterates /var/lib/dkms/*), and
246+
# removing the built module reclaims disk. The module is never loaded on a non-GPU node, so no
247+
# depmod/initramfs refresh is required.
251248
rm -rf /var/lib/dkms/nvidia || true
249+
rm -f /lib/modules/*/updates/dkms/nvidia*.ko* 2>/dev/null || true
252250
# aks-gpu relocates the userspace libs under GPU_DEST/lib64; on Ubuntu GPU_DEST=/usr/bin.
253251
rm -rf /usr/bin/lib64 || true
252+
# nvidia-installer also drops driver userspace BINARIES under GPU_DEST (=/usr/bin on Ubuntu).
253+
# Remove them too so a non-GPU node looks genuinely driver-free: otherwise e.g. `nvidia-smi`
254+
# remains on PATH and, with its libs (lib64) gone, errors instead of being "command not found".
255+
for nvidiaBin in nvidia-smi nvidia-debugdump nvidia-persistenced nvidia-cuda-mps-control \
256+
nvidia-cuda-mps-server nvidia-modprobe nvidia-bug-report.sh nvidia-powerd \
257+
nvidia-ngx-updater nvidia-sleep.sh; do
258+
rm -f "/usr/bin/${nvidiaBin}" || true
259+
done
254260
rm -f /etc/ld.so.conf.d/nvidia.conf || true
255261
ldconfig || true
256262
rm -f "${marker}" || true

spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1890,35 +1890,53 @@ SETUP_EOF
18901890

18911891
Describe 'configGPUDrivers'
18921892
# Mock everything the Ubuntu path touches so the test exercises only the
1893-
# marker -> aks-gpu action selection (install vs install-skip-build).
1893+
# marker -> aks-gpu action selection (install vs install-skip-build), including the
1894+
# driver_kind guard (a CUDA-baked marker on a GRID node must NOT skip the build).
1895+
# logs_to_events is mocked to faithfully dispatch the wrapped command (dropping the
1896+
# event-name arg) so the real installGPUDriverImage runs and surfaces the action.
1897+
# Test double for logs_to_events: discards the first argument (the event name)
# and runs the remaining arguments as a command. "$@" is quoted so arguments
# containing whitespace or glob characters reach the wrapped command unchanged
# instead of being re-split or expanded against the working directory.
logs_to_events() {
  shift
  "$@"
}
18941898
# Stub: ignores its arguments and always reports success.
waitForContainerdReady() {
  true
}
18951899
# No-op replacement for mkdir: arguments are ignored, nothing is printed,
# and the call always succeeds.
mkdir() {
  true
}
18961900
# Mock for ctr: prints "ctr " followed by all arguments joined into one string,
# rather than invoking the real binary.
ctr() {
  local joinedArgs="$*"
  echo "ctr ${joinedArgs}"
}
18971901
# Stub for nvidia-modprobe: does nothing and always succeeds.
nvidia-modprobe() {
  true
}
18981902
# Stub for nvidia-smi: does nothing and always succeeds.
nvidia-smi() {
  true
}
18991903
# Stub for ldconfig: does nothing and always succeeds.
ldconfig() {
  true
}
19001904
# Stub predicate: always reports "no" (exit status 1).
isMarinerOrAzureLinux() {
  false
}
1905+
# Stub predicate: always reports "no" (exit status 1).
isAzureLinuxOSGuard() {
  false
}
19011906
# Stub predicate: always reports "no" (exit status 1).
isACL() {
  false
}
19021907
# Stub: ignores its arguments and always reports success.
systemctlEnableAndStart() {
  true
}
19031908
# Stub for systemctl: does nothing and always succeeds.
systemctl() {
  true
}
19041909
# Capture the action passed to the install container.
19051910
# Stub for retrycmd_if_failure: drops the first three arguments and, instead of
# executing the remaining command, echoes it prefixed with "INSTALL_CMD: ".
# Always returns success.
retrycmd_if_failure() {
  shift 3
  local wrappedCommand="$*"
  echo "INSTALL_CMD: ${wrappedCommand}"
  return 0
}
19061911

1907-
BeforeEach 'OS="$UBUNTU_OS_NAME"; NVIDIA_DRIVER_IMAGE="mcr.microsoft.com/aks/aks-gpu-cuda"; NVIDIA_DRIVER_IMAGE_TAG="580.0.0"; CTR_GPU_INSTALL_CMD="ctr-run"; GPU_DKMS_MARKER_FILE="$(mktemp -u)"'
1912+
BeforeEach 'OS="$UBUNTU_OS_NAME"; NVIDIA_GPU_DRIVER_TYPE="cuda"; NVIDIA_DRIVER_IMAGE="mcr.microsoft.com/aks/aks-gpu-cuda"; NVIDIA_DRIVER_IMAGE_TAG="580.0.0"; CTR_GPU_INSTALL_CMD="ctr-run"; GPU_DKMS_MARKER_FILE="$(mktemp -u)"'
19081913

19091914
It 'uses the full install action when no prebake marker is present'
19101915
When call configGPUDrivers
19111916
The output should include "/entrypoint.sh install"
19121917
The output should not include "install-skip-build"
19131918
End
19141919

1915-
It 'uses install-skip-build when the prebake marker is present'
1920+
It 'uses install-skip-build when the prebake marker matches the node driver kind'
19161921
marker="$(mktemp)"
1922+
printf 'driver_kind=cuda\n' > "$marker"
19171923
GPU_DKMS_MARKER_FILE="$marker"
19181924
When call configGPUDrivers
19191925
The output should include "/entrypoint.sh install-skip-build"
19201926
rm -f "$marker"
19211927
End
1928+
1929+
It 'falls back to full install when the marker driver_kind does not match the node (CUDA marker on GRID node)'
1930+
marker="$(mktemp)"
1931+
printf 'driver_kind=cuda\n' > "$marker"
1932+
GPU_DKMS_MARKER_FILE="$marker"
1933+
NVIDIA_GPU_DRIVER_TYPE="grid"
1934+
NVIDIA_DRIVER_IMAGE="mcr.microsoft.com/aks/aks-gpu-grid"
1935+
When call configGPUDrivers
1936+
The output should include "/entrypoint.sh install"
1937+
The output should not include "install-skip-build"
1938+
rm -f "$marker"
1939+
End
19221940
End
19231941

19241942
Describe 'configureManagedGPUExperience'

spec/parts/linux/cloud-init/artifacts/cse_install_ubuntu_spec.sh

Lines changed: 8 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,43 +11,24 @@ Describe 'cse_install_ubuntu.sh'
1111
The output should equal ""
1212
End
1313

14-
It 'deregisters the nvidia DKMS module and removes baked artifacts when the marker is present'
14+
It 'deregisters the nvidia DKMS module and removes baked artifacts (libs, binaries, marker) when present'
1515
marker="$(mktemp)"
1616
GPU_DKMS_MARKER_FILE="${marker}"
17-
# mock dkms so `command -v dkms` succeeds and `dkms status` returns the installed form
18-
dkms() {
19-
if [ "$1" = "status" ]; then
20-
echo "nvidia/580.126.09, 6.8.0-1029-azure, x86_64: installed"
21-
else
22-
echo "mock dkms $*"
23-
fi
24-
}
2517
# Mock for rm: deletes nothing; prints "mock rm " followed by the arguments it
# was called with.
rm() {
  local rmArgs="$*"
  echo "mock rm ${rmArgs}"
}
2618
# Mock for ldconfig: prints a fixed marker line instead of running the real tool.
ldconfig() {
  local marker="mock ldconfig"
  echo "${marker}"
}
2719
When call cleanUpPrebakedGPUDriver
2820
The status should be success
2921
The output should include "Removing pre-baked NVIDIA driver"
30-
The output should include "mock dkms remove nvidia/580.126.09 --all"
22+
# deregisters via the DKMS source tree + built module removal (no slow dkms remove)
3123
The output should include "mock rm -rf /var/lib/dkms/nvidia"
24+
The output should include "mock rm -f /lib/modules"
25+
# relocated userspace libs
3226
The output should include "mock rm -rf /usr/bin/lib64"
27+
# driver userspace binaries so nvidia-smi becomes "command not found" on non-GPU nodes
28+
The output should include "mock rm -f /usr/bin/nvidia-smi"
3329
The output should include "mock ldconfig"
34-
End
35-
36-
It 'parses the bare "nvidia/<ver>: added" dkms status form to a clean version'
37-
marker="$(mktemp)"
38-
GPU_DKMS_MARKER_FILE="${marker}"
39-
dkms() {
40-
if [ "$1" = "status" ]; then
41-
echo "nvidia/570.86.15: added"
42-
else
43-
echo "mock dkms $*"
44-
fi
45-
}
46-
rm() { echo "mock rm $*"; }
47-
ldconfig() { echo "mock ldconfig"; }
48-
When call cleanUpPrebakedGPUDriver
49-
The status should be success
50-
The output should include "mock dkms remove nvidia/570.86.15 --all"
30+
# the slow per-version dkms remove --all must NOT be on the critical path anymore
31+
The output should not include "dkms remove"
5132
End
5233
End
5334
End

vhdbuilder/packer/install-dependencies.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -749,6 +749,11 @@ EOF
749749
# Dropping the image is a separate, deferred size optimization.
750750
if grep -q "NVIDIA_CUDA_PREBAKE" <<< "$FEATURE_FLAGS"; then
751751
echo "Pre-building NVIDIA CUDA kernel module into the VHD (build-only) for kernel $(uname -r)"
752+
# nvidia-installer compiles the kernel module and needs the libc development headers (libc6-dev),
753+
# which the standard (non-GPU) VHD builder image does not ship by default (gcc/make are present
754+
# but libc6-dev is not). Ensure the kernel-module build toolchain before the bake; the boot-time
755+
# fallback path already gets these via installDeps, so the runtime recompile stays intact.
756+
apt_get_install 10 2 300 gcc make libc6-dev || exit 1
752757
CTR_GPU_PREBUILD_CMD="ctr -n k8s.io run --privileged --rm --net-host --with-ns pid:/proc/1/ns/pid --mount type=bind,src=/opt/gpu,dst=/mnt/gpu,options=rbind --mount type=bind,src=/opt/actions,dst=/mnt/actions,options=rbind"
753758
retrycmd_if_failure 3 10 600 bash -c "$CTR_GPU_PREBUILD_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuprebuild /entrypoint.sh build-only" || exit 1
754759
if [ ! -f /opt/azure/aks-gpu/dkms-marker ]; then

0 commit comments

Comments
 (0)