@@ -14,56 +14,74 @@ with lib; let
1414 ] ;
1515 } ;
1616
17- # Generic CDI Plugin DaemonSet for GPU resource allocation
18- generic-cdi-plugin-manifest = pkgs . writeText "generic-cdi-plugin.yaml" ''
17+ nvidia-device-plugin-version = "v0.19.1" ;
18+
19+ nvidia-device-plugin-manifest = pkgs . writeText "nvidia-device-plugin.yaml" ''
20+ apiVersion: node.k8s.io/v1
21+ handler: nvidia
22+ kind: RuntimeClass
23+ metadata:
24+ name: nvidia
25+ labels:
26+ app.kubernetes.io/component: gpu-operator
27+ ---
1928 apiVersion: apps/v1
2029 kind: DaemonSet
2130 metadata:
22- name: generic-cdi-plugin
31+ name: nvidia-device-plugin-daemonset
2332 namespace: kube-system
2433 labels:
25- app: generic-cdi-plugin
34+ app.kubernetes.io/name: nvidia-device-plugin
2635 spec:
2736 selector:
2837 matchLabels:
29- app: generic-cdi-plugin
38+ app.kubernetes.io/name: nvidia-device-plugin
39+ updateStrategy:
40+ type: RollingUpdate
3041 template:
3142 metadata:
3243 labels:
33- app: generic-cdi-plugin
44+ app.kubernetes.io/name: nvidia-device-plugin
3445 spec:
46+ runtimeClassName: nvidia
47+ priorityClassName: system-node-critical
3548 nodeSelector:
36- nixos-nvidia-cdi: "enabled"
49+ nvidia.com/gpu.present: "true"
3750 tolerations:
3851 - key: nvidia.com/gpu
3952 operator: Exists
4053 effect: NoSchedule
4154 containers:
42- - name: generic-cdi-plugin
43- image: ghcr.io/olfillasodikno/generic-cdi-plugin:main
44- imagePullPolicy: Always
45- args:
46- - "/var/run/cdi/nvidia-container-toolkit.json"
55+ - name: nvidia-device-plugin-ctr
56+ image: nvcr.io/nvidia/k8s-device-plugin:${ nvidia-device-plugin-version }
57+ imagePullPolicy: IfNotPresent
58+ command: ["nvidia-device-plugin"]
59+ env:
60+ - name: DEVICE_ID_STRATEGY
61+ value: uuid
62+ - name: NVIDIA_VISIBLE_DEVICES
63+ value: all
64+ - name: NVIDIA_DRIVER_CAPABILITIES
65+ value: compute,utility
4766 securityContext:
48- privileged: true
67+ allowPrivilegeEscalation: false
68+ capabilities:
69+ drop: ["ALL"]
4970 volumeMounts:
50- - name: device-plugin
71+ - name: kubelet-device-plugins-dir
5172 mountPath: /var/lib/kubelet/device-plugins
52- - name: pod-resources
53- mountPath: /var/lib/kubelet/pod-resources
5473 - name: cdi-specs
5574 mountPath: /var/run/cdi
5675 readOnly: true
5776 volumes:
58- - name: device-plugin
77+ - name: kubelet-device-plugins-dir
5978 hostPath:
6079 path: /var/lib/kubelet/device-plugins
61- - name: pod-resources
62- hostPath:
63- path: /var/lib/kubelet/pod-resources
80+ type: Directory
6481 - name: cdi-specs
6582 hostPath:
6683 path: /var/run/cdi
84+ type: DirectoryOrCreate
6785 '' ;
6886
6987 # Test pod to verify GPU access
@@ -75,13 +93,14 @@ with lib; let
7593 namespace: default
7694 spec:
7795 restartPolicy: Never
96+ runtimeClassName: nvidia
7897 containers:
7998 - name: cuda-test
8099 image: nvidia/cuda:12.6.3-base-ubuntu24.04
81100 command: ["nvidia-smi"]
82101 resources:
83102 limits:
84- nvidia.com/gpu-all : 1
103+ nvidia.com/gpu: 1
85104 '' ;
86105 in {
87106 options = {
97116
98117 config = mkIf cfg . enable {
99118 # NVIDIA container toolkit for CDI spec generation
100- hardware . nvidia-container-toolkit . enable = true ;
119+ hardware . nvidia-container-toolkit = {
120+ enable = true ;
121+ device-name-strategy = "uuid" ;
122+ mount-nvidia-executables = true ;
123+ } ;
101124
102125 # Ensure CDI generator has access to nvidia libs
103126 systemd . services . nvidia-container-toolkit-cdi-generator = {
@@ -124,12 +147,15 @@ in {
124147 '' ;
125148 } ;
126149
127- extraFlags = [
128- "--node-label=nixos-nvidia-cdi=enabled"
129- "--tls-san=${ config . networking . hostName }"
130- "--tls-san=${ config . networking . hostName }.local"
131- "--tls-san=localhost"
132- ] ++ cfg . extraFlags ;
150+ extraFlags =
151+ [
152+ "--node-label=nixos-nvidia-cdi=enabled"
153+ "--node-label=nvidia.com/gpu.present=true"
154+ "--tls-san=${ config . networking . hostName }"
155+ "--tls-san=${ config . networking . hostName }.local"
156+ "--tls-san=localhost"
157+ ]
158+ ++ cfg . extraFlags ;
133159
134160 # Containerd config with CDI support
135161 # k3s 1.31+ with containerd 2.0 has CDI enabled by default
@@ -138,10 +164,13 @@ in {
138164 {{ template "base" . }}
139165
140166 [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
167+ privileged_without_host_devices = false
168+ runtime_engine = ""
169+ runtime_root = ""
141170 runtime_type = "io.containerd.runc.v2"
142171
143172 [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
144- BinaryName = "/run/current-system/sw/bin/nvidia-container-runtime.cdi"
173+ BinaryName = "${ lib . getOutput "tools" config . hardware . nvidia-container-toolkit . package }/bin/nvidia-container-runtime.cdi"
145174 '' ;
146175
147176 gracefulNodeShutdown . enable = true ;
@@ -160,13 +189,13 @@ in {
160189 export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
161190 '' ;
162191
163- # Create systemd service to deploy the generic-cdi-plugin after k3s is ready
192+ # Create systemd service to deploy the NVIDIA device plugin after k3s is ready
164193 systemd . services . k3s-gpu-plugin-deploy = {
165- description = "Deploy generic-cdi-plugin to k3s" ;
166- after = [ "k3s.service" ] ;
167- wants = [ "k3s.service" ] ;
168- wantedBy = [ "multi-user.target" ] ;
169- path = [ pkgs . kubectl pkgs . coreutils ] ;
194+ description = "Deploy NVIDIA device plugin to k3s" ;
195+ after = [ "k3s.service" ] ;
196+ wants = [ "k3s.service" ] ;
197+ wantedBy = [ "multi-user.target" ] ;
198+ path = [ pkgs . kubectl pkgs . coreutils ] ;
170199 serviceConfig = {
171200 Type = "oneshot" ;
172201 RemainAfterExit = true ;
@@ -183,23 +212,22 @@ in {
183212 sleep 5
184213 done
185214
186- # Check if plugin already exists
187215 if kubectl get daemonset -n kube-system generic-cdi-plugin &>/dev/null; then
188- echo "generic-cdi-plugin already deployed, updating..."
189- kubectl apply -f ${ generic-cdi-plugin-manifest }
190- else
191- echo "Deploying generic-cdi-plugin..."
192- kubectl apply -f ${ generic-cdi-plugin-manifest }
216+ echo "Removing old generic-cdi-plugin deployment..."
217+ kubectl delete daemonset -n kube-system generic-cdi-plugin --ignore-not-found=true
193218 fi
194219
195- echo "Waiting for generic-cdi-plugin to be ready..."
196- kubectl rollout status daemonset/generic-cdi-plugin -n kube-system --timeout=120s || true
220+ echo "Deploying NVIDIA device plugin..."
221+ kubectl apply -f ${ nvidia-device-plugin-manifest }
222+
223+ echo "Waiting for NVIDIA device plugin to be ready..."
224+ kubectl rollout status daemonset/nvidia-device-plugin-daemonset -n kube-system --timeout=120s || true
197225 '' ;
198226 } ;
199227 } ;
200228
201229 # Store test manifests in /etc for easy access
202230 environment . etc . "k3s/gpu-test-pod.yaml" . source = gpu-test-pod ;
203- environment . etc . "k3s/generic-cdi-plugin.yaml" . source = generic-cdi-plugin-manifest;
231+ environment . etc . "k3s/nvidia-device-plugin.yaml" . source = nvidia-device-plugin-manifest;
204232 } ;
205233}
0 commit comments