From 302f40ebfcdd234865f545c60cb28f9bb614e6c5 Mon Sep 17 00:00:00 2001 From: hydazz Date: Fri, 17 Oct 2025 14:56:42 +0200 Subject: [PATCH 1/2] add talos support Signed-off-by: hydazz --- cmd/compute-domain-kubelet-plugin/cdi.go | 10 ++++++++++ cmd/compute-domain-kubelet-plugin/root.go | 2 ++ cmd/gpu-kubelet-plugin/cdi.go | 10 ++++++++++ cmd/gpu-kubelet-plugin/root.go | 2 ++ .../nvidia-dra-driver-gpu/templates/kubeletplugin.yaml | 10 +++++----- deployments/helm/nvidia-dra-driver-gpu/values.yaml | 4 ++++ hack/kubelet-plugin-prestart.sh | 2 ++ 7 files changed, 35 insertions(+), 5 deletions(-) diff --git a/cmd/compute-domain-kubelet-plugin/cdi.go b/cmd/compute-domain-kubelet-plugin/cdi.go index 95be15a72..72d93dd5e 100644 --- a/cmd/compute-domain-kubelet-plugin/cdi.go +++ b/cmd/compute-domain-kubelet-plugin/cdi.go @@ -46,6 +46,14 @@ const ( defaultCDIRoot = "/var/run/cdi" ) +func getTalosLibrarySearchPaths() []string { + return []string{ + "/driver-root/usr/local/glibc/usr/lib", + "/driver-root/usr/local/glibc/lib", + "/driver-root/usr/local/glibc/lib64", + } +} + type CDIHandler struct { logger *logrus.Logger nvml nvml.Interface @@ -103,6 +111,7 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) { nvcdi.WithVendor(h.vendor), nvcdi.WithClass(h.deviceClass), nvcdi.WithNVIDIACDIHookPath(h.nvidiaCDIHookPath), + nvcdi.WithLibrarySearchPaths(getTalosLibrarySearchPaths()), ) if err != nil { return nil, fmt.Errorf("unable to create CDI library for devices: %w", err) @@ -120,6 +129,7 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) { nvcdi.WithVendor(h.vendor), nvcdi.WithClass(h.claimClass), nvcdi.WithNVIDIACDIHookPath(h.nvidiaCDIHookPath), + nvcdi.WithLibrarySearchPaths(getTalosLibrarySearchPaths()), ) if err != nil { return nil, fmt.Errorf("unable to create CDI library for claims: %w", err) diff --git a/cmd/compute-domain-kubelet-plugin/root.go b/cmd/compute-domain-kubelet-plugin/root.go index 6079c78db..450455adf 100644 --- a/cmd/compute-domain-kubelet-plugin/root.go +++ b/cmd/compute-domain-kubelet-plugin/root.go @@ -34,6 +34,7 @@ func (r root) getDriverLibraryPath() (string, error) { "/lib64", "/lib/x86_64-linux-gnu", "/lib/aarch64-linux-gnu", + "/usr/local/glibc/usr/lib", } libraryPath, err := r.findFile("libnvidia-ml.so.1", librarySearchPaths...) @@ -52,6 +53,7 @@ func (r root) getNvidiaSMIPath() (string, error) { "/usr/sbin", "/bin", "/sbin", + "/usr/local/bin", } binaryPath, err := r.findFile("nvidia-smi", binarySearchPaths...) diff --git a/cmd/gpu-kubelet-plugin/cdi.go b/cmd/gpu-kubelet-plugin/cdi.go index 0fd8c8db7..e0f080255 100644 --- a/cmd/gpu-kubelet-plugin/cdi.go +++ b/cmd/gpu-kubelet-plugin/cdi.go @@ -51,6 +51,14 @@ const ( procNvCapsPath = "/proc/driver/nvidia/capabilities" ) +func getTalosLibrarySearchPaths() []string { + return []string{ + "/driver-root/usr/local/glibc/usr/lib", + "/driver-root/usr/local/glibc/lib", + "/driver-root/usr/local/glibc/lib64", + } +} + type CDIHandler struct { logger *logrus.Logger nvml nvml.Interface @@ -108,6 +116,7 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) { nvcdi.WithVendor(h.vendor), nvcdi.WithClass(h.deviceClass), nvcdi.WithNVIDIACDIHookPath(h.nvidiaCDIHookPath), + nvcdi.WithLibrarySearchPaths(getTalosLibrarySearchPaths()), ) if err != nil { return nil, fmt.Errorf("unable to create CDI library for devices: %w", err) @@ -125,6 +134,7 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) { nvcdi.WithVendor(h.vendor), nvcdi.WithClass(h.claimClass), nvcdi.WithNVIDIACDIHookPath(h.nvidiaCDIHookPath), + nvcdi.WithLibrarySearchPaths(getTalosLibrarySearchPaths()), ) if err != nil { return nil, fmt.Errorf("unable to create CDI library for claims: %w", err) diff --git a/cmd/gpu-kubelet-plugin/root.go b/cmd/gpu-kubelet-plugin/root.go index 6079c78db..450455adf 100644 --- a/cmd/gpu-kubelet-plugin/root.go +++ b/cmd/gpu-kubelet-plugin/root.go @@ -34,6 +34,7 @@ func (r root) getDriverLibraryPath() (string, error) { "/lib64", "/lib/x86_64-linux-gnu", "/lib/aarch64-linux-gnu", + "/usr/local/glibc/usr/lib", } libraryPath, err := r.findFile("libnvidia-ml.so.1", librarySearchPaths...) @@ -52,6 +53,7 @@ func (r root) getNvidiaSMIPath() (string, error) { "/usr/sbin", "/bin", "/sbin", + "/usr/local/bin", } binaryPath, err := r.findFile("nvidia-smi", binarySearchPaths...) diff --git a/deployments/helm/nvidia-dra-driver-gpu/templates/kubeletplugin.yaml b/deployments/helm/nvidia-dra-driver-gpu/templates/kubeletplugin.yaml index 504cf0437..642925d4c 100644 --- a/deployments/helm/nvidia-dra-driver-gpu/templates/kubeletplugin.yaml +++ b/deployments/helm/nvidia-dra-driver-gpu/templates/kubeletplugin.yaml @@ -137,7 +137,7 @@ spec: - name: NVIDIA_VISIBLE_DEVICES value: void - name: CDI_ROOT - value: /var/run/cdi + value: {{ .Values.cdiRoot | quote }} - name: NVIDIA_MIG_CONFIG_DEVICES value: all - name: NODE_NAME @@ -174,7 +174,7 @@ spec: mountPath: {{ .Values.kubeletPlugin.kubeletPluginsDirectoryPath | quote }} mountPropagation: Bidirectional - name: cdi - mountPath: /var/run/cdi + mountPath: {{ .Values.cdiRoot | quote }} - name: driver-root mountPath: /driver-root readOnly: true @@ -238,7 +238,7 @@ spec: - name: NVIDIA_VISIBLE_DEVICES value: void - name: CDI_ROOT - value: /var/run/cdi + value: {{ .Values.cdiRoot | quote }} - name: NVIDIA_MIG_CONFIG_DEVICES value: all - name: NODE_NAME @@ -277,7 +277,7 @@ spec: mountPath: {{ .Values.kubeletPlugin.kubeletPluginsDirectoryPath | quote }} mountPropagation: Bidirectional - name: cdi - mountPath: /var/run/cdi + mountPath: {{ .Values.cdiRoot | quote }} - name: driver-root mountPath: /driver-root mountPropagation: HostToContainer @@ -303,7 +303,7 @@ spec: path: {{ .Values.kubeletPlugin.kubeletPluginsDirectoryPath | quote }} - name: cdi hostPath: - path: /var/run/cdi + path: {{ .Values.cdiRoot | quote }} - name: driver-root-parent hostPath: # If nvidiaDriverRoot == "/" then its parent is itself. Otherwise, get diff --git a/deployments/helm/nvidia-dra-driver-gpu/values.yaml b/deployments/helm/nvidia-dra-driver-gpu/values.yaml index 614a48fdd..190da5ad2 100644 --- a/deployments/helm/nvidia-dra-driver-gpu/values.yaml +++ b/deployments/helm/nvidia-dra-driver-gpu/values.yaml @@ -26,6 +26,10 @@ nvidiaDriverRoot: / # If not specified, the default path inferred from the nvidia-container-toolkit library version will be used. nvidiaCDIHookPath: "" +# CDI root directory path. +# This is where CDI spec files are stored and accessed by the runtime. +cdiRoot: "/var/run/cdi" + nameOverride: "" fullnameOverride: "" namespaceOverride: "" diff --git a/hack/kubelet-plugin-prestart.sh b/hack/kubelet-plugin-prestart.sh index 851d0a298..41907758a 100755 --- a/hack/kubelet-plugin-prestart.sh +++ b/hack/kubelet-plugin-prestart.sh @@ -47,6 +47,7 @@ validate_and_exit_on_success () { /driver-root/usr/bin \ /driver-root/usr/sbin \ /driver-root/bin \ + /driver-root/usr/local/bin \ /driver-root/sbin \ -maxdepth 1 -type f -name "nvidia-smi" 2> /dev/null | head -n1 ) @@ -60,6 +61,7 @@ validate_and_exit_on_success () { /driver-root/usr/lib64 \ /driver-root/usr/lib/x86_64-linux-gnu \ /driver-root/usr/lib/aarch64-linux-gnu \ + /driver-root/usr/local/glibc/usr/lib \ /driver-root/lib64 \ /driver-root/lib/x86_64-linux-gnu \ /driver-root/lib/aarch64-linux-gnu \ From 2cdaeff312aa56913961e69b870e05536683a354 Mon Sep 17 00:00:00 2001 From: hydazz Date: Fri, 30 Jan 2026 20:20:02 +1100 Subject: [PATCH 2/2] remove local talos library search paths Signed-off-by: hydazz --- cmd/compute-domain-kubelet-plugin/cdi.go | 10 ---------- cmd/gpu-kubelet-plugin/cdi.go | 10 ---------- 2 files changed, 20 deletions(-) diff --git a/cmd/compute-domain-kubelet-plugin/cdi.go b/cmd/compute-domain-kubelet-plugin/cdi.go index 72d93dd5e..95be15a72 100644 --- a/cmd/compute-domain-kubelet-plugin/cdi.go +++ b/cmd/compute-domain-kubelet-plugin/cdi.go @@ -46,14 +46,6 @@ const ( defaultCDIRoot = "/var/run/cdi" ) -func getTalosLibrarySearchPaths() []string { - return []string{ - "/driver-root/usr/local/glibc/usr/lib", - "/driver-root/usr/local/glibc/lib", - "/driver-root/usr/local/glibc/lib64", - } -} - type CDIHandler struct { logger *logrus.Logger nvml nvml.Interface @@ -111,7 +103,6 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) { nvcdi.WithVendor(h.vendor), nvcdi.WithClass(h.deviceClass), nvcdi.WithNVIDIACDIHookPath(h.nvidiaCDIHookPath), - nvcdi.WithLibrarySearchPaths(getTalosLibrarySearchPaths()), ) if err != nil { return nil, fmt.Errorf("unable to create CDI library for devices: %w", err) @@ -129,7 +120,6 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) { nvcdi.WithVendor(h.vendor), nvcdi.WithClass(h.claimClass), nvcdi.WithNVIDIACDIHookPath(h.nvidiaCDIHookPath), - nvcdi.WithLibrarySearchPaths(getTalosLibrarySearchPaths()), ) if err != nil { return nil, fmt.Errorf("unable to create CDI library for claims: %w", err) diff --git a/cmd/gpu-kubelet-plugin/cdi.go b/cmd/gpu-kubelet-plugin/cdi.go index e0f080255..0fd8c8db7 100644 --- a/cmd/gpu-kubelet-plugin/cdi.go +++ b/cmd/gpu-kubelet-plugin/cdi.go @@ -51,14 +51,6 @@ const ( procNvCapsPath = "/proc/driver/nvidia/capabilities" ) -func getTalosLibrarySearchPaths() []string { - return []string{ - "/driver-root/usr/local/glibc/usr/lib", - "/driver-root/usr/local/glibc/lib", - "/driver-root/usr/local/glibc/lib64", - } -} - type CDIHandler struct { logger *logrus.Logger nvml nvml.Interface @@ -116,7 +108,6 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) { nvcdi.WithVendor(h.vendor), nvcdi.WithClass(h.deviceClass), nvcdi.WithNVIDIACDIHookPath(h.nvidiaCDIHookPath), - nvcdi.WithLibrarySearchPaths(getTalosLibrarySearchPaths()), ) if err != nil { return nil, fmt.Errorf("unable to create CDI library for devices: %w", err) @@ -134,7 +125,6 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) { nvcdi.WithVendor(h.vendor), nvcdi.WithClass(h.claimClass), nvcdi.WithNVIDIACDIHookPath(h.nvidiaCDIHookPath), - nvcdi.WithLibrarySearchPaths(getTalosLibrarySearchPaths()), ) if err != nil { return nil, fmt.Errorf("unable to create CDI library for claims: %w", err)