Skip to content

Commit 71be3d1

Browse files
harche and claude committed
Skip device plugin alert when devicePlugin is disabled in ClusterPolicy
When devicePlugin.enabled is set to false in the ClusterPolicy, the nvidia-node-status-exporter still monitors the gpu_operator_node_device_plugin_devices_total metric, which reports 0 (since no device plugin pods are running). This triggers a false positive GPUOperatorNodeDeploymentFailed alert. Fix: The operator now injects a DEVICE_PLUGIN_ENABLED env var into the node-status-exporter daemonset based on the ClusterPolicy. When set to "false", the exporter skips device plugin validation and sets the metric to -1 (the gauge is auto-registered and would otherwise default to 0), so the alert expression (== 0) does not match and the alert does not fire. Fixes: #2237 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Signed-off-by: Harshal Patil <12152047+harche@users.noreply.github.com>
1 parent 30a8bbd commit 71be3d1

6 files changed

Lines changed: 99 additions & 2 deletions

File tree

assets/state-node-status-exporter/0800_prometheus_rule_openshift.yaml

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,10 @@ spec:
1010
- name: Alert on node deployment failure
1111
rules:
1212
- alert: GPUOperatorNodeDeploymentFailed
13-
# There is no GPU exposed on the node,
13+
# There is no GPU exposed on the node.
14+
# When the device plugin is intentionally disabled in the ClusterPolicy
15+
# (devicePlugin.enabled: false), the metric is set to -1, so this
16+
# alert will not fire in that case.
1417
expr: |
1518
gpu_operator_node_device_plugin_devices_total == 0
1619
for: 30m

cmd/nvidia-validator/metrics.go

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -306,7 +306,15 @@ func (nm *NodeMetrics) Run() error {
306306
go nm.watchStatusFile(&nm.cudaReady, cudaStatusFile)
307307

308308
go nm.watchDriverValidation()
309-
go nm.watchDevicePluginValidation()
309+
if os.Getenv("DEVICE_PLUGIN_ENABLED") != "false" {
310+
go nm.watchDevicePluginValidation()
311+
} else {
312+
// Set to -1 so the alert (expr: == 0) does not fire.
313+
// The gauge is auto-registered by promauto and defaults to 0,
314+
// which would be a false positive.
315+
nm.deviceCount.Set(-1)
316+
log.Info("metrics: DevicePlugin is disabled in ClusterPolicy, skipping device plugin validation")
317+
}
310318
go nm.watchNVIDIAPCI()
311319

312320
log.Printf("Running the metrics server, listening on :%d/metrics", nm.port)

controllers/object_controls.go

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -122,6 +122,8 @@ const (
122122
NvidiaDisableRequireEnvName = "NVIDIA_DISABLE_REQUIRE"
123123
// GDSEnabledEnvName is the env name to enable GDS support with device-plugin
124124
GDSEnabledEnvName = "GDS_ENABLED"
125+
// DevicePluginEnabledEnvName indicates whether the device plugin is enabled in the ClusterPolicy
126+
DevicePluginEnabledEnvName = "DEVICE_PLUGIN_ENABLED"
125127
// MOFEDEnabledEnvName is the env name to enable MOFED devices injection with device-plugin
126128
MOFEDEnabledEnvName = "MOFED_ENABLED"
127129
// GDRCopyEnabledEnvName is the envvar that enables injection of the GDRCopy device node with the device-plugin
@@ -2450,6 +2452,12 @@ func TransformNodeStatusExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol
24502452
obj.Spec.Template.Spec.Containers[0].Args = config.NodeStatusExporter.Args
24512453
}
24522454

2455+
devicePluginEnabled := "true"
2456+
if !config.DevicePlugin.IsEnabled() {
2457+
devicePluginEnabled = "false"
2458+
}
2459+
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DevicePluginEnabledEnvName, devicePluginEnabled)
2460+
24532461
// set/append environment variables for exporter container
24542462
if len(config.NodeStatusExporter.Env) > 0 {
24552463
for _, env := range config.NodeStatusExporter.Env {

controllers/transforms_test.go

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2891,6 +2891,35 @@ func TestTransformNodeStatusExporter(t *testing.T) {
28912891
Name: "dummy",
28922892
Image: "nvcr.io/nvidia/cloud-native/node-status-exporter:v1.0.0",
28932893
ImagePullPolicy: corev1.PullIfNotPresent,
2894+
Env: []corev1.EnvVar{
2895+
{Name: DevicePluginEnabledEnvName, Value: "true"},
2896+
},
2897+
SecurityContext: &corev1.SecurityContext{
2898+
RunAsUser: rootUID,
2899+
},
2900+
}),
2901+
},
2902+
{
2903+
description: "node status exporter with device plugin disabled",
2904+
ds: NewDaemonset().
2905+
WithContainer(corev1.Container{Name: "dummy"}),
2906+
cpSpec: &gpuv1.ClusterPolicySpec{
2907+
NodeStatusExporter: gpuv1.NodeStatusExporterSpec{
2908+
Repository: "nvcr.io/nvidia/cloud-native",
2909+
Image: "node-status-exporter",
2910+
Version: "v1.0.0",
2911+
ImagePullPolicy: "IfNotPresent",
2912+
},
2913+
DevicePlugin: gpuv1.DevicePluginSpec{Enabled: newBoolPtr(false)},
2914+
},
2915+
expectedDs: NewDaemonset().
2916+
WithContainer(corev1.Container{
2917+
Name: "dummy",
2918+
Image: "nvcr.io/nvidia/cloud-native/node-status-exporter:v1.0.0",
2919+
ImagePullPolicy: corev1.PullIfNotPresent,
2920+
Env: []corev1.EnvVar{
2921+
{Name: DevicePluginEnabledEnvName, Value: "false"},
2922+
},
28942923
SecurityContext: &corev1.SecurityContext{
28952924
RunAsUser: rootUID,
28962925
},

tests/e2e/helpers/clusterpolicy.go

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -106,6 +106,18 @@ func (h *ClusterPolicyClient) DisableGFD(ctx context.Context, name string) error
106106
})
107107
}
108108

109+
func (h *ClusterPolicyClient) EnableDevicePlugin(ctx context.Context, name string) error {
110+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
111+
clusterPolicy.Spec.DevicePlugin.Enabled = ptr.To(true)
112+
})
113+
}
114+
115+
func (h *ClusterPolicyClient) DisableDevicePlugin(ctx context.Context, name string) error {
116+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
117+
clusterPolicy.Spec.DevicePlugin.Enabled = ptr.To(false)
118+
})
119+
}
120+
109121
func (h *ClusterPolicyClient) SetMIGStrategy(ctx context.Context, name, strategy string) error {
110122
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
111123
clusterPolicy.Spec.MIG.Strategy = nvidiav1.MIGStrategy(strategy)

tests/e2e/suites/clusterpolicy_test.go

Lines changed: 37 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -328,6 +328,43 @@ var _ = Describe("ClusterPolicy Management", Label("clusterPolicy"), func() {
328328
})
329329
})
330330

331+
// test_device_plugin_disabled_env - Verify DEVICE_PLUGIN_ENABLED env var propagation
332+
When("Disabling device plugin", Label("device-plugin", "toggle"), func() {
333+
It("should set DEVICE_PLUGIN_ENABLED=false on node-status-exporter when device plugin is disabled", func(ctx context.Context) {
334+
clusterPolicy := getClusterPolicyOrSkip(ctx, clusterPolicyClient, policyName)
335+
originalState := clusterPolicy.Spec.DevicePlugin.Enabled
336+
DeferCleanup(func(ctx context.Context) {
337+
if originalState == nil || *originalState {
338+
_ = clusterPolicyClient.EnableDevicePlugin(ctx, policyName)
339+
waitForDaemonSetReady(ctx, daemonSetClient, testNamespace, "nvidia-device-plugin-daemonset")
340+
}
341+
})
342+
343+
err := clusterPolicyClient.DisableDevicePlugin(ctx, policyName)
344+
Expect(err).NotTo(HaveOccurred(), "Failed to disable device plugin in ClusterPolicy")
345+
346+
verifyEnvInDaemonSet(ctx, daemonSetClient, testNamespace,
347+
"nvidia-node-status-exporter", "DEVICE_PLUGIN_ENABLED", "false")
348+
})
349+
350+
It("should set DEVICE_PLUGIN_ENABLED=true on node-status-exporter when device plugin is re-enabled", func(ctx context.Context) {
351+
clusterPolicy := getClusterPolicyOrSkip(ctx, clusterPolicyClient, policyName)
352+
originalState := clusterPolicy.Spec.DevicePlugin.Enabled
353+
DeferCleanup(func(ctx context.Context) {
354+
if originalState != nil && !*originalState {
355+
_ = clusterPolicyClient.DisableDevicePlugin(ctx, policyName)
356+
}
357+
})
358+
359+
err := clusterPolicyClient.EnableDevicePlugin(ctx, policyName)
360+
Expect(err).NotTo(HaveOccurred(), "Failed to enable device plugin in ClusterPolicy")
361+
362+
verifyEnvInDaemonSet(ctx, daemonSetClient, testNamespace,
363+
"nvidia-node-status-exporter", "DEVICE_PLUGIN_ENABLED", "true")
364+
waitForDaemonSetReady(ctx, daemonSetClient, testNamespace, "nvidia-device-plugin-daemonset")
365+
})
366+
})
367+
331368
// test_custom_labels_override - Test custom labels on daemonsets
332369
When("Updating daemonset custom labels", Label("labels", "config"), func() {
333370
It("should apply custom labels to all operand pods", func(ctx context.Context) {

0 commit comments

Comments
 (0)