Skip to content

Commit 30d844a

Browse files
Restart driver pods in place when driver config is unchanged
A patch chart upgrade can change only cosmetic pod-template metadata (e.g. the helm.sh/chart label) without changing the driver itself. The upgrade controller keys on the DaemonSet's controller revision hash, so such a change still cordons the node, evicts running GPU workloads, and drains the node -- for no driver benefit.

Register a RestartOnlyPredicate on the upgrade state manager (from the UpgradeReconciler) that compares DRIVER_CONFIG_DIGEST -- a hash of the install-relevant driver config, already stamped on the driver pod template -- between the running pod and the desired DaemonSet. When the digests match, the driver pod is rolled in place without cordon, eviction, or drain; the driver fast-path keeps the kernel modules loaded across the restart, so running GPU workloads are not disrupted. A missing or differing digest falls back to the full upgrade flow.

The digest env name and a reader for it live in internal/config beside the digest definition; the restart-only routing decision is a method on the upgrade controller, registered in SetupWithManager.

Depends on the RestartOnlyPredicate hook in k8s-operator-libs; the vendored dependency bump follows once that change is released.

Signed-off-by: Rajath Agasthya <ragasthya@nvidia.com>
1 parent bc8d643 commit 30d844a

5 files changed

Lines changed: 189 additions & 3 deletions

File tree

controllers/object_controls.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,19 +1064,19 @@ func TransformDriver(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n C
10641064
// Set the computed digest in driver-manager initContainer
10651065
driverManagerContainer := findContainerByName(obj.Spec.Template.Spec.InitContainers, "k8s-driver-manager")
10661066
if driverManagerContainer != nil {
1067-
setContainerEnv(driverManagerContainer, "DRIVER_CONFIG_DIGEST", configDigest)
1067+
setContainerEnv(driverManagerContainer, driverconfig.DriverConfigDigestEnvName, configDigest)
10681068
}
10691069

10701070
// Set the computed digest in nvidia-driver container
10711071
driverContainer := findContainerByName(obj.Spec.Template.Spec.Containers, "nvidia-driver-ctr")
10721072
if driverContainer != nil {
1073-
setContainerEnv(driverContainer, "DRIVER_CONFIG_DIGEST", configDigest)
1073+
setContainerEnv(driverContainer, driverconfig.DriverConfigDigestEnvName, configDigest)
10741074
}
10751075

10761076
// Used by dtk-build-driver to determine if fast path should be used (skip rebuild)
10771077
driverToolkitContainer := findContainerByName(obj.Spec.Template.Spec.Containers, "openshift-driver-toolkit-ctr")
10781078
if driverToolkitContainer != nil {
1079-
setContainerEnv(driverToolkitContainer, "DRIVER_CONFIG_DIGEST", configDigest)
1079+
setContainerEnv(driverToolkitContainer, driverconfig.DriverConfigDigestEnvName, configDigest)
10801080
}
10811081

10821082
// set hostNetwork for driver if specified

controllers/upgrade_controller.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ import (
4747

4848
gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
4949
nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
50+
driverconfig "github.com/NVIDIA/gpu-operator/internal/config"
5051
)
5152

5253
// UpgradeReconciler reconciles Driver Daemon Sets for upgrade
@@ -231,10 +232,38 @@ func (r *UpgradeReconciler) removeNodeUpgradeStateLabels(ctx context.Context) er
231232
return nil
232233
}
233234

235+
// driverPodRestartOnly is the upgrade controller's RestartOnlyPredicate: it allows an
236+
// out-of-sync driver pod to be restarted in place when the running pod and the desired
237+
// DaemonSet carry the same DRIVER_CONFIG_DIGEST, i.e. the install-relevant config is
238+
// unchanged (e.g. only a helm.sh/chart label moved). A digest missing on either side
239+
// returns false, taking the full upgrade flow.
240+
func (r *UpgradeReconciler) driverPodRestartOnly(_ context.Context, pod *corev1.Pod, ds *appsv1.DaemonSet) (bool, error) {
241+
if pod == nil || ds == nil {
242+
return false, nil
243+
}
244+
desired := driverconfig.DriverConfigDigestFromPodSpec(&ds.Spec.Template.Spec)
245+
running := driverconfig.DriverConfigDigestFromPodSpec(&pod.Spec)
246+
if desired == "" || running == "" {
247+
r.Log.V(consts.LogLevelDebug).Info("driver config digest missing; taking full upgrade flow",
248+
"pod", pod.Name, "daemonset", ds.Name, "desiredDigest", desired, "runningDigest", running)
249+
return false, nil
250+
}
251+
restartOnly := desired == running
252+
r.Log.V(consts.LogLevelDebug).Info("evaluated driver config digest for restart-only routing",
253+
"pod", pod.Name, "daemonset", ds.Name,
254+
"desiredDigest", desired, "runningDigest", running, "restartOnly", restartOnly)
255+
return restartOnly, nil
256+
}
257+
234258
// SetupWithManager sets up the controller with the Manager.
235259
//
236260
//nolint:dupl
237261
func (r *UpgradeReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
262+
// Route digest-unchanged driver pod-template changes to a restart-only upgrade.
263+
if r.StateManager != nil {
264+
r.StateManager = r.StateManager.WithRestartOnlyPredicate(r.driverPodRestartOnly)
265+
}
266+
238267
// Create a new controller
239268
c, err := controller.New("upgrade-controller", mgr, controller.Options{Reconciler: r, MaxConcurrentReconciles: 1,
240269
RateLimiter: workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](minDelayCR, maxDelayCR)})

controllers/upgrade_controller_test.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,17 @@
1717
package controllers
1818

1919
import (
20+
"context"
2021
"fmt"
2122
"testing"
2223

2324
upgrade_v1alpha1 "github.com/NVIDIA/k8s-operator-libs/api/upgrade/v1alpha1"
25+
"github.com/go-logr/logr"
2426
"github.com/stretchr/testify/assert"
27+
appsv1 "k8s.io/api/apps/v1"
28+
corev1 "k8s.io/api/core/v1"
29+
30+
driverconfig "github.com/NVIDIA/gpu-operator/internal/config"
2531
)
2632

2733
func TestSetDrainSpecPodSelector(t *testing.T) {
@@ -69,3 +75,44 @@ func TestSetDrainSpecPodSelector(t *testing.T) {
6975
})
7076
}
7177
}
78+
79+
func TestDriverPodRestartOnly(t *testing.T) {
80+
driverPod := func(digest string) *corev1.Pod {
81+
return &corev1.Pod{Spec: corev1.PodSpec{Containers: []corev1.Container{{
82+
Name: "nvidia-driver-ctr",
83+
Env: []corev1.EnvVar{{Name: driverconfig.DriverConfigDigestEnvName, Value: digest}},
84+
}}}}
85+
}
86+
driverDS := func(digest string) *appsv1.DaemonSet {
87+
return &appsv1.DaemonSet{Spec: appsv1.DaemonSetSpec{Template: corev1.PodTemplateSpec{
88+
Spec: corev1.PodSpec{Containers: []corev1.Container{{
89+
Name: "nvidia-driver-ctr",
90+
Env: []corev1.EnvVar{{Name: driverconfig.DriverConfigDigestEnvName, Value: digest}},
91+
}}},
92+
}}}
93+
}
94+
95+
r := &UpgradeReconciler{Log: logr.Discard()}
96+
ctx := context.Background()
97+
98+
tests := []struct {
99+
name string
100+
pod *corev1.Pod
101+
ds *appsv1.DaemonSet
102+
wantRestart bool
103+
}{
104+
{name: "equal digests -> restart-only", pod: driverPod("same"), ds: driverDS("same"), wantRestart: true},
105+
{name: "differing digests -> full upgrade", pod: driverPod("old"), ds: driverDS("new"), wantRestart: false},
106+
{name: "missing digest on pod -> full upgrade", pod: driverPod(""), ds: driverDS("new"), wantRestart: false},
107+
{name: "missing digest on daemonset -> full upgrade", pod: driverPod("old"), ds: driverDS(""), wantRestart: false},
108+
{name: "nil pod -> full upgrade", pod: nil, ds: driverDS("x"), wantRestart: false},
109+
{name: "nil daemonset -> full upgrade", pod: driverPod("x"), ds: nil, wantRestart: false},
110+
}
111+
for _, tt := range tests {
112+
t.Run(tt.name, func(t *testing.T) {
113+
got, err := r.driverPodRestartOnly(ctx, tt.pod, tt.ds)
114+
assert.NoError(t, err)
115+
assert.Equal(t, tt.wantRestart, got)
116+
})
117+
}
118+
}

internal/config/driver_config_digest.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,38 @@ import (
2222
corev1 "k8s.io/api/core/v1"
2323
)
2424

25+
// DriverConfigDigestEnvName is the env var the operator sets on the driver pod
26+
// template, carrying a hash of the install-relevant driver config (DriverInstallState).
27+
const DriverConfigDigestEnvName = "DRIVER_CONFIG_DIGEST"
28+
29+
// DriverConfigDigestFromPodSpec returns the DRIVER_CONFIG_DIGEST value from a driver
30+
// pod spec, or "" if absent. The env is set identically on every driver container, so
31+
// the first non-empty value (init containers first) is returned.
32+
func DriverConfigDigestFromPodSpec(spec *corev1.PodSpec) string {
33+
if spec == nil {
34+
return ""
35+
}
36+
digestFromEnv := func(env []corev1.EnvVar) string {
37+
for _, e := range env {
38+
if e.Name == DriverConfigDigestEnvName {
39+
return e.Value
40+
}
41+
}
42+
return ""
43+
}
44+
for i := range spec.InitContainers {
45+
if v := digestFromEnv(spec.InitContainers[i].Env); v != "" {
46+
return v
47+
}
48+
}
49+
for i := range spec.Containers {
50+
if v := digestFromEnv(spec.Containers[i].Env); v != "" {
51+
return v
52+
}
53+
}
54+
return ""
55+
}
56+
2557
// DriverInstallState lists all fields that affect driver installation.
2658
// Changes to these fields trigger a driver reinstall.
2759
//

internal/config/driver_config_digest_test.go

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,3 +309,81 @@ func TestExtractVolumes(t *testing.T) {
309309
})
310310
}
311311
}
312+
313+
// containerWithConfigDigest builds a container carrying the DRIVER_CONFIG_DIGEST env
314+
// when digest is non-empty (matching how object_controls.go sets it).
315+
func containerWithConfigDigest(name, digest string) corev1.Container {
316+
c := corev1.Container{Name: name}
317+
if digest != "" {
318+
c.Env = []corev1.EnvVar{{Name: DriverConfigDigestEnvName, Value: digest}}
319+
}
320+
return c
321+
}
322+
323+
func TestDriverConfigDigestFromPodSpec(t *testing.T) {
324+
tests := []struct {
325+
name string
326+
spec *corev1.PodSpec
327+
want string
328+
}{
329+
{
330+
name: "digest on k8s-driver-manager init container",
331+
spec: &corev1.PodSpec{
332+
InitContainers: []corev1.Container{containerWithConfigDigest("k8s-driver-manager", "abc123")},
333+
Containers: []corev1.Container{containerWithConfigDigest("nvidia-driver-ctr", "")},
334+
},
335+
want: "abc123",
336+
},
337+
{
338+
name: "digest on nvidia-driver-ctr main container",
339+
spec: &corev1.PodSpec{
340+
Containers: []corev1.Container{containerWithConfigDigest("nvidia-driver-ctr", "def456")},
341+
},
342+
want: "def456",
343+
},
344+
{
345+
name: "digest on OCP openshift-driver-toolkit-ctr",
346+
spec: &corev1.PodSpec{
347+
Containers: []corev1.Container{containerWithConfigDigest("openshift-driver-toolkit-ctr", "ocp789")},
348+
},
349+
want: "ocp789",
350+
},
351+
{
352+
name: "init container digest takes precedence over main container",
353+
spec: &corev1.PodSpec{
354+
InitContainers: []corev1.Container{containerWithConfigDigest("k8s-driver-manager", "init-digest")},
355+
Containers: []corev1.Container{containerWithConfigDigest("nvidia-driver-ctr", "main-digest")},
356+
},
357+
want: "init-digest",
358+
},
359+
{
360+
name: "empty init digest is skipped; main container value used",
361+
spec: &corev1.PodSpec{
362+
InitContainers: []corev1.Container{{
363+
Name: "k8s-driver-manager",
364+
Env: []corev1.EnvVar{{Name: DriverConfigDigestEnvName, Value: ""}},
365+
}},
366+
Containers: []corev1.Container{containerWithConfigDigest("nvidia-driver-ctr", "main-digest")},
367+
},
368+
want: "main-digest",
369+
},
370+
{
371+
name: "no digest anywhere",
372+
spec: &corev1.PodSpec{
373+
InitContainers: []corev1.Container{{Name: "k8s-driver-manager"}},
374+
Containers: []corev1.Container{{Name: "nvidia-driver-ctr"}},
375+
},
376+
want: "",
377+
},
378+
{
379+
name: "nil spec",
380+
spec: nil,
381+
want: "",
382+
},
383+
}
384+
for _, tt := range tests {
385+
t.Run(tt.name, func(t *testing.T) {
386+
assert.Equal(t, tt.want, DriverConfigDigestFromPodSpec(tt.spec))
387+
})
388+
}
389+
}

0 commit comments

Comments
 (0)