Skip to content

Commit 56f5a7d

Browse files
committed
add support for default nvidiadriver and migration from clusterpolicy to nvidiadriver
changes include: * added default nvidiadriver which doesn't conflict with other user-specified nvidiadrivers * added migration support from clusterpolicy to nvidiadriver * added/updated e2e tests for nvidiadriver workflow * use new field to mark a nvidiadriver as default instead of using a label * don't allow adding nodeselector to default nvidiadriver, it should cover entire cluster * move default NVIDIADriver ownership helpers into internal/nvidiadriver and re-use * requeue after node owner label changes so later reconcile steps read updated cache state * use patch instead of update for node label updates Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>
1 parent 64714d6 commit 56f5a7d

28 files changed

Lines changed: 1923 additions & 124 deletions

api/nvidia/v1/clusterpolicy_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2183,6 +2183,11 @@ func (d *DriverSpec) UseNvidiaDriverCRDType() bool {
21832183
return *d.UseNvidiaDriverCRD
21842184
}
21852185

2186+
// IsNVIDIADriverCRDEnabled returns true if driver management is enabled through NVIDIADriver CRs.
2187+
func (d *DriverSpec) IsNVIDIADriverCRDEnabled() bool {
2188+
return d.IsEnabled() && d.UseNvidiaDriverCRDType()
2189+
}
2190+
21862191
// UsePrecompiledDrivers returns true if driver install is enabled using pre-compiled modules
21872192
func (d *DriverSpec) UsePrecompiledDrivers() bool {
21882193
if d.UsePrecompiled == nil {
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
/*
2+
* Copyright (c) NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package v1
18+
19+
import (
20+
"testing"
21+
22+
"github.com/stretchr/testify/require"
23+
)
24+
25+
func TestDriverSpecIsNVIDIADriverCRDEnabled(t *testing.T) {
26+
driverEnabled := true
27+
driverDisabled := false
28+
crdEnabled := true
29+
crdDisabled := false
30+
31+
tests := []struct {
32+
name string
33+
driverEnabled *bool
34+
crdEnabled *bool
35+
expected bool
36+
}{
37+
{
38+
name: "driver enabled by default and CRD enabled",
39+
crdEnabled: &crdEnabled,
40+
expected: true,
41+
},
42+
{
43+
name: "CRD disabled",
44+
crdEnabled: &crdDisabled,
45+
expected: false,
46+
},
47+
{
48+
name: "driver disabled",
49+
driverEnabled: &driverDisabled,
50+
crdEnabled: &crdEnabled,
51+
expected: false,
52+
},
53+
{
54+
name: "driver explicitly enabled and CRD enabled",
55+
driverEnabled: &driverEnabled,
56+
crdEnabled: &crdEnabled,
57+
expected: true,
58+
},
59+
}
60+
61+
for _, tc := range tests {
62+
t.Run(tc.name, func(t *testing.T) {
63+
driverSpec := DriverSpec{
64+
Enabled: tc.driverEnabled,
65+
UseNvidiaDriverCRD: tc.crdEnabled,
66+
}
67+
68+
require.Equal(t, tc.expected, driverSpec.IsNVIDIADriverCRDEnabled())
69+
})
70+
}
71+
}

api/nvidia/v1alpha1/nvidiadriver_types.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,19 @@ const (
3636
// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
3737
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
3838

39-
// NVIDIADriverSpec defines the desired state of NVIDIADriver
39+
// NVIDIADriverSpec defines the desired state of NVIDIADriver.
40+
// The CEL validation allows non-default drivers to use nodeSelector, but requires
41+
// default drivers to leave nodeSelector unset or empty.
42+
// +kubebuilder:validation:XValidation:rule="has(self.default) && self.default ? !has(self.nodeSelector) || size(self.nodeSelector) == 0 : true",message="default NVIDIADriver must not use nodeSelector"
4043
type NVIDIADriverSpec struct {
4144
// INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
4245
// Important: Run "make" to regenerate code after modifying this file
4346

47+
// Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes
48+
// that do not match any non-default NVIDIADriver nodeSelector.
49+
// +kubebuilder:default=false
50+
Default bool `json:"default"`
51+
4452
// +kubebuilder:validation:Enum=gpu;vgpu;vgpu-host-manager
4553
// +kubebuilder:default=gpu
4654
// +kubebuilder:validation:XValidation:rule="self == oldSelf",message="driverType is an immutable field. Please create a new NvidiaDriver resource instead when you want to change this setting."
@@ -504,6 +512,7 @@ type NVIDIADriverStatus struct {
504512
//+kubebuilder:subresource:status
505513
//+kubebuilder:resource:scope=Cluster,shortName={"nvd","nvdriver","nvdrivers"}
506514
//+kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.state`,priority=0
515+
//+kubebuilder:printcolumn:name="Default",type=boolean,JSONPath=`.spec.default`,priority=0
507516
//+kubebuilder:printcolumn:name="Age",type=string,JSONPath=`.metadata.creationTimestamp`,priority=0
508517

509518
// NVIDIADriver is the Schema for the nvidiadrivers API
@@ -524,6 +533,11 @@ type NVIDIADriverList struct {
524533
Items []NVIDIADriver `json:"items"`
525534
}
526535

536+
// IsDefault returns true when the NVIDIADriver is marked as the fallback driver.
537+
func (d *NVIDIADriver) IsDefault() bool {
538+
return d != nil && d.Spec.Default
539+
}
540+
527541
// UsePrecompiledDrivers returns true if usePrecompiled option is enabled in spec
528542
func (d *NVIDIADriverSpec) UsePrecompiledDrivers() bool {
529543
if d.UsePrecompiled == nil {

bundle/manifests/nvidia.com_nvidiadrivers.yaml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ spec:
2222
- jsonPath: .status.state
2323
name: Status
2424
type: string
25+
- jsonPath: .spec.default
26+
name: Default
27+
type: boolean
2528
- jsonPath: .metadata.creationTimestamp
2629
name: Age
2730
type: string
@@ -48,7 +51,10 @@ spec:
4851
metadata:
4952
type: object
5053
spec:
51-
description: NVIDIADriverSpec defines the desired state of NVIDIADriver
54+
description: |-
55+
NVIDIADriverSpec defines the desired state of NVIDIADriver.
56+
The CEL validation allows non-default drivers to use nodeSelector, but requires
57+
default drivers to leave nodeSelector unset or empty.
5258
properties:
5359
annotations:
5460
additionalProperties:
@@ -70,6 +76,12 @@ spec:
7076
name:
7177
type: string
7278
type: object
79+
default:
80+
default: false
81+
description: |-
82+
Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes
83+
that do not match any non-default NVIDIADriver nodeSelector.
84+
type: boolean
7385
driverType:
7486
default: gpu
7587
description: DriverType defines NVIDIA driver type
@@ -968,9 +980,14 @@ spec:
968980
type: string
969981
type: object
970982
required:
983+
- default
971984
- driverType
972985
- image
973986
type: object
987+
x-kubernetes-validations:
988+
- message: default NVIDIADriver must not use nodeSelector
989+
rule: 'has(self.default) && self.default ? !has(self.nodeSelector) ||
990+
size(self.nodeSelector) == 0 : true'
974991
status:
975992
description: NVIDIADriverStatus defines the observed state of NVIDIADriver
976993
properties:

config/crd/bases/nvidia.com_nvidiadrivers.yaml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ spec:
2222
- jsonPath: .status.state
2323
name: Status
2424
type: string
25+
- jsonPath: .spec.default
26+
name: Default
27+
type: boolean
2528
- jsonPath: .metadata.creationTimestamp
2629
name: Age
2730
type: string
@@ -48,7 +51,10 @@ spec:
4851
metadata:
4952
type: object
5053
spec:
51-
description: NVIDIADriverSpec defines the desired state of NVIDIADriver
54+
description: |-
55+
NVIDIADriverSpec defines the desired state of NVIDIADriver.
56+
The CEL validation allows non-default drivers to use nodeSelector, but requires
57+
default drivers to leave nodeSelector unset or empty.
5258
properties:
5359
annotations:
5460
additionalProperties:
@@ -70,6 +76,12 @@ spec:
7076
name:
7177
type: string
7278
type: object
79+
default:
80+
default: false
81+
description: |-
82+
Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes
83+
that do not match any non-default NVIDIADriver nodeSelector.
84+
type: boolean
7385
driverType:
7486
default: gpu
7587
description: DriverType defines NVIDIA driver type
@@ -968,9 +980,14 @@ spec:
968980
type: string
969981
type: object
970982
required:
983+
- default
971984
- driverType
972985
- image
973986
type: object
987+
x-kubernetes-validations:
988+
- message: default NVIDIADriver must not use nodeSelector
989+
rule: 'has(self.default) && self.default ? !has(self.nodeSelector) ||
990+
size(self.nodeSelector) == 0 : true'
974991
status:
975992
description: NVIDIADriverStatus defines the observed state of NVIDIADriver
976993
properties:

controllers/nvidiadriver_controller.go

Lines changed: 87 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ import (
4545
"github.com/NVIDIA/gpu-operator/controllers/clusterinfo"
4646
"github.com/NVIDIA/gpu-operator/internal/conditions"
4747
"github.com/NVIDIA/gpu-operator/internal/consts"
48+
nvidiadriverutil "github.com/NVIDIA/gpu-operator/internal/nvidiadriver"
4849
"github.com/NVIDIA/gpu-operator/internal/state"
4950
"github.com/NVIDIA/gpu-operator/internal/validator"
5051
)
@@ -83,9 +84,8 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
8384
if err := r.Get(ctx, req.NamespacedName, instance); err != nil {
8485
if apierrors.IsNotFound(err) {
8586
// Request object not found, could have been deleted after reconcile request.
86-
// Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
87-
// Return and don't requeue
88-
return reconcile.Result{}, nil
87+
// Re-run owner assignment so deleting the last NVIDIADriver clears stale node owner labels.
88+
return r.cleanupNVIDIADriverOwnerLabelsAndReturn(ctx)
8989
}
9090
wrappedErr := fmt.Errorf("error getting NVIDIADriver object: %w", err)
9191
logger.Error(err, "error getting NVIDIADriver object")
@@ -96,6 +96,10 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
9696
// Error reading the object - requeue the request.
9797
return reconcile.Result{}, wrappedErr
9898
}
99+
if !instance.GetDeletionTimestamp().IsZero() {
100+
logger.Info("NVIDIADriver delete requested; cleaning up owner labels")
101+
return r.cleanupNVIDIADriverOwnerLabelsAndReturn(ctx)
102+
}
99103

100104
// Get the singleton NVIDIA ClusterPolicy object in the cluster.
101105
clusterPolicyList := &gpuv1.ClusterPolicyList{}
@@ -121,7 +125,7 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
121125
clusterPolicyInstance := clusterPolicyList.Items[0]
122126

123127
// Ensure that ClusterPolicy is configured to use NVIDIADriver CRD
124-
if !clusterPolicyInstance.Spec.Driver.UseNvidiaDriverCRDType() {
128+
if !clusterPolicyInstance.Spec.Driver.IsNVIDIADriverCRDEnabled() {
125129
msg := "useNvidiaDriverCRD is not enabled in ClusterPolicy"
126130
logger.V(consts.LogLevelWarning).Info("NVIDIADriver reconciliation skipped", "reason", msg)
127131
instance.Status.State = nvidiav1alpha1.Disabled
@@ -152,6 +156,19 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
152156
return reconcile.Result{}, nil
153157
}
154158

159+
changed, err := nvidiadriverutil.AssignOwners(ctx, r.Client)
160+
if err != nil {
161+
logger.Error(err, "failed to assign NVIDIADriver owners to nodes")
162+
instance.Status.State = nvidiav1alpha1.NotReady
163+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
164+
logger.Error(condErr, "failed to set condition")
165+
}
166+
return reconcile.Result{}, err
167+
}
168+
if changed {
169+
return reconcile.Result{RequeueAfter: time.Second}, nil
170+
}
171+
155172
if instance.Spec.UsePrecompiledDrivers() && (instance.Spec.IsGDSEnabled() || instance.Spec.IsGDRCopyEnabled()) {
156173
err := errors.New("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers")
157174
logger.Error(err, "unsupported driver combination detected")
@@ -188,6 +205,11 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
188205
// Sync state and update status
189206
managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog)
190207

208+
if err := r.labelNodesWithOrphanedDriverPods(ctx); err != nil {
209+
logger.Error(err, "failed to label nodes with orphaned NVIDIA driver pods")
210+
return reconcile.Result{}, err
211+
}
212+
191213
// update CR status
192214
if err := r.updateCrStatus(ctx, instance, managerStatus); err != nil {
193215
return ctrl.Result{}, err
@@ -276,6 +298,65 @@ func (r *NVIDIADriverReconciler) enqueueAllNVIDIADrivers(ctx context.Context) []
276298
return reconcileRequests
277299
}
278300

301+
// enqueueNVIDIADriverReconcilers enqueues the NVIDIADriver that triggered the
302+
// event and all current NVIDIADriver instances. The triggering object is
303+
// included even for delete events so the NotFound reconcile path can clear
304+
// stale node owner labels.
305+
func (r *NVIDIADriverReconciler) enqueueNVIDIADriverReconcilers(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) []reconcile.Request {
306+
requests := r.enqueueAllNVIDIADrivers(ctx)
307+
if driver != nil {
308+
requests = append(requests, reconcile.Request{
309+
NamespacedName: types.NamespacedName{
310+
Name: driver.GetName(),
311+
Namespace: driver.GetNamespace(),
312+
},
313+
})
314+
}
315+
return dedupeReconcileRequests(requests)
316+
}
317+
318+
func dedupeReconcileRequests(requests []reconcile.Request) []reconcile.Request {
319+
seen := map[types.NamespacedName]struct{}{}
320+
deduped := make([]reconcile.Request, 0, len(requests))
321+
for _, request := range requests {
322+
if _, ok := seen[request.NamespacedName]; ok {
323+
continue
324+
}
325+
seen[request.NamespacedName] = struct{}{}
326+
deduped = append(deduped, request)
327+
}
328+
return deduped
329+
}
330+
331+
func (r *NVIDIADriverReconciler) cleanupNVIDIADriverOwnerLabelsAndReturn(ctx context.Context) (reconcile.Result, error) {
332+
if err := r.cleanupNVIDIADriverOwnerLabels(ctx); err != nil {
333+
log.FromContext(ctx).Error(err, "failed to cleanup NVIDIADriver owner labels")
334+
return reconcile.Result{}, err
335+
}
336+
return reconcile.Result{}, nil
337+
}
338+
339+
// cleanupNVIDIADriverOwnerLabels re-runs owner assignment after a delete event
340+
// when ClusterPolicy is still configured for NVIDIADriver mode. This lets
341+
// AssignOwners remove stale nvidia.com/gpu-operator.driver.owner labels when no remaining
342+
// NVIDIADriver matches a GPU node, including deletion of the last NVIDIADriver.
343+
func (r *NVIDIADriverReconciler) cleanupNVIDIADriverOwnerLabels(ctx context.Context) error {
344+
clusterPolicyList := &gpuv1.ClusterPolicyList{}
345+
if err := r.List(ctx, clusterPolicyList); err != nil {
346+
return fmt.Errorf("error getting ClusterPolicy list: %w", err)
347+
}
348+
349+
for i := range clusterPolicyList.Items {
350+
if !clusterPolicyList.Items[i].Spec.Driver.IsNVIDIADriverCRDEnabled() {
351+
continue
352+
}
353+
_, err := nvidiadriverutil.AssignOwners(ctx, r.Client)
354+
return err
355+
}
356+
357+
return nil
358+
}
359+
279360
// SetupWithManager sets up the controller with the Manager.
280361
func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
281362
// Create state manager
@@ -307,8 +388,8 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.
307388

308389
// Watch for changes to NVIDIADriver CRs. Whenever an event is generated for a NVIDIADriver CR,
309390
// enqueue a reconcile request for all NVIDIADriver instances.
310-
nvidiaDriverMapFn := func(ctx context.Context, _ *nvidiav1alpha1.NVIDIADriver) []reconcile.Request {
311-
return r.enqueueAllNVIDIADrivers(ctx)
391+
nvidiaDriverMapFn := func(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) []reconcile.Request {
392+
return r.enqueueNVIDIADriverReconcilers(ctx, driver)
312393
}
313394

314395
// Watch for changes to the primary resource NVIDIADriver

0 commit comments

Comments
 (0)