Skip to content

Commit 29ecb7d

Browse files
authored
Merge pull request #2518 from rahulait/default-nvidiadriver
add support for default nvidiadriver and migration from clusterpolicy to nvidiadriver
2 parents 4a456dd + 53770fc commit 29ecb7d

27 files changed

Lines changed: 2030 additions & 135 deletions

api/nvidia/v1alpha1/nvidiadriver_types.go

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,19 @@ const (
3636
// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
3737
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
3838

39-
// NVIDIADriverSpec defines the desired state of NVIDIADriver
39+
// NVIDIADriverSpec defines the desired state of NVIDIADriver.
40+
// The CEL validation allows non-default drivers to use nodeSelector, but requires
41+
// default drivers to leave nodeSelector unset or empty.
42+
// +kubebuilder:validation:XValidation:rule="has(self.default) && self.default ? !has(self.nodeSelector) || size(self.nodeSelector) == 0 : true",message="default NVIDIADriver must not use nodeSelector"
4043
type NVIDIADriverSpec struct {
4144
// INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
4245
// Important: Run "make" to regenerate code after modifying this file
4346

47+
// Default indicates that this NVIDIADriver acts as the fallback driver daemon set manager for GPU nodes
48+
// that do not match any non-default NVIDIADriver nodeSelector.
49+
// +kubebuilder:default=false
50+
Default bool `json:"default"`
51+
4452
// +kubebuilder:validation:Enum=gpu;vgpu;vgpu-host-manager
4553
// +kubebuilder:default=gpu
4654
// +kubebuilder:validation:XValidation:rule="self == oldSelf",message="driverType is an immutable field. Please create a new NvidiaDriver resource instead when you want to change this setting."
@@ -504,6 +512,7 @@ type NVIDIADriverStatus struct {
504512
//+kubebuilder:subresource:status
505513
//+kubebuilder:resource:scope=Cluster,shortName={"nvd","nvdriver","nvdrivers"}
506514
//+kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.state`,priority=0
515+
//+kubebuilder:printcolumn:name="Default",type=boolean,JSONPath=`.spec.default`,priority=0
507516
//+kubebuilder:printcolumn:name="Age",type=string,JSONPath=`.metadata.creationTimestamp`,priority=0
508517

509518
// NVIDIADriver is the Schema for the nvidiadrivers API
@@ -524,6 +533,31 @@ type NVIDIADriverList struct {
524533
Items []NVIDIADriver `json:"items"`
525534
}
526535

536+
// IsDefault returns true when the NVIDIADriver is marked as the fallback driver.
537+
func (d *NVIDIADriver) IsDefault() bool {
538+
return d != nil && d.Spec.Default
539+
}
540+
541+
// HasDeletionTimestamp returns true when the NVIDIADriver is marked for deletion.
542+
func (d *NVIDIADriver) HasDeletionTimestamp() bool {
543+
return d != nil && !d.GetDeletionTimestamp().IsZero()
544+
}
545+
546+
// ValidateNodeSelector rejects selectors that use operator-managed routing labels
547+
// or scope the default fallback driver.
548+
func (d *NVIDIADriver) ValidateNodeSelector() error {
549+
if d == nil || d.Spec.NodeSelector == nil {
550+
return nil
551+
}
552+
if d.IsDefault() && len(d.Spec.NodeSelector) > 0 {
553+
return fmt.Errorf("default NVIDIADriver %q cannot use nodeSelector", d.Name)
554+
}
555+
if _, ok := d.Spec.NodeSelector[consts.NVIDIADriverOwnerLabel]; ok {
556+
return fmt.Errorf("NVIDIADriver %q nodeSelector cannot use reserved label %q", d.Name, consts.NVIDIADriverOwnerLabel)
557+
}
558+
return nil
559+
}
560+
527561
// UsePrecompiledDrivers returns true if usePrecompiled option is enabled in spec
528562
func (d *NVIDIADriverSpec) UsePrecompiledDrivers() bool {
529563
if d.UsePrecompiled == nil {

api/nvidia/v1alpha1/nvidiadriver_types_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,26 @@ import (
2020
"testing"
2121

2222
"github.com/stretchr/testify/require"
23+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2324
)
2425

2526
const (
2627
testDigest = "10d1df8034373061366d4fb17b364b3b28d766b54d5a0b700c1a5a75378cf125"
2728
)
2829

30+
func TestNVIDIADriverHasDeletionTimestamp(t *testing.T) {
31+
var nilDriver *NVIDIADriver
32+
require.False(t, nilDriver.HasDeletionTimestamp())
33+
require.False(t, (&NVIDIADriver{}).HasDeletionTimestamp())
34+
35+
deletionTimestamp := metav1.Now()
36+
require.True(t, (&NVIDIADriver{
37+
ObjectMeta: metav1.ObjectMeta{
38+
DeletionTimestamp: &deletionTimestamp,
39+
},
40+
}).HasDeletionTimestamp())
41+
}
42+
2943
func TestGetImagePath(t *testing.T) {
3044
testCases := []struct {
3145
description string

bundle/manifests/nvidia.com_nvidiadrivers.yaml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ spec:
2222
- jsonPath: .status.state
2323
name: Status
2424
type: string
25+
- jsonPath: .spec.default
26+
name: Default
27+
type: boolean
2528
- jsonPath: .metadata.creationTimestamp
2629
name: Age
2730
type: string
@@ -48,7 +51,10 @@ spec:
4851
metadata:
4952
type: object
5053
spec:
51-
description: NVIDIADriverSpec defines the desired state of NVIDIADriver
54+
description: |-
55+
NVIDIADriverSpec defines the desired state of NVIDIADriver.
56+
The CEL validation allows non-default drivers to use nodeSelector, but requires
57+
default drivers to leave nodeSelector unset or empty.
5258
properties:
5359
annotations:
5460
additionalProperties:
@@ -70,6 +76,12 @@ spec:
7076
name:
7177
type: string
7278
type: object
79+
default:
80+
default: false
81+
description: |-
82+
Default indicates that this NVIDIADriver acts as the fallback driver daemon set manager for GPU nodes
83+
that do not match any non-default NVIDIADriver nodeSelector.
84+
type: boolean
7385
driverType:
7486
default: gpu
7587
description: DriverType defines NVIDIA driver type
@@ -968,9 +980,14 @@ spec:
968980
type: string
969981
type: object
970982
required:
983+
- default
971984
- driverType
972985
- image
973986
type: object
987+
x-kubernetes-validations:
988+
- message: default NVIDIADriver must not use nodeSelector
989+
rule: 'has(self.default) && self.default ? !has(self.nodeSelector) ||
990+
size(self.nodeSelector) == 0 : true'
974991
status:
975992
description: NVIDIADriverStatus defines the observed state of NVIDIADriver
976993
properties:

config/crd/bases/nvidia.com_nvidiadrivers.yaml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ spec:
2222
- jsonPath: .status.state
2323
name: Status
2424
type: string
25+
- jsonPath: .spec.default
26+
name: Default
27+
type: boolean
2528
- jsonPath: .metadata.creationTimestamp
2629
name: Age
2730
type: string
@@ -48,7 +51,10 @@ spec:
4851
metadata:
4952
type: object
5053
spec:
51-
description: NVIDIADriverSpec defines the desired state of NVIDIADriver
54+
description: |-
55+
NVIDIADriverSpec defines the desired state of NVIDIADriver.
56+
The CEL validation allows non-default drivers to use nodeSelector, but requires
57+
default drivers to leave nodeSelector unset or empty.
5258
properties:
5359
annotations:
5460
additionalProperties:
@@ -70,6 +76,12 @@ spec:
7076
name:
7177
type: string
7278
type: object
79+
default:
80+
default: false
81+
description: |-
82+
Default indicates that this NVIDIADriver acts as the fallback driver daemon set manager for GPU nodes
83+
that do not match any non-default NVIDIADriver nodeSelector.
84+
type: boolean
7385
driverType:
7486
default: gpu
7587
description: DriverType defines NVIDIA driver type
@@ -968,9 +980,14 @@ spec:
968980
type: string
969981
type: object
970982
required:
983+
- default
971984
- driverType
972985
- image
973986
type: object
987+
x-kubernetes-validations:
988+
- message: default NVIDIADriver must not use nodeSelector
989+
rule: 'has(self.default) && self.default ? !has(self.nodeSelector) ||
990+
size(self.nodeSelector) == 0 : true'
974991
status:
975992
description: NVIDIADriverStatus defines the observed state of NVIDIADriver
976993
properties:

controllers/nvidiadriver_controller.go

Lines changed: 84 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ import (
4545
"github.com/NVIDIA/gpu-operator/controllers/clusterinfo"
4646
"github.com/NVIDIA/gpu-operator/internal/conditions"
4747
"github.com/NVIDIA/gpu-operator/internal/consts"
48+
nvidiadriverutil "github.com/NVIDIA/gpu-operator/internal/nvidiadriver"
4849
"github.com/NVIDIA/gpu-operator/internal/state"
4950
"github.com/NVIDIA/gpu-operator/internal/validator"
5051
)
@@ -83,9 +84,8 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
8384
if err := r.Get(ctx, req.NamespacedName, instance); err != nil {
8485
if apierrors.IsNotFound(err) {
8586
// Request object not found, could have been deleted after reconcile request.
86-
// Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
87-
// Return and don't requeue
88-
return reconcile.Result{}, nil
87+
// Re-run owner assignment so deleting the last NVIDIADriver clears stale node owner labels.
88+
return r.cleanupNVIDIADriverOwnerLabels(ctx)
8989
}
9090
wrappedErr := fmt.Errorf("error getting NVIDIADriver object: %w", err)
9191
logger.Error(err, "error getting NVIDIADriver object")
@@ -96,6 +96,10 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
9696
// Error reading the object - requeue the request.
9797
return reconcile.Result{}, wrappedErr
9898
}
99+
if instance.HasDeletionTimestamp() {
100+
logger.Info("NVIDIADriver delete requested; cleaning up owner labels")
101+
return r.cleanupNVIDIADriverOwnerLabels(ctx)
102+
}
99103

100104
// Get the singleton NVIDIA ClusterPolicy object in the cluster.
101105
clusterPolicyList := &gpuv1.ClusterPolicyList{}
@@ -152,6 +156,19 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
152156
return reconcile.Result{}, nil
153157
}
154158

159+
changed, err := nvidiadriverutil.AssignOwners(ctx, r.Client)
160+
if err != nil {
161+
logger.Error(err, "failed to assign NVIDIADriver owners to nodes")
162+
instance.Status.State = nvidiav1alpha1.NotReady
163+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
164+
logger.Error(condErr, "failed to set condition")
165+
}
166+
return reconcile.Result{}, err
167+
}
168+
if changed {
169+
return reconcile.Result{RequeueAfter: time.Second}, nil
170+
}
171+
155172
if instance.Spec.UsePrecompiledDrivers() && (instance.Spec.IsGDSEnabled() || instance.Spec.IsGDRCopyEnabled()) {
156173
err := errors.New("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers")
157174
logger.Error(err, "unsupported driver combination detected")
@@ -188,6 +205,11 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
188205
// Sync state and update status
189206
managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog)
190207

208+
if err := r.labelNodesWithOrphanedDriverPods(ctx); err != nil {
209+
logger.Error(err, "failed to label nodes with orphaned NVIDIA driver pods")
210+
return reconcile.Result{}, err
211+
}
212+
191213
// update CR status
192214
if err := r.updateCrStatus(ctx, instance, managerStatus); err != nil {
193215
return ctrl.Result{}, err
@@ -276,6 +298,63 @@ func (r *NVIDIADriverReconciler) enqueueAllNVIDIADrivers(ctx context.Context) []
276298
return reconcileRequests
277299
}
278300

301+
// enqueueNVIDIADriverReconcilers enqueues the NVIDIADriver that triggered the
302+
// event and all current NVIDIADriver instances. The triggering object is
303+
// included even for delete events so the NotFound reconcile path can clear
304+
// stale node owner labels.
305+
func (r *NVIDIADriverReconciler) enqueueNVIDIADriverReconcilers(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) []reconcile.Request {
306+
requests := r.enqueueAllNVIDIADrivers(ctx)
307+
if driver != nil {
308+
requests = append(requests, reconcile.Request{
309+
NamespacedName: types.NamespacedName{
310+
Name: driver.GetName(),
311+
Namespace: driver.GetNamespace(),
312+
},
313+
})
314+
}
315+
return dedupeReconcileRequests(requests)
316+
}
317+
318+
func dedupeReconcileRequests(requests []reconcile.Request) []reconcile.Request {
319+
seen := map[types.NamespacedName]struct{}{}
320+
deduped := make([]reconcile.Request, 0, len(requests))
321+
for _, request := range requests {
322+
if _, ok := seen[request.NamespacedName]; ok {
323+
continue
324+
}
325+
seen[request.NamespacedName] = struct{}{}
326+
deduped = append(deduped, request)
327+
}
328+
return deduped
329+
}
330+
331+
// cleanupNVIDIADriverOwnerLabels re-runs owner assignment after a delete event
332+
// when ClusterPolicy is still configured for NVIDIADriver mode. This lets
333+
// AssignOwners remove stale nvidia.com/gpu-operator.driver.owner labels when no remaining
334+
// NVIDIADriver matches a GPU node, including deletion of the last NVIDIADriver.
335+
func (r *NVIDIADriverReconciler) cleanupNVIDIADriverOwnerLabels(ctx context.Context) (reconcile.Result, error) {
336+
clusterPolicyList := &gpuv1.ClusterPolicyList{}
337+
if err := r.List(ctx, clusterPolicyList); err != nil {
338+
wrappedErr := fmt.Errorf("error getting ClusterPolicy list: %w", err)
339+
log.FromContext(ctx).Error(wrappedErr, "failed to cleanup NVIDIADriver owner labels")
340+
return reconcile.Result{}, wrappedErr
341+
}
342+
343+
if len(clusterPolicyList.Items) == 0 {
344+
return reconcile.Result{}, nil
345+
}
346+
347+
if !clusterPolicyList.Items[0].Spec.Driver.UseNvidiaDriverCRDType() {
348+
return reconcile.Result{}, nil
349+
}
350+
351+
if _, err := nvidiadriverutil.AssignOwners(ctx, r.Client); err != nil {
352+
log.FromContext(ctx).Error(err, "failed to cleanup NVIDIADriver owner labels")
353+
return reconcile.Result{}, err
354+
}
355+
return reconcile.Result{}, nil
356+
}
357+
279358
// SetupWithManager sets up the controller with the Manager.
280359
func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
281360
// Create state manager
@@ -307,8 +386,8 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.
307386

308387
// Watch for changes to NVIDIADriver CRs. Whenever an event is generated for a NVIDIADriver CR,
309388
// enqueue a reconcile request for all NVIDIADriver instances.
310-
nvidiaDriverMapFn := func(ctx context.Context, _ *nvidiav1alpha1.NVIDIADriver) []reconcile.Request {
311-
return r.enqueueAllNVIDIADrivers(ctx)
389+
nvidiaDriverMapFn := func(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) []reconcile.Request {
390+
return r.enqueueNVIDIADriverReconcilers(ctx, driver)
312391
}
313392

314393
// Watch for changes to the primary resource NVIDIADriver

0 commit comments

Comments
 (0)