Skip to content

Commit 452812c

Browse files
committed
add support for default nvidiadriver and migration from clusterpolicy to nvidiadriver
changes include: * added default nvidiadriver which doesn't conflict with other user-specified nvidiadrivers * added migration support from clusterpolicy to nvidiadriver * added/updated e2e tests for nvidiadriver workflow * use new field to mark a nvidiadriver as default instead of using a label * don't allow adding nodeselector to default nvidiadriver, it should cover entire cluster * move default NVIDIADriver ownership helpers into internal/nvidiadriver and re-use * requeue after node owner label changes so later reconcile steps read updated cache state * use patch instead of update for node label updates Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>
1 parent 64714d6 commit 452812c

25 files changed

Lines changed: 1651 additions & 59 deletions

api/nvidia/v1alpha1/nvidiadriver_types.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,19 @@ const (
3636
// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
3737
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
3838

39-
// NVIDIADriverSpec defines the desired state of NVIDIADriver
39+
// NVIDIADriverSpec defines the desired state of NVIDIADriver.
40+
// The CEL validation allows non-default drivers to use nodeSelector, but requires
41+
// default drivers to leave nodeSelector unset or empty.
42+
// +kubebuilder:validation:XValidation:rule="has(self.default) && self.default ? !has(self.nodeSelector) || size(self.nodeSelector) == 0 : true",message="default NVIDIADriver must not use nodeSelector"
4043
type NVIDIADriverSpec struct {
4144
// INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
4245
// Important: Run "make" to regenerate code after modifying this file
4346

47+
// Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes
48+
// that do not match any non-default NVIDIADriver nodeSelector.
49+
// +kubebuilder:default=false
50+
Default bool `json:"default"`
51+
4452
// +kubebuilder:validation:Enum=gpu;vgpu;vgpu-host-manager
4553
// +kubebuilder:default=gpu
4654
// +kubebuilder:validation:XValidation:rule="self == oldSelf",message="driverType is an immutable field. Please create a new NvidiaDriver resource instead when you want to change this setting."
@@ -504,6 +512,7 @@ type NVIDIADriverStatus struct {
504512
//+kubebuilder:subresource:status
505513
//+kubebuilder:resource:scope=Cluster,shortName={"nvd","nvdriver","nvdrivers"}
506514
//+kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.state`,priority=0
515+
//+kubebuilder:printcolumn:name="Default",type=boolean,JSONPath=`.spec.default`,priority=0
507516
//+kubebuilder:printcolumn:name="Age",type=string,JSONPath=`.metadata.creationTimestamp`,priority=0
508517

509518
// NVIDIADriver is the Schema for the nvidiadrivers API

bundle/manifests/nvidia.com_nvidiadrivers.yaml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ spec:
2222
- jsonPath: .status.state
2323
name: Status
2424
type: string
25+
- jsonPath: .spec.default
26+
name: Default
27+
type: boolean
2528
- jsonPath: .metadata.creationTimestamp
2629
name: Age
2730
type: string
@@ -48,7 +51,10 @@ spec:
4851
metadata:
4952
type: object
5053
spec:
51-
description: NVIDIADriverSpec defines the desired state of NVIDIADriver
54+
description: |-
55+
NVIDIADriverSpec defines the desired state of NVIDIADriver.
56+
The CEL validation allows non-default drivers to use nodeSelector, but requires
57+
default drivers to leave nodeSelector unset or empty.
5258
properties:
5359
annotations:
5460
additionalProperties:
@@ -70,6 +76,12 @@ spec:
7076
name:
7177
type: string
7278
type: object
79+
default:
80+
default: false
81+
description: |-
82+
Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes
83+
that do not match any non-default NVIDIADriver nodeSelector.
84+
type: boolean
7385
driverType:
7486
default: gpu
7587
description: DriverType defines NVIDIA driver type
@@ -968,9 +980,14 @@ spec:
968980
type: string
969981
type: object
970982
required:
983+
- default
971984
- driverType
972985
- image
973986
type: object
987+
x-kubernetes-validations:
988+
- message: default NVIDIADriver must not use nodeSelector
989+
rule: 'has(self.default) && self.default ? !has(self.nodeSelector) ||
990+
size(self.nodeSelector) == 0 : true'
974991
status:
975992
description: NVIDIADriverStatus defines the observed state of NVIDIADriver
976993
properties:

config/crd/bases/nvidia.com_nvidiadrivers.yaml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ spec:
2222
- jsonPath: .status.state
2323
name: Status
2424
type: string
25+
- jsonPath: .spec.default
26+
name: Default
27+
type: boolean
2528
- jsonPath: .metadata.creationTimestamp
2629
name: Age
2730
type: string
@@ -48,7 +51,10 @@ spec:
4851
metadata:
4952
type: object
5053
spec:
51-
description: NVIDIADriverSpec defines the desired state of NVIDIADriver
54+
description: |-
55+
NVIDIADriverSpec defines the desired state of NVIDIADriver.
56+
The CEL validation allows non-default drivers to use nodeSelector, but requires
57+
default drivers to leave nodeSelector unset or empty.
5258
properties:
5359
annotations:
5460
additionalProperties:
@@ -70,6 +76,12 @@ spec:
7076
name:
7177
type: string
7278
type: object
79+
default:
80+
default: false
81+
description: |-
82+
Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes
83+
that do not match any non-default NVIDIADriver nodeSelector.
84+
type: boolean
7385
driverType:
7486
default: gpu
7587
description: DriverType defines NVIDIA driver type
@@ -968,9 +980,14 @@ spec:
968980
type: string
969981
type: object
970982
required:
983+
- default
971984
- driverType
972985
- image
973986
type: object
987+
x-kubernetes-validations:
988+
- message: default NVIDIADriver must not use nodeSelector
989+
rule: 'has(self.default) && self.default ? !has(self.nodeSelector) ||
990+
size(self.nodeSelector) == 0 : true'
974991
status:
975992
description: NVIDIADriverStatus defines the observed state of NVIDIADriver
976993
properties:

controllers/clusterpolicy_controller.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@ import (
4242
"sigs.k8s.io/controller-runtime/pkg/source"
4343

4444
gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
45+
nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
4546
"github.com/NVIDIA/gpu-operator/internal/conditions"
47+
nvidiadriverutil "github.com/NVIDIA/gpu-operator/internal/nvidiadriver"
4648
)
4749

4850
const (
@@ -143,6 +145,24 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
143145
}
144146

145147
clusterPolicyCtrl.operatorMetrics.reconciliationTotal.Inc()
148+
149+
nvidiaDriverEnabled := nvidiadriverutil.CRDEnabled(instance)
150+
if nvidiaDriverEnabled {
151+
changed, err := nvidiadriverutil.AssignOwners(ctx, r.Client)
152+
if err != nil {
153+
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady)
154+
clusterPolicyCtrl.operatorMetrics.reconciliationFailed.Inc()
155+
updateCRState(ctx, r, req.NamespacedName, gpuv1.NotReady)
156+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
157+
r.Log.Error(condErr, "failed to set condition")
158+
}
159+
return ctrl.Result{}, err
160+
}
161+
if changed {
162+
return ctrl.Result{RequeueAfter: time.Second}, nil
163+
}
164+
}
165+
146166
overallStatus := gpuv1.Ready
147167
statesNotReady := []string{}
148168
for {
@@ -170,6 +190,12 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
170190
}
171191
}
172192

193+
if nvidiaDriverEnabled {
194+
if err := clusterPolicyCtrl.labelNodesWithOrphanedDriverPods(ctx); err != nil {
195+
r.Log.Error(err, "failed to label nodes with orphaned NVIDIA driver pods")
196+
}
197+
}
198+
173199
// if any state is not ready, requeue for reconcile after 5 seconds
174200
if overallStatus != gpuv1.Ready {
175201
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady)
@@ -371,6 +397,29 @@ func (r *ClusterPolicyReconciler) SetupWithManager(ctx context.Context, mgr ctrl
371397
return err
372398
}
373399

400+
err = c.Watch(
401+
source.Kind(mgr.GetCache(),
402+
&nvidiav1alpha1.NVIDIADriver{},
403+
handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, _ *nvidiav1alpha1.NVIDIADriver) []reconcile.Request {
404+
list := &gpuv1.ClusterPolicyList{}
405+
if err := mgr.GetClient().List(ctx, list); err != nil {
406+
return []reconcile.Request{}
407+
}
408+
requests := make([]reconcile.Request, 0, len(list.Items))
409+
for _, item := range list.Items {
410+
requests = append(requests, reconcile.Request{
411+
NamespacedName: types.NamespacedName{Name: item.GetName()},
412+
})
413+
}
414+
return requests
415+
}),
416+
nvidiaDriverGenerationChangedPredicate(),
417+
),
418+
)
419+
if err != nil {
420+
return err
421+
}
422+
374423
// TODO(user): Modify this to be the types you create that are owned by the primary resource
375424
// Watch for changes to secondary resource Daemonsets and requeue the owner ClusterPolicy
376425
err = c.Watch(

controllers/nvidiadriver_controller.go

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ import (
4545
"github.com/NVIDIA/gpu-operator/controllers/clusterinfo"
4646
"github.com/NVIDIA/gpu-operator/internal/conditions"
4747
"github.com/NVIDIA/gpu-operator/internal/consts"
48+
nvidiadriverutil "github.com/NVIDIA/gpu-operator/internal/nvidiadriver"
4849
"github.com/NVIDIA/gpu-operator/internal/state"
4950
"github.com/NVIDIA/gpu-operator/internal/validator"
5051
)
@@ -96,6 +97,10 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
9697
// Error reading the object - requeue the request.
9798
return reconcile.Result{}, wrappedErr
9899
}
100+
if !instance.GetDeletionTimestamp().IsZero() {
101+
logger.Info("NVIDIADriver delete requested; skipping reconciliation")
102+
return reconcile.Result{}, nil
103+
}
99104

100105
// Get the singleton NVIDIA ClusterPolicy object in the cluster.
101106
clusterPolicyList := &gpuv1.ClusterPolicyList{}
@@ -121,7 +126,7 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
121126
clusterPolicyInstance := clusterPolicyList.Items[0]
122127

123128
// Ensure that ClusterPolicy is configured to use NVIDIADriver CRD
124-
if !clusterPolicyInstance.Spec.Driver.UseNvidiaDriverCRDType() {
129+
if !nvidiadriverutil.CRDEnabled(&clusterPolicyInstance) {
125130
msg := "useNvidiaDriverCRD is not enabled in ClusterPolicy"
126131
logger.V(consts.LogLevelWarning).Info("NVIDIADriver reconciliation skipped", "reason", msg)
127132
instance.Status.State = nvidiav1alpha1.Disabled
@@ -152,6 +157,19 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
152157
return reconcile.Result{}, nil
153158
}
154159

160+
changed, err := nvidiadriverutil.AssignOwners(ctx, r.Client)
161+
if err != nil {
162+
logger.Error(err, "failed to assign NVIDIADriver owners to nodes")
163+
instance.Status.State = nvidiav1alpha1.NotReady
164+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
165+
logger.Error(condErr, "failed to set condition")
166+
}
167+
return reconcile.Result{}, err
168+
}
169+
if changed {
170+
return reconcile.Result{RequeueAfter: time.Second}, nil
171+
}
172+
155173
if instance.Spec.UsePrecompiledDrivers() && (instance.Spec.IsGDSEnabled() || instance.Spec.IsGDRCopyEnabled()) {
156174
err := errors.New("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers")
157175
logger.Error(err, "unsupported driver combination detected")
@@ -316,7 +334,7 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.
316334
mgr.GetCache(),
317335
&nvidiav1alpha1.NVIDIADriver{},
318336
handler.TypedEnqueueRequestsFromMapFunc(nvidiaDriverMapFn),
319-
predicate.TypedGenerationChangedPredicate[*nvidiav1alpha1.NVIDIADriver]{},
337+
nvidiaDriverGenerationChangedPredicate(),
320338
),
321339
)
322340
if err != nil {
@@ -413,3 +431,27 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.
413431

414432
return nil
415433
}
434+
435+
// nvidiaDriverGenerationChangedPredicate reconciles NVIDIADrivers on create, delete, and spec changes.
436+
func nvidiaDriverGenerationChangedPredicate() predicate.TypedPredicate[*nvidiav1alpha1.NVIDIADriver] {
437+
return predicate.TypedFuncs[*nvidiav1alpha1.NVIDIADriver]{
438+
CreateFunc: func(event.TypedCreateEvent[*nvidiav1alpha1.NVIDIADriver]) bool {
439+
return true
440+
},
441+
DeleteFunc: func(event.TypedDeleteEvent[*nvidiav1alpha1.NVIDIADriver]) bool {
442+
return true
443+
},
444+
UpdateFunc: func(e event.TypedUpdateEvent[*nvidiav1alpha1.NVIDIADriver]) bool {
445+
if e.ObjectOld == nil || e.ObjectNew == nil {
446+
return true
447+
}
448+
if e.ObjectOld.GetGeneration() != e.ObjectNew.GetGeneration() {
449+
return true
450+
}
451+
return false
452+
},
453+
GenericFunc: func(event.TypedGenericEvent[*nvidiav1alpha1.NVIDIADriver]) bool {
454+
return false
455+
},
456+
}
457+
}

0 commit comments

Comments
 (0)