Skip to content

Commit cf23d79

Browse files
committed
add support for default nvidiadriver and migration from clusterpolicy to nvidiadriver
changes include: * added default nvidiadriver which doesn't conflict with other user-specified nvidiadrivers * added migration support from clusterpolicy to nvidiadriver * added/updated e2e tests for nvidiadriver workflow Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>
1 parent b10eeb6 commit cf23d79

19 files changed

Lines changed: 1357 additions & 51 deletions

controllers/clusterpolicy_controller.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ import (
4242
"sigs.k8s.io/controller-runtime/pkg/source"
4343

4444
gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
45+
nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
4546
"github.com/NVIDIA/gpu-operator/internal/conditions"
4647
)
4748

@@ -143,6 +144,20 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
143144
}
144145

145146
clusterPolicyCtrl.operatorMetrics.reconciliationTotal.Inc()
147+
148+
nvidiaDriverEnabled := nvidiaDriverCRDEnabled(instance)
149+
if nvidiaDriverEnabled {
150+
if err := assignNVIDIADriverOwners(ctx, r.Client); err != nil {
151+
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady)
152+
clusterPolicyCtrl.operatorMetrics.reconciliationFailed.Inc()
153+
updateCRState(ctx, r, req.NamespacedName, gpuv1.NotReady)
154+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
155+
r.Log.Error(condErr, "failed to set condition")
156+
}
157+
return ctrl.Result{}, err
158+
}
159+
}
160+
146161
overallStatus := gpuv1.Ready
147162
statesNotReady := []string{}
148163
for {
@@ -170,6 +185,12 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
170185
}
171186
}
172187

188+
if nvidiaDriverEnabled {
189+
if err := clusterPolicyCtrl.labelNodesWithOrphanedDriverPods(ctx); err != nil {
190+
r.Log.Error(err, "failed to label nodes with orphaned NVIDIA driver pods")
191+
}
192+
}
193+
173194
// if any state is not ready, requeue for reconcile after 5 seconds
174195
if overallStatus != gpuv1.Ready {
175196
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady)
@@ -371,6 +392,29 @@ func (r *ClusterPolicyReconciler) SetupWithManager(ctx context.Context, mgr ctrl
371392
return err
372393
}
373394

395+
err = c.Watch(
396+
source.Kind(mgr.GetCache(),
397+
&nvidiav1alpha1.NVIDIADriver{},
398+
handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, _ *nvidiav1alpha1.NVIDIADriver) []reconcile.Request {
399+
list := &gpuv1.ClusterPolicyList{}
400+
if err := mgr.GetClient().List(ctx, list); err != nil {
401+
return []reconcile.Request{}
402+
}
403+
requests := make([]reconcile.Request, 0, len(list.Items))
404+
for _, item := range list.Items {
405+
requests = append(requests, reconcile.Request{
406+
NamespacedName: types.NamespacedName{Name: item.GetName()},
407+
})
408+
}
409+
return requests
410+
}),
411+
nvidiaDriverGenerationOrDefaultLabelChangedPredicate(),
412+
),
413+
)
414+
if err != nil {
415+
return err
416+
}
417+
374418
// TODO(user): Modify this to be the types you create that are owned by the primary resource
375419
// Watch for changes to secondary resource Daemonsets and requeue the owner ClusterPolicy
376420
err = c.Watch(

controllers/nvidiadriver_controller.go

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,9 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
110110
}
111111

112112
if len(clusterPolicyList.Items) == 0 {
113+
if handled, err := r.handleDefaultNVIDIADriverDeletion(ctx, instance); handled || err != nil {
114+
return reconcile.Result{}, err
115+
}
113116
err := fmt.Errorf("no ClusterPolicy object found in the cluster")
114117
logger.Error(err, "failed to get ClusterPolicy object")
115118
instance.Status.State = nvidiav1alpha1.NotReady
@@ -120,8 +123,12 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
120123
}
121124
clusterPolicyInstance := clusterPolicyList.Items[0]
122125

126+
if handled, err := r.handleDefaultNVIDIADriverDeletion(ctx, instance); handled || err != nil {
127+
return reconcile.Result{}, err
128+
}
129+
123130
// Ensure that ClusterPolicy is configured to use NVIDIADriver CRD
124-
if !clusterPolicyInstance.Spec.Driver.UseNvidiaDriverCRDType() {
131+
if !nvidiaDriverCRDEnabled(&clusterPolicyInstance) {
125132
msg := "useNvidiaDriverCRD is not enabled in ClusterPolicy"
126133
logger.V(consts.LogLevelWarning).Info("NVIDIADriver reconciliation skipped", "reason", msg)
127134
instance.Status.State = nvidiav1alpha1.Disabled
@@ -152,6 +159,15 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
152159
return reconcile.Result{}, nil
153160
}
154161

162+
if err := assignNVIDIADriverOwners(ctx, r.Client); err != nil {
163+
logger.Error(err, "failed to assign NVIDIADriver owners to nodes")
164+
instance.Status.State = nvidiav1alpha1.NotReady
165+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
166+
logger.Error(condErr, "failed to set condition")
167+
}
168+
return reconcile.Result{}, err
169+
}
170+
155171
if instance.Spec.UsePrecompiledDrivers() && (instance.Spec.IsGDSEnabled() || instance.Spec.IsGDRCopyEnabled()) {
156172
err := errors.New("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers")
157173
logger.Error(err, "unsupported driver combination detected")
@@ -221,6 +237,14 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
221237
return reconcile.Result{}, nil
222238
}
223239

240+
func (r *NVIDIADriverReconciler) handleDefaultNVIDIADriverDeletion(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) (bool, error) {
241+
if !isDefaultNVIDIADriver(driver) || driver.GetDeletionTimestamp() == nil {
242+
return false, nil
243+
}
244+
log.FromContext(ctx).Info("Default NVIDIADriver delete requested; skipping reconciliation")
245+
return true, nil
246+
}
247+
224248
func (r *NVIDIADriverReconciler) updateCrStatus(
225249
ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver, status state.Results) error {
226250
reqLogger := log.FromContext(ctx)
@@ -316,7 +340,7 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.
316340
mgr.GetCache(),
317341
&nvidiav1alpha1.NVIDIADriver{},
318342
handler.TypedEnqueueRequestsFromMapFunc(nvidiaDriverMapFn),
319-
predicate.TypedGenerationChangedPredicate[*nvidiav1alpha1.NVIDIADriver]{},
343+
nvidiaDriverGenerationOrDefaultLabelChangedPredicate(),
320344
),
321345
)
322346
if err != nil {
@@ -413,3 +437,30 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.
413437

414438
return nil
415439
}
440+
441+
// nvidiaDriverGenerationOrDefaultLabelChangedPredicate reconciles NVIDIADrivers on spec changes and
442+
// on changes to the default-driver label, which is metadata but affects ownership semantics.
443+
func nvidiaDriverGenerationOrDefaultLabelChangedPredicate() predicate.TypedPredicate[*nvidiav1alpha1.NVIDIADriver] {
444+
return predicate.TypedFuncs[*nvidiav1alpha1.NVIDIADriver]{
445+
CreateFunc: func(event.TypedCreateEvent[*nvidiav1alpha1.NVIDIADriver]) bool {
446+
return true
447+
},
448+
DeleteFunc: func(event.TypedDeleteEvent[*nvidiav1alpha1.NVIDIADriver]) bool {
449+
return true
450+
},
451+
UpdateFunc: func(e event.TypedUpdateEvent[*nvidiav1alpha1.NVIDIADriver]) bool {
452+
if e.ObjectOld == nil || e.ObjectNew == nil {
453+
return true
454+
}
455+
if e.ObjectOld.GetGeneration() != e.ObjectNew.GetGeneration() {
456+
return true
457+
}
458+
oldLabels := e.ObjectOld.GetLabels()
459+
newLabels := e.ObjectNew.GetLabels()
460+
return oldLabels[consts.DefaultNVIDIADriverLabel] != newLabels[consts.DefaultNVIDIADriverLabel]
461+
},
462+
GenericFunc: func(event.TypedGenericEvent[*nvidiav1alpha1.NVIDIADriver]) bool {
463+
return false
464+
},
465+
}
466+
}

controllers/nvidiadriver_controller_test.go

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,19 @@ import (
2828
"github.com/stretchr/testify/require"
2929
"go.uber.org/zap"
3030
"go.uber.org/zap/zapcore"
31+
corev1 "k8s.io/api/core/v1"
3132
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3233
"k8s.io/apimachinery/pkg/runtime"
3334
"k8s.io/apimachinery/pkg/types"
3435
"k8s.io/utils/ptr"
3536
ctrl "sigs.k8s.io/controller-runtime"
37+
"sigs.k8s.io/controller-runtime/pkg/client"
3638
"sigs.k8s.io/controller-runtime/pkg/client/fake"
39+
"sigs.k8s.io/controller-runtime/pkg/event"
3740

3841
gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
3942
nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
43+
"github.com/NVIDIA/gpu-operator/internal/consts"
4044
"github.com/NVIDIA/gpu-operator/internal/validator"
4145
)
4246

@@ -70,6 +74,15 @@ func (f *FakeNodeSelectorValidator) Validate(ctx context.Context, cr *nvidiav1al
7074
return f.CustomError
7175
}
7276

77+
type patchFailingClient struct {
78+
client.Client
79+
patchErr error
80+
}
81+
82+
func (c *patchFailingClient) Patch(ctx context.Context, obj client.Object, patch client.Patch, opts ...client.PatchOption) error {
83+
return c.patchErr
84+
}
85+
7386
// newTestLogger creates a zap.Logger that writes to an in-memory buffer for testing
7487
func newTestLogger() (logr.Logger, *bytes.Buffer) {
7588
buf := &bytes.Buffer{}
@@ -97,6 +110,7 @@ func TestReconcile(t *testing.T) {
97110
scheme := runtime.NewScheme()
98111
require.NoError(t, nvidiav1alpha1.AddToScheme(scheme))
99112
require.NoError(t, gpuv1.AddToScheme(scheme))
113+
require.NoError(t, corev1.AddToScheme(scheme))
100114

101115
tests := []struct {
102116
name string
@@ -247,6 +261,59 @@ func TestReconcileConflictSetsNotReadyState(t *testing.T) {
247261
require.Equal(t, nvidiav1alpha1.NotReady, updater.LastErrorState)
248262
}
249263

264+
func TestReconcileReturnsErrorWhenOwnerAssignmentFails(t *testing.T) {
265+
scheme := runtime.NewScheme()
266+
require.NoError(t, nvidiav1alpha1.AddToScheme(scheme))
267+
require.NoError(t, gpuv1.AddToScheme(scheme))
268+
require.NoError(t, corev1.AddToScheme(scheme))
269+
270+
driver := &nvidiav1alpha1.NVIDIADriver{
271+
ObjectMeta: metav1.ObjectMeta{
272+
Name: "test-driver",
273+
Namespace: "default",
274+
},
275+
Spec: nvidiav1alpha1.NVIDIADriverSpec{
276+
NodeSelector: map[string]string{"nodepool": "a"},
277+
},
278+
}
279+
cp := &gpuv1.ClusterPolicy{
280+
ObjectMeta: metav1.ObjectMeta{Name: "default"},
281+
Spec: gpuv1.ClusterPolicySpec{
282+
Driver: gpuv1.DriverSpec{
283+
UseNvidiaDriverCRD: ptr.To(true),
284+
},
285+
},
286+
}
287+
node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{
288+
Name: "gpu-node",
289+
Labels: map[string]string{
290+
"nodepool": "a",
291+
"nvidia.com/gpu.present": "true",
292+
},
293+
}}
294+
patchErr := errors.New("patch failed")
295+
k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(cp, driver, node).Build()
296+
updater := &FakeConditionUpdater{}
297+
298+
reconciler := &NVIDIADriverReconciler{
299+
Client: &patchFailingClient{Client: k8sClient, patchErr: patchErr},
300+
Scheme: scheme,
301+
conditionUpdater: updater,
302+
nodeSelectorValidator: &FakeNodeSelectorValidator{},
303+
}
304+
305+
_, err := reconciler.Reconcile(context.Background(), ctrl.Request{
306+
NamespacedName: types.NamespacedName{
307+
Name: driver.Name,
308+
Namespace: driver.Namespace,
309+
},
310+
})
311+
312+
require.Error(t, err)
313+
require.ErrorContains(t, err, patchErr.Error())
314+
require.Equal(t, nvidiav1alpha1.NotReady, updater.LastErrorState)
315+
}
316+
250317
func TestEnqueueAllNVIDIADrivers(t *testing.T) {
251318
scheme := runtime.NewScheme()
252319
require.NoError(t, nvidiav1alpha1.AddToScheme(scheme))
@@ -267,3 +334,45 @@ func TestEnqueueAllNVIDIADrivers(t *testing.T) {
267334
sort.Strings(got)
268335
require.Equal(t, []string{"default/driver-a", "default/driver-b"}, got)
269336
}
337+
338+
func TestNVIDIADriverGenerationOrDefaultLabelChangedPredicate(t *testing.T) {
339+
p := nvidiaDriverGenerationOrDefaultLabelChangedPredicate()
340+
341+
require.True(t, p.Update(event.TypedUpdateEvent[*nvidiav1alpha1.NVIDIADriver]{
342+
ObjectOld: &nvidiav1alpha1.NVIDIADriver{ObjectMeta: metav1.ObjectMeta{Name: "driver", Generation: 1}},
343+
ObjectNew: &nvidiav1alpha1.NVIDIADriver{ObjectMeta: metav1.ObjectMeta{Name: "driver", Generation: 2}},
344+
}))
345+
346+
require.True(t, p.Update(event.TypedUpdateEvent[*nvidiav1alpha1.NVIDIADriver]{
347+
ObjectOld: &nvidiav1alpha1.NVIDIADriver{
348+
ObjectMeta: metav1.ObjectMeta{
349+
Name: "driver",
350+
Generation: 1,
351+
},
352+
},
353+
ObjectNew: &nvidiav1alpha1.NVIDIADriver{
354+
ObjectMeta: metav1.ObjectMeta{
355+
Name: "driver",
356+
Generation: 1,
357+
Labels: map[string]string{consts.DefaultNVIDIADriverLabel: "true"},
358+
},
359+
},
360+
}))
361+
362+
require.False(t, p.Update(event.TypedUpdateEvent[*nvidiav1alpha1.NVIDIADriver]{
363+
ObjectOld: &nvidiav1alpha1.NVIDIADriver{
364+
ObjectMeta: metav1.ObjectMeta{
365+
Name: "driver",
366+
Generation: 1,
367+
Labels: map[string]string{"example.com/label": "old"},
368+
},
369+
},
370+
ObjectNew: &nvidiav1alpha1.NVIDIADriver{
371+
ObjectMeta: metav1.ObjectMeta{
372+
Name: "driver",
373+
Generation: 1,
374+
Labels: map[string]string{"example.com/label": "new"},
375+
},
376+
},
377+
}))
378+
}

0 commit comments

Comments
 (0)