@@ -45,6 +45,7 @@ import (
4545 "github.com/NVIDIA/gpu-operator/controllers/clusterinfo"
4646 "github.com/NVIDIA/gpu-operator/internal/conditions"
4747 "github.com/NVIDIA/gpu-operator/internal/consts"
48+ nvidiadriverutil "github.com/NVIDIA/gpu-operator/internal/nvidiadriver"
4849 "github.com/NVIDIA/gpu-operator/internal/state"
4950 "github.com/NVIDIA/gpu-operator/internal/validator"
5051)
@@ -83,9 +84,8 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
8384 if err := r .Get (ctx , req .NamespacedName , instance ); err != nil {
8485 if apierrors .IsNotFound (err ) {
8586 // Request object not found, could have been deleted after reconcile request.
86- // Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
87- // Return and don't requeue
88- return reconcile.Result {}, nil
87+ // Re-run owner assignment so deleting the last NVIDIADriver clears stale node owner labels.
88+ return r .cleanupNVIDIADriverOwnerLabelsAndReturn (ctx )
8989 }
9090 wrappedErr := fmt .Errorf ("error getting NVIDIADriver object: %w" , err )
9191 logger .Error (err , "error getting NVIDIADriver object" )
@@ -96,6 +96,10 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
9696 // Error reading the object - requeue the request.
9797 return reconcile.Result {}, wrappedErr
9898 }
99+ if ! instance .GetDeletionTimestamp ().IsZero () {
100+ logger .Info ("NVIDIADriver delete requested; cleaning up owner labels" )
101+ return r .cleanupNVIDIADriverOwnerLabelsAndReturn (ctx )
102+ }
99103
100104 // Get the singleton NVIDIA ClusterPolicy object in the cluster.
101105 clusterPolicyList := & gpuv1.ClusterPolicyList {}
@@ -121,7 +125,7 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
121125 clusterPolicyInstance := clusterPolicyList .Items [0 ]
122126
123127 // Ensure that ClusterPolicy is configured to use NVIDIADriver CRD
124- if ! clusterPolicyInstance .Spec .Driver .UseNvidiaDriverCRDType () {
128+ if ! clusterPolicyInstance .Spec .Driver .IsNVIDIADriverCRDEnabled () {
125129 msg := "useNvidiaDriverCRD is not enabled in ClusterPolicy"
126130 logger .V (consts .LogLevelWarning ).Info ("NVIDIADriver reconciliation skipped" , "reason" , msg )
127131 instance .Status .State = nvidiav1alpha1 .Disabled
@@ -152,6 +156,19 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
152156 return reconcile.Result {}, nil
153157 }
154158
159+ changed , err := nvidiadriverutil .AssignOwners (ctx , r .Client )
160+ if err != nil {
161+ logger .Error (err , "failed to assign NVIDIADriver owners to nodes" )
162+ instance .Status .State = nvidiav1alpha1 .NotReady
163+ if condErr := r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ReconcileFailed , err .Error ()); condErr != nil {
164+ logger .Error (condErr , "failed to set condition" )
165+ }
166+ return reconcile.Result {}, err
167+ }
168+ if changed {
169+ return reconcile.Result {RequeueAfter : time .Second }, nil
170+ }
171+
155172 if instance .Spec .UsePrecompiledDrivers () && (instance .Spec .IsGDSEnabled () || instance .Spec .IsGDRCopyEnabled ()) {
156173 err := errors .New ("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers" )
157174 logger .Error (err , "unsupported driver combination detected" )
@@ -188,6 +205,11 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
188205 // Sync state and update status
189206 managerStatus := r .stateManager .SyncState (ctx , instance , infoCatalog )
190207
208+ if err := r .labelNodesWithOrphanedDriverPods (ctx ); err != nil {
209+ logger .Error (err , "failed to label nodes with orphaned NVIDIA driver pods" )
210+ return reconcile.Result {}, err
211+ }
212+
191213 // update CR status
192214 if err := r .updateCrStatus (ctx , instance , managerStatus ); err != nil {
193215 return ctrl.Result {}, err
@@ -276,6 +298,65 @@ func (r *NVIDIADriverReconciler) enqueueAllNVIDIADrivers(ctx context.Context) []
276298 return reconcileRequests
277299}
278300
301+ // enqueueNVIDIADriverReconcilers enqueues the NVIDIADriver that triggered the
302+ // event and all current NVIDIADriver instances. The triggering object is
303+ // included even for delete events so the NotFound reconcile path can clear
304+ // stale node owner labels.
305+ func (r * NVIDIADriverReconciler ) enqueueNVIDIADriverReconcilers (ctx context.Context , driver * nvidiav1alpha1.NVIDIADriver ) []reconcile.Request {
306+ requests := r .enqueueAllNVIDIADrivers (ctx )
307+ if driver != nil {
308+ requests = append (requests , reconcile.Request {
309+ NamespacedName : types.NamespacedName {
310+ Name : driver .GetName (),
311+ Namespace : driver .GetNamespace (),
312+ },
313+ })
314+ }
315+ return dedupeReconcileRequests (requests )
316+ }
317+
318+ func dedupeReconcileRequests (requests []reconcile.Request ) []reconcile.Request {
319+ seen := map [types.NamespacedName ]struct {}{}
320+ deduped := make ([]reconcile.Request , 0 , len (requests ))
321+ for _ , request := range requests {
322+ if _ , ok := seen [request .NamespacedName ]; ok {
323+ continue
324+ }
325+ seen [request .NamespacedName ] = struct {}{}
326+ deduped = append (deduped , request )
327+ }
328+ return deduped
329+ }
330+
331+ func (r * NVIDIADriverReconciler ) cleanupNVIDIADriverOwnerLabelsAndReturn (ctx context.Context ) (reconcile.Result , error ) {
332+ if err := r .cleanupNVIDIADriverOwnerLabels (ctx ); err != nil {
333+ log .FromContext (ctx ).Error (err , "failed to cleanup NVIDIADriver owner labels" )
334+ return reconcile.Result {}, err
335+ }
336+ return reconcile.Result {}, nil
337+ }
338+
339+ // cleanupNVIDIADriverOwnerLabels re-runs owner assignment after a delete event
340+ // when ClusterPolicy is still configured for NVIDIADriver mode. This lets
341+ // AssignOwners remove stale nvidia.com/gpu-operator.driver.owner labels when no remaining
342+ // NVIDIADriver matches a GPU node, including deletion of the last NVIDIADriver.
343+ func (r * NVIDIADriverReconciler ) cleanupNVIDIADriverOwnerLabels (ctx context.Context ) error {
344+ clusterPolicyList := & gpuv1.ClusterPolicyList {}
345+ if err := r .List (ctx , clusterPolicyList ); err != nil {
346+ return fmt .Errorf ("error getting ClusterPolicy list: %w" , err )
347+ }
348+
349+ for i := range clusterPolicyList .Items {
350+ if ! clusterPolicyList .Items [i ].Spec .Driver .IsNVIDIADriverCRDEnabled () {
351+ continue
352+ }
353+ _ , err := nvidiadriverutil .AssignOwners (ctx , r .Client )
354+ return err
355+ }
356+
357+ return nil
358+ }
359+
279360// SetupWithManager sets up the controller with the Manager.
280361func (r * NVIDIADriverReconciler ) SetupWithManager (ctx context.Context , mgr ctrl.Manager ) error {
281362 // Create state manager
@@ -307,8 +388,8 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.
307388
308389 // Watch for changes to NVIDIADriver CRs. Whenever an event is generated for a NVIDIADriver CR,
309390 // enqueue a reconcile request for all NVIDIADriver instances.
310- nvidiaDriverMapFn := func (ctx context.Context , _ * nvidiav1alpha1.NVIDIADriver ) []reconcile.Request {
311- return r .enqueueAllNVIDIADrivers (ctx )
391+ nvidiaDriverMapFn := func (ctx context.Context , driver * nvidiav1alpha1.NVIDIADriver ) []reconcile.Request {
392+ return r .enqueueNVIDIADriverReconcilers (ctx , driver )
312393 }
313394
314395 // Watch for changes to the primary resource NVIDIADriver
0 commit comments