@@ -45,6 +45,7 @@ import (
4545 "github.com/NVIDIA/gpu-operator/controllers/clusterinfo"
4646 "github.com/NVIDIA/gpu-operator/internal/conditions"
4747 "github.com/NVIDIA/gpu-operator/internal/consts"
48+ nvidiadriverutil "github.com/NVIDIA/gpu-operator/internal/nvidiadriver"
4849 "github.com/NVIDIA/gpu-operator/internal/state"
4950 "github.com/NVIDIA/gpu-operator/internal/validator"
5051)
@@ -83,9 +84,8 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
8384 if err := r .Get (ctx , req .NamespacedName , instance ); err != nil {
8485 if apierrors .IsNotFound (err ) {
8586 // Request object not found, could have been deleted after reconcile request.
86- // Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
87- // Return and don't requeue
88- return reconcile.Result {}, nil
87+ // Re-run owner assignment so deleting the last NVIDIADriver clears stale node owner labels.
88+ return r .cleanupNVIDIADriverOwnerLabels (ctx )
8989 }
9090 wrappedErr := fmt .Errorf ("error getting NVIDIADriver object: %w" , err )
9191 logger .Error (err , "error getting NVIDIADriver object" )
@@ -96,6 +96,10 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
9696 // Error reading the object - requeue the request.
9797 return reconcile.Result {}, wrappedErr
9898 }
99+ if instance .HasDeletionTimestamp () {
100+ logger .Info ("NVIDIADriver delete requested; cleaning up owner labels" )
101+ return r .cleanupNVIDIADriverOwnerLabels (ctx )
102+ }
99103
100104 // Get the singleton NVIDIA ClusterPolicy object in the cluster.
101105 clusterPolicyList := & gpuv1.ClusterPolicyList {}
@@ -152,6 +156,19 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
152156 return reconcile.Result {}, nil
153157 }
154158
159+ changed , err := nvidiadriverutil .AssignOwners (ctx , r .Client )
160+ if err != nil {
161+ logger .Error (err , "failed to assign NVIDIADriver owners to nodes" )
162+ instance .Status .State = nvidiav1alpha1 .NotReady
163+ if condErr := r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ReconcileFailed , err .Error ()); condErr != nil {
164+ logger .Error (condErr , "failed to set condition" )
165+ }
166+ return reconcile.Result {}, err
167+ }
168+ if changed {
169+ return reconcile.Result {RequeueAfter : time .Second }, nil
170+ }
171+
155172 if instance .Spec .UsePrecompiledDrivers () && (instance .Spec .IsGDSEnabled () || instance .Spec .IsGDRCopyEnabled ()) {
156173 err := errors .New ("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers" )
157174 logger .Error (err , "unsupported driver combination detected" )
@@ -188,6 +205,11 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
188205 // Sync state and update status
189206 managerStatus := r .stateManager .SyncState (ctx , instance , infoCatalog )
190207
208+ if err := r .labelNodesWithOrphanedDriverPods (ctx ); err != nil {
209+ logger .Error (err , "failed to label nodes with orphaned NVIDIA driver pods" )
210+ return reconcile.Result {}, err
211+ }
212+
191213 // update CR status
192214 if err := r .updateCrStatus (ctx , instance , managerStatus ); err != nil {
193215 return ctrl.Result {}, err
@@ -276,6 +298,63 @@ func (r *NVIDIADriverReconciler) enqueueAllNVIDIADrivers(ctx context.Context) []
276298 return reconcileRequests
277299}
278300
301+ // enqueueNVIDIADriverReconcilers enqueues the NVIDIADriver that triggered the
302+ // event and all current NVIDIADriver instances. The triggering object is
303+ // included even for delete events so the NotFound reconcile path can clear
304+ // stale node owner labels.
305+ func (r * NVIDIADriverReconciler ) enqueueNVIDIADriverReconcilers (ctx context.Context , driver * nvidiav1alpha1.NVIDIADriver ) []reconcile.Request {
306+ requests := r .enqueueAllNVIDIADrivers (ctx )
307+ if driver != nil {
308+ requests = append (requests , reconcile.Request {
309+ NamespacedName : types.NamespacedName {
310+ Name : driver .GetName (),
311+ Namespace : driver .GetNamespace (),
312+ },
313+ })
314+ }
315+ return dedupeReconcileRequests (requests )
316+ }
317+
318+ func dedupeReconcileRequests (requests []reconcile.Request ) []reconcile.Request {
319+ seen := map [types.NamespacedName ]struct {}{}
320+ deduped := make ([]reconcile.Request , 0 , len (requests ))
321+ for _ , request := range requests {
322+ if _ , ok := seen [request .NamespacedName ]; ok {
323+ continue
324+ }
325+ seen [request .NamespacedName ] = struct {}{}
326+ deduped = append (deduped , request )
327+ }
328+ return deduped
329+ }
330+
331+ // cleanupNVIDIADriverOwnerLabels re-runs owner assignment after a delete event
332+ // when ClusterPolicy is still configured for NVIDIADriver mode. This lets
333+ // AssignOwners remove stale nvidia.com/gpu-operator.driver.owner labels when no remaining
334+ // NVIDIADriver matches a GPU node, including deletion of the last NVIDIADriver.
335+ func (r * NVIDIADriverReconciler ) cleanupNVIDIADriverOwnerLabels (ctx context.Context ) (reconcile.Result , error ) {
336+ clusterPolicyList := & gpuv1.ClusterPolicyList {}
337+ if err := r .List (ctx , clusterPolicyList ); err != nil {
338+ wrappedErr := fmt .Errorf ("error getting ClusterPolicy list: %w" , err )
339+ log .FromContext (ctx ).Error (wrappedErr , "failed to cleanup NVIDIADriver owner labels" )
340+ return reconcile.Result {}, wrappedErr
341+ }
342+
343+ if len (clusterPolicyList .Items ) == 0 {
344+ return reconcile.Result {}, nil
345+ }
346+
347+ if ! clusterPolicyList .Items [0 ].Spec .Driver .UseNvidiaDriverCRDType () {
348+ return reconcile.Result {}, nil
349+ }
350+
351+ if _ , err := nvidiadriverutil .AssignOwners (ctx , r .Client ); err != nil {
352+ log .FromContext (ctx ).Error (err , "failed to cleanup NVIDIADriver owner labels" )
353+ return reconcile.Result {}, err
354+ }
355+ return reconcile.Result {}, nil
356+ }
357+
279358// SetupWithManager sets up the controller with the Manager.
280359func (r * NVIDIADriverReconciler ) SetupWithManager (ctx context.Context , mgr ctrl.Manager ) error {
281360 // Create state manager
@@ -307,8 +386,8 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.
307386
308387 // Watch for changes to NVIDIADriver CRs. Whenever an event is generated for a NVIDIADriver CR,
309388 // enqueue a reconcile request for the triggering NVIDIADriver and all current NVIDIADriver instances.
310- nvidiaDriverMapFn := func (ctx context.Context , _ * nvidiav1alpha1.NVIDIADriver ) []reconcile.Request {
311- return r .enqueueAllNVIDIADrivers (ctx )
389+ nvidiaDriverMapFn := func (ctx context.Context , driver * nvidiav1alpha1.NVIDIADriver ) []reconcile.Request {
390+ return r .enqueueNVIDIADriverReconcilers (ctx , driver )
312391 }
313392
314393 // Watch for changes to the primary resource NVIDIADriver
0 commit comments