Skip to content

Commit 2e803be

Browse files
committed
address review comments
changes include: * move default NVIDIADriver ownership helpers into internal/nvidiadriver and re-use * requeue after node owner label changes so later reconcile steps read updated cache state * use patch instead of update for node label updates Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>
1 parent e7b72ea commit 2e803be

14 files changed

Lines changed: 144 additions & 110 deletions

File tree

api/nvidia/v1alpha1/nvidiadriver_types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ const (
3939
// NVIDIADriverSpec defines the desired state of NVIDIADriver.
4040
// The CEL validation allows non-default drivers to use nodeSelector, but requires
4141
// default drivers to leave nodeSelector unset or empty.
42-
// +kubebuilder:validation:XValidation:rule="has(self.default) && self.default ? !has(self.nodeSelector) || size(self.nodeSelector) == 0 : true",message="default NVIDIADriver cannot use nodeSelector"
42+
// +kubebuilder:validation:XValidation:rule="has(self.default) && self.default ? !has(self.nodeSelector) || size(self.nodeSelector) == 0 : true",message="default NVIDIADriver must not use nodeSelector"
4343
type NVIDIADriverSpec struct {
4444
// INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
4545
// Important: Run "make" to regenerate code after modifying this file

bundle/manifests/nvidia.com_nvidiadrivers.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -985,7 +985,7 @@ spec:
985985
- image
986986
type: object
987987
x-kubernetes-validations:
988-
- message: default NVIDIADriver cannot use nodeSelector
988+
- message: default NVIDIADriver must not use nodeSelector
989989
rule: 'has(self.default) && self.default ? !has(self.nodeSelector) ||
990990
size(self.nodeSelector) == 0 : true'
991991
status:

config/crd/bases/nvidia.com_nvidiadrivers.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -985,7 +985,7 @@ spec:
985985
- image
986986
type: object
987987
x-kubernetes-validations:
988-
- message: default NVIDIADriver cannot use nodeSelector
988+
- message: default NVIDIADriver must not use nodeSelector
989989
rule: 'has(self.default) && self.default ? !has(self.nodeSelector) ||
990990
size(self.nodeSelector) == 0 : true'
991991
status:

controllers/clusterpolicy_controller.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ import (
4444
gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
4545
nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
4646
"github.com/NVIDIA/gpu-operator/internal/conditions"
47+
nvidiadriverutil "github.com/NVIDIA/gpu-operator/internal/nvidiadriver"
4748
)
4849

4950
const (
@@ -145,9 +146,10 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
145146

146147
clusterPolicyCtrl.operatorMetrics.reconciliationTotal.Inc()
147148

148-
nvidiaDriverEnabled := nvidiaDriverCRDEnabled(instance)
149+
nvidiaDriverEnabled := nvidiadriverutil.CRDEnabled(instance)
149150
if nvidiaDriverEnabled {
150-
if err := assignNVIDIADriverOwners(ctx, r.Client); err != nil {
151+
changed, err := nvidiadriverutil.AssignOwners(ctx, r.Client)
152+
if err != nil {
151153
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady)
152154
clusterPolicyCtrl.operatorMetrics.reconciliationFailed.Inc()
153155
updateCRState(ctx, r, req.NamespacedName, gpuv1.NotReady)
@@ -156,6 +158,9 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
156158
}
157159
return ctrl.Result{}, err
158160
}
161+
if changed {
162+
return ctrl.Result{RequeueAfter: time.Second}, nil
163+
}
159164
}
160165

161166
overallStatus := gpuv1.Ready

controllers/nvidiadriver_controller.go

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ import (
4545
"github.com/NVIDIA/gpu-operator/controllers/clusterinfo"
4646
"github.com/NVIDIA/gpu-operator/internal/conditions"
4747
"github.com/NVIDIA/gpu-operator/internal/consts"
48+
nvidiadriverutil "github.com/NVIDIA/gpu-operator/internal/nvidiadriver"
4849
"github.com/NVIDIA/gpu-operator/internal/state"
4950
"github.com/NVIDIA/gpu-operator/internal/validator"
5051
)
@@ -96,6 +97,10 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
9697
// Error reading the object - requeue the request.
9798
return reconcile.Result{}, wrappedErr
9899
}
100+
if !instance.GetDeletionTimestamp().IsZero() {
101+
logger.Info("NVIDIADriver delete requested; skipping reconciliation")
102+
return reconcile.Result{}, nil
103+
}
99104

100105
// Get the singleton NVIDIA ClusterPolicy object in the cluster.
101106
clusterPolicyList := &gpuv1.ClusterPolicyList{}
@@ -110,9 +115,6 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
110115
}
111116

112117
if len(clusterPolicyList.Items) == 0 {
113-
if handled, err := r.handleDefaultNVIDIADriverDeletion(ctx, instance); handled || err != nil {
114-
return reconcile.Result{}, err
115-
}
116118
err := fmt.Errorf("no ClusterPolicy object found in the cluster")
117119
logger.Error(err, "failed to get ClusterPolicy object")
118120
instance.Status.State = nvidiav1alpha1.NotReady
@@ -123,12 +125,8 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
123125
}
124126
clusterPolicyInstance := clusterPolicyList.Items[0]
125127

126-
if handled, err := r.handleDefaultNVIDIADriverDeletion(ctx, instance); handled || err != nil {
127-
return reconcile.Result{}, err
128-
}
129-
130128
// Ensure that ClusterPolicy is configured to use NVIDIADriver CRD
131-
if !nvidiaDriverCRDEnabled(&clusterPolicyInstance) {
129+
if !nvidiadriverutil.CRDEnabled(&clusterPolicyInstance) {
132130
msg := "useNvidiaDriverCRD is not enabled in ClusterPolicy"
133131
logger.V(consts.LogLevelWarning).Info("NVIDIADriver reconciliation skipped", "reason", msg)
134132
instance.Status.State = nvidiav1alpha1.Disabled
@@ -159,14 +157,18 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
159157
return reconcile.Result{}, nil
160158
}
161159

162-
if err := assignNVIDIADriverOwners(ctx, r.Client); err != nil {
160+
changed, err := nvidiadriverutil.AssignOwners(ctx, r.Client)
161+
if err != nil {
163162
logger.Error(err, "failed to assign NVIDIADriver owners to nodes")
164163
instance.Status.State = nvidiav1alpha1.NotReady
165164
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
166165
logger.Error(condErr, "failed to set condition")
167166
}
168167
return reconcile.Result{}, err
169168
}
169+
if changed {
170+
return reconcile.Result{RequeueAfter: time.Second}, nil
171+
}
170172

171173
if instance.Spec.UsePrecompiledDrivers() && (instance.Spec.IsGDSEnabled() || instance.Spec.IsGDRCopyEnabled()) {
172174
err := errors.New("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers")
@@ -237,14 +239,6 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
237239
return reconcile.Result{}, nil
238240
}
239241

240-
func (r *NVIDIADriverReconciler) handleDefaultNVIDIADriverDeletion(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) (bool, error) {
241-
if !isDefaultNVIDIADriver(driver) || driver.GetDeletionTimestamp() == nil {
242-
return false, nil
243-
}
244-
log.FromContext(ctx).Info("Default NVIDIADriver delete requested; skipping reconciliation")
245-
return true, nil
246-
}
247-
248242
func (r *NVIDIADriverReconciler) updateCrStatus(
249243
ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver, status state.Results) error {
250244
reqLogger := log.FromContext(ctx)

controllers/object_controls.go

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ import (
5252
nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
5353
driverconfig "github.com/NVIDIA/gpu-operator/internal/config"
5454
"github.com/NVIDIA/gpu-operator/internal/consts"
55+
nvidiadriverutil "github.com/NVIDIA/gpu-operator/internal/nvidiadriver"
5556
"github.com/NVIDIA/gpu-operator/internal/utils"
5657
)
5758

@@ -4339,11 +4340,12 @@ func (n ClusterPolicyController) labelNodesWithOrphanedDriverPods(ctx context.Co
43394340
if node.Labels != nil && node.Labels[upgradeStateLabel] == upgrade.UpgradeStateUpgradeRequired {
43404341
continue
43414342
}
4343+
originalNode := node.DeepCopy()
43424344
if node.Labels == nil {
43434345
node.Labels = map[string]string{}
43444346
}
43454347
node.Labels[upgradeStateLabel] = upgrade.UpgradeStateUpgradeRequired
4346-
if err := n.client.Update(ctx, node); err != nil {
4348+
if err := n.client.Patch(ctx, node, client.MergeFrom(originalNode)); err != nil {
43474349
n.logger.Error(err, "failed to label node with orphaned NVIDIA driver pod", "pod", pod.Name, "node", node.Name)
43484350
}
43494351
}
@@ -4354,23 +4356,13 @@ func (n ClusterPolicyController) labelNodesWithOrphanedDriverPods(ctx context.Co
43544356
// nodeMatchesAnyNVIDIADriver returns true when the node matches at least one NVIDIADriver selector.
43554357
func nodeMatchesAnyNVIDIADriver(node *corev1.Node, nvidiaDrivers []nvidiav1alpha1.NVIDIADriver) bool {
43564358
for _, nvidiaDriver := range nvidiaDrivers {
4357-
if nodeMatchesSelector(node, nvidiaDriver.GetNodeSelector()) {
4359+
if nvidiadriverutil.NodeMatchesSelector(node, nvidiaDriver.GetNodeSelector()) {
43584360
return true
43594361
}
43604362
}
43614363
return false
43624364
}
43634365

4364-
// nodeMatchesSelector returns true when all selector labels are present on the node.
4365-
func nodeMatchesSelector(node *corev1.Node, selector map[string]string) bool {
4366-
for key, value := range selector {
4367-
if node.Labels[key] != value {
4368-
return false
4369-
}
4370-
}
4371-
return true
4372-
}
4373-
43744366
// cleanupStalePrecompiledDaemonsets deletes stale driver daemonsets which can happen
43754367
// 1. If all nodes upgraded to the latest kernel
43764368
// 2. no GPU nodes are present

controllers/state_manager.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ import (
3636
"sigs.k8s.io/controller-runtime/pkg/client/config"
3737

3838
gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
39+
nvidiadriverutil "github.com/NVIDIA/gpu-operator/internal/nvidiadriver"
3940
)
4041

4142
const (
@@ -1058,7 +1059,7 @@ func (n *ClusterPolicyController) step() (gpuv1.State, error) {
10581059
// - In object_controls.go, check the OwnerRef for existing objects
10591060
// before managing them. Clusterpolicy controller should not be creating /
10601061
// updating / deleting objects owned by another controller.
1061-
if n.stateNames[n.idx] == "state-driver" && nvidiaDriverCRDEnabled(n.singleton) {
1062+
if n.stateNames[n.idx] == "state-driver" && nvidiadriverutil.CRDEnabled(n.singleton) {
10621063
n.logger.Info("NVIDIADriver CRD is enabled, cleaning up all NVIDIA driver daemonsets owned by ClusterPolicy")
10631064
n.idx++
10641065
// Cleanup all driver daemonsets owned by ClusterPolicy while keeping the
@@ -1069,7 +1070,7 @@ func (n *ClusterPolicyController) step() (gpuv1.State, error) {
10691070
}
10701071
return gpuv1.Disabled, nil
10711072
}
1072-
if n.stateNames[n.idx] == "state-vgpu-manager" && nvidiaDriverCRDEnabled(n.singleton) {
1073+
if n.stateNames[n.idx] == "state-vgpu-manager" && nvidiadriverutil.CRDEnabled(n.singleton) {
10731074
n.logger.Info("NVIDIADriver CRD is enabled, cleaning up all NVIDIA driver daemonsets owned by ClusterPolicy")
10741075
n.idx++
10751076
// Cleanup all driver daemonsets owned by ClusterPolicy while keeping the

controllers/upgrade_controller.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ import (
4747

4848
gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
4949
nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
50+
nvidiadriverutil "github.com/NVIDIA/gpu-operator/internal/nvidiadriver"
5051
)
5152

5253
// UpgradeReconciler reconciles Driver Daemon Sets for upgrade
@@ -132,7 +133,7 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
132133
driverLabelKey := DriverLabelKey
133134
driverLabelValue := DriverLabelValue
134135

135-
if nvidiaDriverCRDEnabled(clusterPolicy) {
136+
if nvidiadriverutil.CRDEnabled(clusterPolicy) {
136137
// app component label is added for all new driver daemonsets deployed by NVIDIADriver controller
137138
driverLabelKey = AppComponentLabelKey
138139
driverLabelValue = AppComponentLabelValue

deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -985,7 +985,7 @@ spec:
985985
- image
986986
type: object
987987
x-kubernetes-validations:
988-
- message: default NVIDIADriver cannot use nodeSelector
988+
- message: default NVIDIADriver must not use nodeSelector
989989
rule: 'has(self.default) && self.default ? !has(self.nodeSelector) ||
990990
size(self.nodeSelector) == 0 : true'
991991
status:
Lines changed: 42 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
1414
limitations under the License.
1515
*/
1616

17-
package controllers
17+
package nvidiadriver
1818

1919
import (
2020
"context"
@@ -29,25 +29,18 @@ import (
2929
"github.com/NVIDIA/gpu-operator/internal/consts"
3030
)
3131

32-
// isDefaultNVIDIADriver returns true when the NVIDIADriver is marked as the fallback driver.
33-
func isDefaultNVIDIADriver(driver *nvidiav1alpha1.NVIDIADriver) bool {
32+
// IsDefault returns true when the NVIDIADriver is marked as the fallback driver.
33+
func IsDefault(driver *nvidiav1alpha1.NVIDIADriver) bool {
3434
return driver != nil && driver.Spec.Default
3535
}
3636

37-
// nvidiaDriverCRDEnabled returns true when ClusterPolicy driver management is enabled through NVIDIADriver CRs.
38-
func nvidiaDriverCRDEnabled(clusterPolicy *gpuv1.ClusterPolicy) bool {
39-
return clusterPolicy != nil &&
40-
clusterPolicy.Spec.Driver.IsEnabled() &&
41-
clusterPolicy.Spec.Driver.UseNvidiaDriverCRDType()
42-
}
43-
44-
// validateNVIDIADriverNodeSelector rejects selectors that use operator-managed routing labels
37+
// ValidateNodeSelector rejects selectors that use operator-managed routing labels
4538
// or scope the default fallback driver.
46-
func validateNVIDIADriverNodeSelector(driver *nvidiav1alpha1.NVIDIADriver) error {
39+
func ValidateNodeSelector(driver *nvidiav1alpha1.NVIDIADriver) error {
4740
if driver == nil || driver.Spec.NodeSelector == nil {
4841
return nil
4942
}
50-
if isDefaultNVIDIADriver(driver) && len(driver.Spec.NodeSelector) > 0 {
43+
if IsDefault(driver) && len(driver.Spec.NodeSelector) > 0 {
5144
return fmt.Errorf("default NVIDIADriver %q cannot use nodeSelector", driver.Name)
5245
}
5346
if _, ok := driver.Spec.NodeSelector[consts.NVIDIADriverOwnerLabel]; ok {
@@ -56,59 +49,77 @@ func validateNVIDIADriverNodeSelector(driver *nvidiav1alpha1.NVIDIADriver) error
5649
return nil
5750
}
5851

59-
// assignNVIDIADriverOwners labels GPU nodes with the NVIDIADriver that should manage their driver pods.
52+
// CRDEnabled returns true when ClusterPolicy driver management is enabled through NVIDIADriver CRs.
53+
func CRDEnabled(clusterPolicy *gpuv1.ClusterPolicy) bool {
54+
return clusterPolicy != nil &&
55+
clusterPolicy.Spec.Driver.IsEnabled() &&
56+
clusterPolicy.Spec.Driver.UseNvidiaDriverCRDType()
57+
}
58+
59+
// NodeMatchesSelector returns true when all selector labels are present on the node.
60+
func NodeMatchesSelector(node *corev1.Node, selector map[string]string) bool {
61+
for key, value := range selector {
62+
if node.Labels[key] != value {
63+
return false
64+
}
65+
}
66+
return true
67+
}
68+
69+
// AssignOwners labels GPU nodes with the NVIDIADriver that should manage their driver pods.
6070
// Non-default NVIDIADrivers take precedence over the default fallback, and conflicts fail closed before
61-
// node owner labels are changed.
62-
func assignNVIDIADriverOwners(ctx context.Context, c client.Client) error {
71+
// node owner labels are changed. It returns true when any node label was changed.
72+
func AssignOwners(ctx context.Context, c client.Client) (bool, error) {
6373
drivers := &nvidiav1alpha1.NVIDIADriverList{}
6474
if err := c.List(ctx, drivers); err != nil {
65-
return fmt.Errorf("failed to list NVIDIADriver CRs: %w", err)
75+
return false, fmt.Errorf("failed to list NVIDIADriver CRs: %w", err)
6676
}
6777

6878
var defaultDriver *nvidiav1alpha1.NVIDIADriver
6979
defaultDriverNames := []string{}
7080
specificDrivers := make([]nvidiav1alpha1.NVIDIADriver, 0, len(drivers.Items))
7181
for i := range drivers.Items {
72-
if err := validateNVIDIADriverNodeSelector(&drivers.Items[i]); err != nil {
73-
return err
82+
if err := ValidateNodeSelector(&drivers.Items[i]); err != nil {
83+
return false, err
7484
}
75-
if isDefaultNVIDIADriver(&drivers.Items[i]) {
85+
if IsDefault(&drivers.Items[i]) {
7686
defaultDriverNames = append(defaultDriverNames, drivers.Items[i].Name)
7787
defaultDriver = &drivers.Items[i]
7888
continue
7989
}
8090
specificDrivers = append(specificDrivers, drivers.Items[i])
8191
}
8292
if len(defaultDriverNames) > 1 {
83-
return fmt.Errorf("multiple default NVIDIADrivers found: %s", strings.Join(defaultDriverNames, ", "))
93+
return false, fmt.Errorf("multiple default NVIDIADrivers found: %s", strings.Join(defaultDriverNames, ", "))
8494
}
8595
nodes := &corev1.NodeList{}
8696
if err := c.List(ctx, nodes, client.MatchingLabels{consts.GPUPresentLabel: "true"}); err != nil {
87-
return fmt.Errorf("failed to list GPU nodes: %w", err)
97+
return false, fmt.Errorf("failed to list GPU nodes: %w", err)
8898
}
8999

90100
for i := range nodes.Items {
91101
matchingDrivers := []string{}
92102
for _, driver := range specificDrivers {
93-
if nodeMatchesSelector(&nodes.Items[i], driver.GetNodeSelector()) {
103+
if NodeMatchesSelector(&nodes.Items[i], driver.GetNodeSelector()) {
94104
matchingDrivers = append(matchingDrivers, driver.Name)
95105
}
96106
}
97107
if len(matchingDrivers) > 1 {
98-
return fmt.Errorf("conflicting NVIDIADriver NodeSelectors found for node %s: %s", nodes.Items[i].Name, strings.Join(matchingDrivers, ", "))
108+
return false, fmt.Errorf("multiple NVIDIADrivers match the same node %s: %v", nodes.Items[i].Name, matchingDrivers)
99109
}
100110
}
101111

112+
changed := false
102113
for i := range nodes.Items {
103114
node := &nodes.Items[i]
104115
originalNode := node.DeepCopy()
105116
owner := ""
106117
for _, driver := range specificDrivers {
107-
if nodeMatchesSelector(node, driver.GetNodeSelector()) {
118+
if NodeMatchesSelector(node, driver.GetNodeSelector()) {
108119
owner = driver.Name
109120
}
110121
}
111-
if owner == "" && defaultDriver != nil && nodeMatchesSelector(node, defaultDriver.GetNodeSelector()) {
122+
if owner == "" && defaultDriver != nil && NodeMatchesSelector(node, defaultDriver.GetNodeSelector()) {
112123
owner = defaultDriver.Name
113124
}
114125
if owner == "" {
@@ -120,8 +131,9 @@ func assignNVIDIADriverOwners(ctx context.Context, c client.Client) error {
120131
}
121132
delete(node.Labels, consts.NVIDIADriverOwnerLabel)
122133
if err := c.Patch(ctx, node, client.MergeFrom(originalNode)); err != nil {
123-
return fmt.Errorf("failed to remove NVIDIADriver owner label for node %s: %w", node.Name, err)
134+
return false, fmt.Errorf("failed to remove NVIDIADriver owner label for node %s: %w", node.Name, err)
124135
}
136+
changed = true
125137
continue
126138
}
127139
if node.Labels != nil && node.Labels[consts.NVIDIADriverOwnerLabel] == owner {
@@ -132,9 +144,10 @@ func assignNVIDIADriverOwners(ctx context.Context, c client.Client) error {
132144
}
133145
node.Labels[consts.NVIDIADriverOwnerLabel] = owner
134146
if err := c.Patch(ctx, node, client.MergeFrom(originalNode)); err != nil {
135-
return fmt.Errorf("failed to update NVIDIADriver owner label for node %s: %w", node.Name, err)
147+
return false, fmt.Errorf("failed to update NVIDIADriver owner label for node %s: %w", node.Name, err)
136148
}
149+
changed = true
137150
}
138151

139-
return nil
152+
return changed, nil
140153
}

0 commit comments

Comments
 (0)