Skip to content

Commit cb154ca

Browse files
committed
address review comments
Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>
1 parent ad7eaf6 commit cb154ca

8 files changed

Lines changed: 150 additions & 68 deletions

File tree

api/nvidia/v1alpha1/nvidiadriver_types.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ type NVIDIADriverSpec struct {
4444
// INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
4545
// Important: Run "make" to regenerate code after modifying this file
4646

47-
// Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes
47+
// Default indicates that this NVIDIADriver acts as the fallback driver daemon set manager for GPU nodes
4848
// that do not match any non-default NVIDIADriver nodeSelector.
4949
// +kubebuilder:default=false
5050
Default bool `json:"default"`
@@ -538,6 +538,21 @@ func (d *NVIDIADriver) IsDefault() bool {
538538
return d != nil && d.Spec.Default
539539
}
540540

541+
// ValidateNodeSelector rejects selectors that use operator-managed routing labels
542+
// or scope the default fallback driver.
543+
func (d *NVIDIADriver) ValidateNodeSelector() error {
544+
if d == nil || d.Spec.NodeSelector == nil {
545+
return nil
546+
}
547+
if d.IsDefault() && len(d.Spec.NodeSelector) > 0 {
548+
return fmt.Errorf("default NVIDIADriver %q cannot use nodeSelector", d.Name)
549+
}
550+
if _, ok := d.Spec.NodeSelector[consts.NVIDIADriverOwnerLabel]; ok {
551+
return fmt.Errorf("NVIDIADriver %q nodeSelector cannot use reserved label %q", d.Name, consts.NVIDIADriverOwnerLabel)
552+
}
553+
return nil
554+
}
555+
541556
// UsePrecompiledDrivers returns true if usePrecompiled option is enabled in spec
542557
func (d *NVIDIADriverSpec) UsePrecompiledDrivers() bool {
543558
if d.UsePrecompiled == nil {

bundle/manifests/nvidia.com_nvidiadrivers.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ spec:
7979
default:
8080
default: false
8181
description: |-
82-
Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes
82+
Default indicates that this NVIDIADriver acts as the fallback driver daemon set manager for GPU nodes
8383
that do not match any non-default NVIDIADriver nodeSelector.
8484
type: boolean
8585
driverType:

config/crd/bases/nvidia.com_nvidiadrivers.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ spec:
7979
default:
8080
default: false
8181
description: |-
82-
Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes
82+
Default indicates that this NVIDIADriver acts as the fallback driver daemon set manager for GPU nodes
8383
that do not match any non-default NVIDIADriver nodeSelector.
8484
type: boolean
8585
driverType:

controllers/nvidiadriver_controller.go

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
8585
if apierrors.IsNotFound(err) {
8686
// Request object not found, could have been deleted after reconcile request.
8787
// Re-run owner assignment so deleting the last NVIDIADriver clears stale node owner labels.
88-
return r.cleanupNVIDIADriverOwnerLabelsAndReturn(ctx)
88+
return r.cleanupNVIDIADriverOwnerLabels(ctx)
8989
}
9090
wrappedErr := fmt.Errorf("error getting NVIDIADriver object: %w", err)
9191
logger.Error(err, "error getting NVIDIADriver object")
@@ -98,7 +98,7 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
9898
}
9999
if !instance.GetDeletionTimestamp().IsZero() {
100100
logger.Info("NVIDIADriver delete requested; cleaning up owner labels")
101-
return r.cleanupNVIDIADriverOwnerLabelsAndReturn(ctx)
101+
return r.cleanupNVIDIADriverOwnerLabels(ctx)
102102
}
103103

104104
// Get the singleton NVIDIA ClusterPolicy object in the cluster.
@@ -328,33 +328,30 @@ func dedupeReconcileRequests(requests []reconcile.Request) []reconcile.Request {
328328
return deduped
329329
}
330330

331-
func (r *NVIDIADriverReconciler) cleanupNVIDIADriverOwnerLabelsAndReturn(ctx context.Context) (reconcile.Result, error) {
332-
if err := r.cleanupNVIDIADriverOwnerLabels(ctx); err != nil {
333-
log.FromContext(ctx).Error(err, "failed to cleanup NVIDIADriver owner labels")
334-
return reconcile.Result{}, err
335-
}
336-
return reconcile.Result{}, nil
337-
}
338-
339331
// cleanupNVIDIADriverOwnerLabels re-runs owner assignment after a delete event
340332
// when ClusterPolicy is still configured for NVIDIADriver mode. This lets
341333
// AssignOwners remove stale nvidia.com/gpu-operator.driver.owner labels when no remaining
342334
// NVIDIADriver matches a GPU node, including deletion of the last NVIDIADriver.
343-
func (r *NVIDIADriverReconciler) cleanupNVIDIADriverOwnerLabels(ctx context.Context) error {
335+
func (r *NVIDIADriverReconciler) cleanupNVIDIADriverOwnerLabels(ctx context.Context) (reconcile.Result, error) {
344336
clusterPolicyList := &gpuv1.ClusterPolicyList{}
345337
if err := r.List(ctx, clusterPolicyList); err != nil {
346-
return fmt.Errorf("error getting ClusterPolicy list: %w", err)
338+
wrappedErr := fmt.Errorf("error getting ClusterPolicy list: %w", err)
339+
log.FromContext(ctx).Error(wrappedErr, "failed to cleanup NVIDIADriver owner labels")
340+
return reconcile.Result{}, wrappedErr
347341
}
348342

349343
for i := range clusterPolicyList.Items {
350344
if !clusterPolicyList.Items[i].Spec.Driver.IsNVIDIADriverCRDEnabled() {
351345
continue
352346
}
353-
_, err := nvidiadriverutil.AssignOwners(ctx, r.Client)
354-
return err
347+
if _, err := nvidiadriverutil.AssignOwners(ctx, r.Client); err != nil {
348+
log.FromContext(ctx).Error(err, "failed to cleanup NVIDIADriver owner labels")
349+
return reconcile.Result{}, err
350+
}
351+
return reconcile.Result{}, nil
355352
}
356353

357-
return nil
354+
return reconcile.Result{}, nil
358355
}
359356

360357
// SetupWithManager sets up the controller with the Manager.

deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ spec:
7979
default:
8080
default: false
8181
description: |-
82-
Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes
82+
Default indicates that this NVIDIADriver acts as the fallback driver daemon set manager for GPU nodes
8383
that do not match any non-default NVIDIADriver nodeSelector.
8484
type: boolean
8585
driverType:

internal/nvidiadriver/nvidiadriver.go

Lines changed: 19 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -22,35 +22,16 @@ import (
2222
"strings"
2323

2424
corev1 "k8s.io/api/core/v1"
25+
"k8s.io/apimachinery/pkg/labels"
2526
"sigs.k8s.io/controller-runtime/pkg/client"
2627

2728
nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
2829
"github.com/NVIDIA/gpu-operator/internal/consts"
2930
)
3031

31-
// ValidateNodeSelector rejects selectors that use operator-managed routing labels
32-
// or scope the default fallback driver.
33-
func ValidateNodeSelector(driver *nvidiav1alpha1.NVIDIADriver) error {
34-
if driver == nil || driver.Spec.NodeSelector == nil {
35-
return nil
36-
}
37-
if driver.IsDefault() && len(driver.Spec.NodeSelector) > 0 {
38-
return fmt.Errorf("default NVIDIADriver %q cannot use nodeSelector", driver.Name)
39-
}
40-
if _, ok := driver.Spec.NodeSelector[consts.NVIDIADriverOwnerLabel]; ok {
41-
return fmt.Errorf("NVIDIADriver %q nodeSelector cannot use reserved label %q", driver.Name, consts.NVIDIADriverOwnerLabel)
42-
}
43-
return nil
44-
}
45-
46-
// NodeMatchesSelector returns true when all selector labels are present on the node.
47-
func NodeMatchesSelector(node *corev1.Node, selector map[string]string) bool {
48-
for key, value := range selector {
49-
if node.Labels[key] != value {
50-
return false
51-
}
52-
}
53-
return true
32+
// nodeMatchesSelector reports whether nodeLabels satisfy the NVIDIADriver nodeSelector.
33+
func nodeMatchesSelector(nodeLabels map[string]string, selector map[string]string) bool {
34+
return labels.SelectorFromSet(selector).Matches(labels.Set(nodeLabels))
5435
}
5536

5637
// AssignOwners labels GPU nodes with the NVIDIADriver that should manage their driver pods.
@@ -62,22 +43,24 @@ func AssignOwners(ctx context.Context, c client.Client) (bool, error) {
6243
return false, fmt.Errorf("failed to list NVIDIADriver CRs: %w", err)
6344
}
6445

65-
var defaultDriver *nvidiav1alpha1.NVIDIADriver
46+
var defaultDriver nvidiav1alpha1.NVIDIADriver
47+
hasDefaultDriver := false
6648
defaultDriverNames := []string{}
6749
nonDefaultDrivers := make([]nvidiav1alpha1.NVIDIADriver, 0, len(drivers.Items))
68-
for i := range drivers.Items {
69-
if !drivers.Items[i].GetDeletionTimestamp().IsZero() {
50+
for _, driver := range drivers.Items {
51+
if !driver.GetDeletionTimestamp().IsZero() {
7052
continue
7153
}
72-
if err := ValidateNodeSelector(&drivers.Items[i]); err != nil {
54+
if err := driver.ValidateNodeSelector(); err != nil {
7355
return false, err
7456
}
75-
if drivers.Items[i].IsDefault() {
76-
defaultDriverNames = append(defaultDriverNames, drivers.Items[i].Name)
77-
defaultDriver = &drivers.Items[i]
57+
if driver.IsDefault() {
58+
defaultDriverNames = append(defaultDriverNames, driver.Name)
59+
defaultDriver = driver
60+
hasDefaultDriver = true
7861
continue
7962
}
80-
nonDefaultDrivers = append(nonDefaultDrivers, drivers.Items[i])
63+
nonDefaultDrivers = append(nonDefaultDrivers, driver)
8164
}
8265
if len(defaultDriverNames) > 1 {
8366
return false, fmt.Errorf("multiple default NVIDIADrivers found: %s", strings.Join(defaultDriverNames, ", "))
@@ -87,15 +70,15 @@ func AssignOwners(ctx context.Context, c client.Client) (bool, error) {
8770
return false, fmt.Errorf("failed to list GPU nodes: %w", err)
8871
}
8972

90-
for i := range nodes.Items {
73+
for _, node := range nodes.Items {
9174
matchingDrivers := []string{}
9275
for _, driver := range nonDefaultDrivers {
93-
if NodeMatchesSelector(&nodes.Items[i], driver.GetNodeSelector()) {
76+
if nodeMatchesSelector(node.Labels, driver.GetNodeSelector()) {
9477
matchingDrivers = append(matchingDrivers, driver.Name)
9578
}
9679
}
9780
if len(matchingDrivers) > 1 {
98-
return false, fmt.Errorf("multiple NVIDIADrivers match the same node %s: %v", nodes.Items[i].Name, matchingDrivers)
81+
return false, fmt.Errorf("multiple NVIDIADrivers match the same node %s: %v", node.Name, matchingDrivers)
9982
}
10083
}
10184

@@ -105,11 +88,11 @@ func AssignOwners(ctx context.Context, c client.Client) (bool, error) {
10588
originalNode := node.DeepCopy()
10689
owner := ""
10790
for _, driver := range nonDefaultDrivers {
108-
if NodeMatchesSelector(node, driver.GetNodeSelector()) {
91+
if nodeMatchesSelector(node.Labels, driver.GetNodeSelector()) {
10992
owner = driver.Name
11093
}
11194
}
112-
if owner == "" && defaultDriver != nil && NodeMatchesSelector(node, defaultDriver.GetNodeSelector()) {
95+
if owner == "" && hasDefaultDriver && nodeMatchesSelector(node.Labels, defaultDriver.GetNodeSelector()) {
11396
owner = defaultDriver.Name
11497
}
11598
if owner == "" {

internal/nvidiadriver/nvidiadriver_test.go

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,98 @@ import (
3131
"github.com/NVIDIA/gpu-operator/internal/consts"
3232
)
3333

34+
func TestNodeMatchesSelector(t *testing.T) {
35+
testCases := []struct {
36+
description string
37+
nodeLabels map[string]string
38+
selector map[string]string
39+
expected bool
40+
}{
41+
{
42+
description: "non-default driver selector matches a GPU node",
43+
nodeLabels: map[string]string{
44+
consts.GPUPresentLabel: "true",
45+
"region": "us-east-1",
46+
},
47+
selector: map[string]string{"region": "us-east-1"},
48+
expected: true,
49+
},
50+
{
51+
description: "non-default driver selector does not match when label is missing",
52+
nodeLabels: map[string]string{consts.GPUPresentLabel: "true"},
53+
selector: map[string]string{"region": "us-east-1"},
54+
expected: false,
55+
},
56+
{
57+
description: "non-default driver selector does not match when label value differs",
58+
nodeLabels: map[string]string{
59+
consts.GPUPresentLabel: "true",
60+
"region": "us-west-2",
61+
},
62+
selector: map[string]string{"region": "us-east-1"},
63+
expected: false,
64+
},
65+
{
66+
description: "default driver empty selector matches GPU node",
67+
nodeLabels: map[string]string{consts.GPUPresentLabel: "true"},
68+
selector: map[string]string{},
69+
expected: true,
70+
},
71+
{
72+
description: "nil selector matches GPU node",
73+
nodeLabels: map[string]string{consts.GPUPresentLabel: "true"},
74+
selector: nil,
75+
expected: true,
76+
},
77+
{
78+
description: "empty selector matches node without labels",
79+
nodeLabels: nil,
80+
selector: map[string]string{},
81+
expected: true,
82+
},
83+
{
84+
description: "non-empty driver selector does not match node without labels",
85+
nodeLabels: nil,
86+
selector: map[string]string{"region": "us-east-1"},
87+
expected: false,
88+
},
89+
{
90+
description: "existing owner label does not affect user selector matching",
91+
nodeLabels: map[string]string{
92+
consts.GPUPresentLabel: "true",
93+
consts.NVIDIADriverOwnerLabel: "old-driver",
94+
"region": "us-east-1",
95+
},
96+
selector: map[string]string{"region": "us-east-1"},
97+
expected: true,
98+
},
99+
{
100+
description: "reserved owner selector follows exact label matching",
101+
nodeLabels: map[string]string{
102+
consts.GPUPresentLabel: "true",
103+
consts.NVIDIADriverOwnerLabel: "demo-gold",
104+
},
105+
selector: map[string]string{consts.NVIDIADriverOwnerLabel: "demo-gold"},
106+
expected: true,
107+
},
108+
{
109+
description: "reserved owner selector does not match a different owner",
110+
nodeLabels: map[string]string{
111+
consts.GPUPresentLabel: "true",
112+
consts.NVIDIADriverOwnerLabel: "demo-silver",
113+
},
114+
selector: map[string]string{consts.NVIDIADriverOwnerLabel: "demo-gold"},
115+
expected: false,
116+
},
117+
}
118+
119+
for _, tc := range testCases {
120+
t.Run(tc.description, func(t *testing.T) {
121+
require.Equal(t, tc.expected, nodeMatchesSelector(tc.nodeLabels, tc.selector))
122+
})
123+
}
124+
}
125+
34126
func TestAssignNVIDIADriverOwnersGivesSpecificDriversPrecedence(t *testing.T) {
35127
scheme := runtime.NewScheme()
36128
require.NoError(t, nvidiav1alpha1.AddToScheme(scheme))

internal/validator/validator.go

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ import (
2626
"sigs.k8s.io/controller-runtime/pkg/client"
2727

2828
nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
29-
nvidiadriverutil "github.com/NVIDIA/gpu-operator/internal/nvidiadriver"
3029
)
3130

3231
// Validator provides interface to validate NVIDIADriver fields
@@ -47,7 +46,7 @@ func NewNodeSelectorValidator(c client.Client) Validator {
4746
// Check returns error when nodes matching with the selector labels of current instance of NVIDIADriver
4847
// are conflicting with other instances of NVIDIADriver
4948
func (nsv *nodeSelectorValidator) Validate(ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver) error {
50-
if err := nvidiadriverutil.ValidateNodeSelector(cr); err != nil {
49+
if err := cr.ValidateNodeSelector(); err != nil {
5150
return err
5251
}
5352

@@ -58,15 +57,15 @@ func (nsv *nodeSelectorValidator) Validate(ctx context.Context, cr *nvidiav1alph
5857
}
5958

6059
selectedNodeOwners := map[string][]string{}
61-
for di := range drivers.Items {
62-
if err := nvidiadriverutil.ValidateNodeSelector(&drivers.Items[di]); err != nil {
60+
for _, driver := range drivers.Items {
61+
if err := driver.ValidateNodeSelector(); err != nil {
6362
return err
6463
}
65-
if drivers.Items[di].IsDefault() {
64+
if driver.IsDefault() {
6665
continue
6766
}
68-
driverName := drivers.Items[di].Name
69-
nodeList, err := nsv.getNVIDIADriverSelectedNodes(ctx, &drivers.Items[di])
67+
driverName := driver.Name
68+
nodeList, err := nsv.getNVIDIADriverSelectedNodes(ctx, driver)
7069
if err != nil {
7170
return err
7271
}
@@ -86,14 +85,10 @@ func (nsv *nodeSelectorValidator) Validate(ctx context.Context, cr *nvidiav1alph
8685
}
8786

8887
// getNVIDIADriverSelectedNodes returns selected nodes based on the nodeselector labels set for a given NVIDIADriver instance
89-
func (nsv *nodeSelectorValidator) getNVIDIADriverSelectedNodes(ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver) (*corev1.NodeList, error) {
88+
func (nsv *nodeSelectorValidator) getNVIDIADriverSelectedNodes(ctx context.Context, cr nvidiav1alpha1.NVIDIADriver) (*corev1.NodeList, error) {
9089
nodeList := &corev1.NodeList{}
9190

92-
if cr.Spec.NodeSelector == nil {
93-
cr.Spec.NodeSelector = cr.GetNodeSelector()
94-
}
95-
96-
selector := labels.Set(cr.Spec.NodeSelector).AsSelector()
91+
selector := labels.Set(cr.GetNodeSelector()).AsSelector()
9792

9893
opts := []client.ListOption{
9994
client.MatchingLabelsSelector{Selector: selector},

0 commit comments

Comments
 (0)