Skip to content

Commit e7b72ea

Browse files
committed
address review comments
changes include: 1. use new field to mark a nvidiadriver as default instead of using a label 2. don't allow adding nodeselector to default nvidiadriver, it should cover entire cluster Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>
1 parent c3eabcf commit e7b72ea

17 files changed

Lines changed: 162 additions & 114 deletions

api/nvidia/v1alpha1/nvidiadriver_types.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,19 @@ const (
3636
// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
3737
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
3838

39-
// NVIDIADriverSpec defines the desired state of NVIDIADriver
39+
// NVIDIADriverSpec defines the desired state of NVIDIADriver.
40+
// The CEL validation allows non-default drivers to use nodeSelector, but requires
41+
// default drivers to leave nodeSelector unset or empty.
42+
// +kubebuilder:validation:XValidation:rule="has(self.default) && self.default ? !has(self.nodeSelector) || size(self.nodeSelector) == 0 : true",message="default NVIDIADriver cannot use nodeSelector"
4043
type NVIDIADriverSpec struct {
4144
// INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
4245
// Important: Run "make" to regenerate code after modifying this file
4346

47+
// Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes
48+
// that do not match any non-default NVIDIADriver nodeSelector.
49+
// +kubebuilder:default=false
50+
Default bool `json:"default"`
51+
4452
// +kubebuilder:validation:Enum=gpu;vgpu;vgpu-host-manager
4553
// +kubebuilder:default=gpu
4654
// +kubebuilder:validation:XValidation:rule="self == oldSelf",message="driverType is an immutable field. Please create a new NvidiaDriver resource instead when you want to change this setting."
@@ -504,6 +512,7 @@ type NVIDIADriverStatus struct {
504512
//+kubebuilder:subresource:status
505513
//+kubebuilder:resource:scope=Cluster,shortName={"nvd","nvdriver","nvdrivers"}
506514
//+kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.state`,priority=0
515+
//+kubebuilder:printcolumn:name="Default",type=boolean,JSONPath=`.spec.default`,priority=0
507516
//+kubebuilder:printcolumn:name="Age",type=string,JSONPath=`.metadata.creationTimestamp`,priority=0
508517

509518
// NVIDIADriver is the Schema for the nvidiadrivers API

bundle/manifests/nvidia.com_nvidiadrivers.yaml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ spec:
2222
- jsonPath: .status.state
2323
name: Status
2424
type: string
25+
- jsonPath: .spec.default
26+
name: Default
27+
type: boolean
2528
- jsonPath: .metadata.creationTimestamp
2629
name: Age
2730
type: string
@@ -48,7 +51,10 @@ spec:
4851
metadata:
4952
type: object
5053
spec:
51-
description: NVIDIADriverSpec defines the desired state of NVIDIADriver
54+
description: |-
55+
NVIDIADriverSpec defines the desired state of NVIDIADriver.
56+
The CEL validation allows non-default drivers to use nodeSelector, but requires
57+
default drivers to leave nodeSelector unset or empty.
5258
properties:
5359
annotations:
5460
additionalProperties:
@@ -70,6 +76,12 @@ spec:
7076
name:
7177
type: string
7278
type: object
79+
default:
80+
default: false
81+
description: |-
82+
Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes
83+
that do not match any non-default NVIDIADriver nodeSelector.
84+
type: boolean
7385
driverType:
7486
default: gpu
7587
description: DriverType defines NVIDIA driver type
@@ -968,9 +980,14 @@ spec:
968980
type: string
969981
type: object
970982
required:
983+
- default
971984
- driverType
972985
- image
973986
type: object
987+
x-kubernetes-validations:
988+
- message: default NVIDIADriver cannot use nodeSelector
989+
rule: 'has(self.default) && self.default ? !has(self.nodeSelector) ||
990+
size(self.nodeSelector) == 0 : true'
974991
status:
975992
description: NVIDIADriverStatus defines the observed state of NVIDIADriver
976993
properties:

config/crd/bases/nvidia.com_nvidiadrivers.yaml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ spec:
2222
- jsonPath: .status.state
2323
name: Status
2424
type: string
25+
- jsonPath: .spec.default
26+
name: Default
27+
type: boolean
2528
- jsonPath: .metadata.creationTimestamp
2629
name: Age
2730
type: string
@@ -48,7 +51,10 @@ spec:
4851
metadata:
4952
type: object
5053
spec:
51-
description: NVIDIADriverSpec defines the desired state of NVIDIADriver
54+
description: |-
55+
NVIDIADriverSpec defines the desired state of NVIDIADriver.
56+
The CEL validation allows non-default drivers to use nodeSelector, but requires
57+
default drivers to leave nodeSelector unset or empty.
5258
properties:
5359
annotations:
5460
additionalProperties:
@@ -70,6 +76,12 @@ spec:
7076
name:
7177
type: string
7278
type: object
79+
default:
80+
default: false
81+
description: |-
82+
Default indicates that this NVIDIADriver acts as the fallback driver for GPU nodes
83+
that do not match any non-default NVIDIADriver nodeSelector.
84+
type: boolean
7385
driverType:
7486
default: gpu
7587
description: DriverType defines NVIDIA driver type
@@ -968,9 +980,14 @@ spec:
968980
type: string
969981
type: object
970982
required:
983+
- default
971984
- driverType
972985
- image
973986
type: object
987+
x-kubernetes-validations:
988+
- message: default NVIDIADriver cannot use nodeSelector
989+
rule: 'has(self.default) && self.default ? !has(self.nodeSelector) ||
990+
size(self.nodeSelector) == 0 : true'
974991
status:
975992
description: NVIDIADriverStatus defines the observed state of NVIDIADriver
976993
properties:

controllers/clusterpolicy_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ func (r *ClusterPolicyReconciler) SetupWithManager(ctx context.Context, mgr ctrl
408408
}
409409
return requests
410410
}),
411-
nvidiaDriverGenerationOrDefaultLabelChangedPredicate(),
411+
nvidiaDriverGenerationChangedPredicate(),
412412
),
413413
)
414414
if err != nil {

controllers/nvidiadriver_controller.go

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,7 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.
340340
mgr.GetCache(),
341341
&nvidiav1alpha1.NVIDIADriver{},
342342
handler.TypedEnqueueRequestsFromMapFunc(nvidiaDriverMapFn),
343-
nvidiaDriverGenerationOrDefaultLabelChangedPredicate(),
343+
nvidiaDriverGenerationChangedPredicate(),
344344
),
345345
)
346346
if err != nil {
@@ -438,9 +438,8 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.
438438
return nil
439439
}
440440

441-
// nvidiaDriverGenerationOrDefaultLabelChangedPredicate reconciles NVIDIADrivers on spec changes and
442-
// on changes to the default-driver label, which is metadata but affects ownership semantics.
443-
func nvidiaDriverGenerationOrDefaultLabelChangedPredicate() predicate.TypedPredicate[*nvidiav1alpha1.NVIDIADriver] {
441+
// nvidiaDriverGenerationChangedPredicate reconciles NVIDIADrivers on create, delete, and spec changes.
442+
func nvidiaDriverGenerationChangedPredicate() predicate.TypedPredicate[*nvidiav1alpha1.NVIDIADriver] {
444443
return predicate.TypedFuncs[*nvidiav1alpha1.NVIDIADriver]{
445444
CreateFunc: func(event.TypedCreateEvent[*nvidiav1alpha1.NVIDIADriver]) bool {
446445
return true
@@ -455,9 +454,7 @@ func nvidiaDriverGenerationOrDefaultLabelChangedPredicate() predicate.TypedPredi
455454
if e.ObjectOld.GetGeneration() != e.ObjectNew.GetGeneration() {
456455
return true
457456
}
458-
oldLabels := e.ObjectOld.GetLabels()
459-
newLabels := e.ObjectNew.GetLabels()
460-
return oldLabels[consts.DefaultNVIDIADriverLabel] != newLabels[consts.DefaultNVIDIADriverLabel]
457+
return false
461458
},
462459
GenericFunc: func(event.TypedGenericEvent[*nvidiav1alpha1.NVIDIADriver]) bool {
463460
return false

controllers/nvidiadriver_controller_test.go

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ import (
4040

4141
gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
4242
nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
43-
"github.com/NVIDIA/gpu-operator/internal/consts"
4443
"github.com/NVIDIA/gpu-operator/internal/validator"
4544
)
4645

@@ -335,30 +334,14 @@ func TestEnqueueAllNVIDIADrivers(t *testing.T) {
335334
require.Equal(t, []string{"default/driver-a", "default/driver-b"}, got)
336335
}
337336

338-
func TestNVIDIADriverGenerationOrDefaultLabelChangedPredicate(t *testing.T) {
339-
p := nvidiaDriverGenerationOrDefaultLabelChangedPredicate()
337+
func TestNVIDIADriverGenerationChangedPredicate(t *testing.T) {
338+
p := nvidiaDriverGenerationChangedPredicate()
340339

341340
require.True(t, p.Update(event.TypedUpdateEvent[*nvidiav1alpha1.NVIDIADriver]{
342341
ObjectOld: &nvidiav1alpha1.NVIDIADriver{ObjectMeta: metav1.ObjectMeta{Name: "driver", Generation: 1}},
343342
ObjectNew: &nvidiav1alpha1.NVIDIADriver{ObjectMeta: metav1.ObjectMeta{Name: "driver", Generation: 2}},
344343
}))
345344

346-
require.True(t, p.Update(event.TypedUpdateEvent[*nvidiav1alpha1.NVIDIADriver]{
347-
ObjectOld: &nvidiav1alpha1.NVIDIADriver{
348-
ObjectMeta: metav1.ObjectMeta{
349-
Name: "driver",
350-
Generation: 1,
351-
},
352-
},
353-
ObjectNew: &nvidiav1alpha1.NVIDIADriver{
354-
ObjectMeta: metav1.ObjectMeta{
355-
Name: "driver",
356-
Generation: 1,
357-
Labels: map[string]string{consts.DefaultNVIDIADriverLabel: "true"},
358-
},
359-
},
360-
}))
361-
362345
require.False(t, p.Update(event.TypedUpdateEvent[*nvidiav1alpha1.NVIDIADriver]{
363346
ObjectOld: &nvidiav1alpha1.NVIDIADriver{
364347
ObjectMeta: metav1.ObjectMeta{

controllers/nvidiadriver_default.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ import (
3131

3232
// isDefaultNVIDIADriver returns true when the NVIDIADriver is marked as the fallback driver.
3333
func isDefaultNVIDIADriver(driver *nvidiav1alpha1.NVIDIADriver) bool {
34-
return driver != nil && driver.Labels[consts.DefaultNVIDIADriverLabel] == "true"
34+
return driver != nil && driver.Spec.Default
3535
}
3636

3737
// nvidiaDriverCRDEnabled returns true when ClusterPolicy driver management is enabled through NVIDIADriver CRs.
@@ -41,11 +41,15 @@ func nvidiaDriverCRDEnabled(clusterPolicy *gpuv1.ClusterPolicy) bool {
4141
clusterPolicy.Spec.Driver.UseNvidiaDriverCRDType()
4242
}
4343

44-
// validateNVIDIADriverNodeSelector rejects selectors that use operator-managed routing labels.
44+
// validateNVIDIADriverNodeSelector rejects selectors that use operator-managed routing labels
45+
// or scope the default fallback driver.
4546
func validateNVIDIADriverNodeSelector(driver *nvidiav1alpha1.NVIDIADriver) error {
4647
if driver == nil || driver.Spec.NodeSelector == nil {
4748
return nil
4849
}
50+
if isDefaultNVIDIADriver(driver) && len(driver.Spec.NodeSelector) > 0 {
51+
return fmt.Errorf("default NVIDIADriver %q cannot use nodeSelector", driver.Name)
52+
}
4953
if _, ok := driver.Spec.NodeSelector[consts.NVIDIADriverOwnerLabel]; ok {
5054
return fmt.Errorf("NVIDIADriver %q nodeSelector cannot use reserved label %q", driver.Name, consts.NVIDIADriverOwnerLabel)
5155
}

controllers/nvidiadriver_default_test.go

Lines changed: 23 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -90,10 +90,8 @@ func TestAssignNVIDIADriverOwnersGivesSpecificDriversPrecedence(t *testing.T) {
9090
require.NoError(t, corev1.AddToScheme(scheme))
9191

9292
defaultDriver := &nvidiav1alpha1.NVIDIADriver{
93-
ObjectMeta: metav1.ObjectMeta{
94-
Name: consts.DefaultNVIDIADriverName,
95-
Labels: map[string]string{consts.DefaultNVIDIADriverLabel: "true"},
96-
},
93+
ObjectMeta: metav1.ObjectMeta{Name: consts.DefaultNVIDIADriverName},
94+
Spec: nvidiav1alpha1.NVIDIADriverSpec{Default: true},
9795
}
9896
specificDriver := &nvidiav1alpha1.NVIDIADriver{
9997
ObjectMeta: metav1.ObjectMeta{Name: "h100-driver"},
@@ -150,16 +148,14 @@ func TestAssignNVIDIADriverOwnersAllowsMissingDefaultDriver(t *testing.T) {
150148
require.Equal(t, "h100-driver", specificNode.Labels[consts.NVIDIADriverOwnerLabel])
151149
}
152150

153-
func TestAssignNVIDIADriverOwnersUsesLabeledDefaultDriverWithArbitraryName(t *testing.T) {
151+
func TestAssignNVIDIADriverOwnersUsesDefaultDriverWithArbitraryName(t *testing.T) {
154152
scheme := runtime.NewScheme()
155153
require.NoError(t, nvidiav1alpha1.AddToScheme(scheme))
156154
require.NoError(t, corev1.AddToScheme(scheme))
157155

158156
defaultDriver := &nvidiav1alpha1.NVIDIADriver{
159-
ObjectMeta: metav1.ObjectMeta{
160-
Name: "fallback-driver",
161-
Labels: map[string]string{consts.DefaultNVIDIADriverLabel: "true"},
162-
},
157+
ObjectMeta: metav1.ObjectMeta{Name: "fallback-driver"},
158+
Spec: nvidiav1alpha1.NVIDIADriverSpec{Default: true},
163159
}
164160
node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{
165161
Name: "gpu-node",
@@ -180,16 +176,12 @@ func TestAssignNVIDIADriverOwnersErrorsOnMultipleDefaultDrivers(t *testing.T) {
180176
require.NoError(t, corev1.AddToScheme(scheme))
181177

182178
defaultDriverA := &nvidiav1alpha1.NVIDIADriver{
183-
ObjectMeta: metav1.ObjectMeta{
184-
Name: "fallback-a",
185-
Labels: map[string]string{consts.DefaultNVIDIADriverLabel: "true"},
186-
},
179+
ObjectMeta: metav1.ObjectMeta{Name: "fallback-a"},
180+
Spec: nvidiav1alpha1.NVIDIADriverSpec{Default: true},
187181
}
188182
defaultDriverB := &nvidiav1alpha1.NVIDIADriver{
189-
ObjectMeta: metav1.ObjectMeta{
190-
Name: "fallback-b",
191-
Labels: map[string]string{consts.DefaultNVIDIADriverLabel: "true"},
192-
},
183+
ObjectMeta: metav1.ObjectMeta{Name: "fallback-b"},
184+
Spec: nvidiav1alpha1.NVIDIADriverSpec{Default: true},
193185
}
194186
node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{
195187
Name: "gpu-node",
@@ -236,17 +228,15 @@ func TestAssignNVIDIADriverOwnersRejectsReservedOwnerLabelSelector(t *testing.T)
236228
require.Equal(t, "existing-driver", node.Labels[consts.NVIDIADriverOwnerLabel])
237229
}
238230

239-
func TestAssignNVIDIADriverOwnersHonorsDefaultDriverNodeSelector(t *testing.T) {
231+
func TestAssignNVIDIADriverOwnersRejectsDefaultDriverNodeSelector(t *testing.T) {
240232
scheme := runtime.NewScheme()
241233
require.NoError(t, nvidiav1alpha1.AddToScheme(scheme))
242234
require.NoError(t, corev1.AddToScheme(scheme))
243235

244236
defaultDriver := &nvidiav1alpha1.NVIDIADriver{
245-
ObjectMeta: metav1.ObjectMeta{
246-
Name: consts.DefaultNVIDIADriverName,
247-
Labels: map[string]string{consts.DefaultNVIDIADriverLabel: "true"},
248-
},
237+
ObjectMeta: metav1.ObjectMeta{Name: consts.DefaultNVIDIADriverName},
249238
Spec: nvidiav1alpha1.NVIDIADriverSpec{
239+
Default: true,
250240
NodeSelector: map[string]string{"driver-default": "true"},
251241
},
252242
}
@@ -271,14 +261,17 @@ func TestAssignNVIDIADriverOwnersHonorsDefaultDriverNodeSelector(t *testing.T) {
271261

272262
k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(defaultDriver, specificDriver, defaultNode, unmatchedNode, specificNode).Build()
273263

274-
require.NoError(t, assignNVIDIADriverOwners(context.Background(), k8sClient))
264+
err := assignNVIDIADriverOwners(context.Background(), k8sClient)
265+
require.Error(t, err)
266+
require.Contains(t, err.Error(), "default NVIDIADriver")
267+
require.Contains(t, err.Error(), "cannot use nodeSelector")
275268

276269
require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "default-node"}, defaultNode))
277270
require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "unmatched-node"}, unmatchedNode))
278271
require.NoError(t, k8sClient.Get(context.Background(), client.ObjectKey{Name: "specific-node"}, specificNode))
279-
require.Equal(t, consts.DefaultNVIDIADriverName, defaultNode.Labels[consts.NVIDIADriverOwnerLabel])
280-
require.NotContains(t, unmatchedNode.Labels, consts.NVIDIADriverOwnerLabel)
281-
require.Equal(t, "h100-driver", specificNode.Labels[consts.NVIDIADriverOwnerLabel])
272+
require.NotContains(t, defaultNode.Labels, consts.NVIDIADriverOwnerLabel)
273+
require.Equal(t, consts.DefaultNVIDIADriverName, unmatchedNode.Labels[consts.NVIDIADriverOwnerLabel])
274+
require.NotContains(t, specificNode.Labels, consts.NVIDIADriverOwnerLabel)
282275
}
283276

284277
func TestAssignNVIDIADriverOwnersDoesNotFallbackToDefaultOnUserDriverConflict(t *testing.T) {
@@ -287,10 +280,8 @@ func TestAssignNVIDIADriverOwnersDoesNotFallbackToDefaultOnUserDriverConflict(t
287280
require.NoError(t, corev1.AddToScheme(scheme))
288281

289282
defaultDriver := &nvidiav1alpha1.NVIDIADriver{
290-
ObjectMeta: metav1.ObjectMeta{
291-
Name: consts.DefaultNVIDIADriverName,
292-
Labels: map[string]string{consts.DefaultNVIDIADriverLabel: "true"},
293-
},
283+
ObjectMeta: metav1.ObjectMeta{Name: consts.DefaultNVIDIADriverName},
284+
Spec: nvidiav1alpha1.NVIDIADriverSpec{Default: true},
294285
}
295286
driverA := &nvidiav1alpha1.NVIDIADriver{
296287
ObjectMeta: metav1.ObjectMeta{Name: "driver-a"},
@@ -329,10 +320,8 @@ func TestAssignNVIDIADriverOwnersDoesNotChangeOwnersWhenAnyUserDriverConflicts(t
329320
require.NoError(t, corev1.AddToScheme(scheme))
330321

331322
defaultDriver := &nvidiav1alpha1.NVIDIADriver{
332-
ObjectMeta: metav1.ObjectMeta{
333-
Name: consts.DefaultNVIDIADriverName,
334-
Labels: map[string]string{consts.DefaultNVIDIADriverLabel: "true"},
335-
},
323+
ObjectMeta: metav1.ObjectMeta{Name: consts.DefaultNVIDIADriverName},
324+
Spec: nvidiav1alpha1.NVIDIADriverSpec{Default: true},
336325
}
337326
goldDriver := &nvidiav1alpha1.NVIDIADriver{
338327
ObjectMeta: metav1.ObjectMeta{Name: "demo-gold"},

0 commit comments

Comments
 (0)