Skip to content

Commit c7d6ba0

Browse files
committed
add support for default nvidiadriver and migration from clusterpolicy to nvidiadriver
changes include: * added default nvidiadriver which doesn't conflict with other user-specified nvidiadrivers * added migration support from clusterpolicy to nvidiadriver * added/updated e2e tests for nvidiadriver workflow * use new field to mark a nvidiadriver as default instead of using a label * don't allow adding nodeselector to default nvidiadriver, it should cover entire cluster * move default NVIDIADriver ownership helpers into internal/nvidiadriver and re-use * requeue after node owner label changes so later reconcile steps read updated cache state * use patch instead of update for node label updates Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>
1 parent 9aec22d commit c7d6ba0

29 files changed

Lines changed: 2106 additions & 132 deletions

api/nvidia/v1/clusterpolicy_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2183,6 +2183,11 @@ func (d *DriverSpec) UseNvidiaDriverCRDType() bool {
21832183
return *d.UseNvidiaDriverCRD
21842184
}
21852185

2186+
// IsNVIDIADriverCRDEnabled returns true if driver management is enabled through NVIDIADriver CRs.
2187+
func (d *DriverSpec) IsNVIDIADriverCRDEnabled() bool {
2188+
return d != nil && d.IsEnabled() && d.UseNvidiaDriverCRDType()
2189+
}
2190+
21862191
// UsePrecompiledDrivers returns true if driver install is enabled using pre-compiled modules
21872192
func (d *DriverSpec) UsePrecompiledDrivers() bool {
21882193
if d.UsePrecompiled == nil {
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
/*
2+
* Copyright (c) NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package v1
18+
19+
import (
20+
"testing"
21+
22+
"github.com/stretchr/testify/require"
23+
)
24+
25+
func TestDriverSpecIsNVIDIADriverCRDEnabled(t *testing.T) {
26+
driverEnabled := true
27+
driverDisabled := false
28+
crdEnabled := true
29+
crdDisabled := false
30+
31+
tests := []struct {
32+
name string
33+
driverEnabled *bool
34+
crdEnabled *bool
35+
expected bool
36+
}{
37+
{
38+
name: "driver enabled by default and CRD enabled",
39+
crdEnabled: &crdEnabled,
40+
expected: true,
41+
},
42+
{
43+
name: "CRD disabled",
44+
crdEnabled: &crdDisabled,
45+
expected: false,
46+
},
47+
{
48+
name: "driver disabled",
49+
driverEnabled: &driverDisabled,
50+
crdEnabled: &crdEnabled,
51+
expected: false,
52+
},
53+
{
54+
name: "driver explicitly enabled and CRD enabled",
55+
driverEnabled: &driverEnabled,
56+
crdEnabled: &crdEnabled,
57+
expected: true,
58+
},
59+
}
60+
61+
for _, tc := range tests {
62+
t.Run(tc.name, func(t *testing.T) {
63+
driverSpec := DriverSpec{
64+
Enabled: tc.driverEnabled,
65+
UseNvidiaDriverCRD: tc.crdEnabled,
66+
}
67+
68+
require.Equal(t, tc.expected, driverSpec.IsNVIDIADriverCRDEnabled())
69+
})
70+
}
71+
72+
t.Run("nil driver spec", func(t *testing.T) {
73+
var driverSpec *DriverSpec
74+
require.False(t, driverSpec.IsNVIDIADriverCRDEnabled())
75+
})
76+
}

api/nvidia/v1alpha1/nvidiadriver_types.go

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,19 @@ const (
3636
// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
3737
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
3838

39-
// NVIDIADriverSpec defines the desired state of NVIDIADriver
39+
// NVIDIADriverSpec defines the desired state of NVIDIADriver.
40+
// The CEL validation allows non-default drivers to use nodeSelector, but requires
41+
// default drivers to leave nodeSelector unset or empty.
42+
// +kubebuilder:validation:XValidation:rule="has(self.default) && self.default ? !has(self.nodeSelector) || size(self.nodeSelector) == 0 : true",message="default NVIDIADriver must not use nodeSelector"
4043
type NVIDIADriverSpec struct {
4144
// INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
4245
// Important: Run "make" to regenerate code after modifying this file
4346

47+
// Default indicates that this NVIDIADriver acts as the fallback driver daemon set manager for GPU nodes
48+
// that do not match any non-default NVIDIADriver nodeSelector.
49+
// +kubebuilder:default=false
50+
Default bool `json:"default"`
51+
4452
// +kubebuilder:validation:Enum=gpu;vgpu;vgpu-host-manager
4553
// +kubebuilder:default=gpu
4654
// +kubebuilder:validation:XValidation:rule="self == oldSelf",message="driverType is an immutable field. Please create a new NvidiaDriver resource instead when you want to change this setting."
@@ -504,6 +512,7 @@ type NVIDIADriverStatus struct {
504512
//+kubebuilder:subresource:status
505513
//+kubebuilder:resource:scope=Cluster,shortName={"nvd","nvdriver","nvdrivers"}
506514
//+kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.state`,priority=0
515+
//+kubebuilder:printcolumn:name="Default",type=boolean,JSONPath=`.spec.default`,priority=0
507516
//+kubebuilder:printcolumn:name="Age",type=string,JSONPath=`.metadata.creationTimestamp`,priority=0
508517

509518
// NVIDIADriver is the Schema for the nvidiadrivers API
@@ -524,6 +533,31 @@ type NVIDIADriverList struct {
524533
Items []NVIDIADriver `json:"items"`
525534
}
526535

536+
// IsDefault returns true when the NVIDIADriver is marked as the fallback driver.
537+
func (d *NVIDIADriver) IsDefault() bool {
538+
return d != nil && d.Spec.Default
539+
}
540+
541+
// HasDeletionTimestamp returns true when the NVIDIADriver is marked for deletion.
542+
func (d *NVIDIADriver) HasDeletionTimestamp() bool {
543+
return d != nil && !d.GetDeletionTimestamp().IsZero()
544+
}
545+
546+
// ValidateNodeSelector rejects selectors that use operator-managed routing labels
547+
// or scope the default fallback driver.
548+
func (d *NVIDIADriver) ValidateNodeSelector() error {
549+
if d == nil || d.Spec.NodeSelector == nil {
550+
return nil
551+
}
552+
if d.IsDefault() && len(d.Spec.NodeSelector) > 0 {
553+
return fmt.Errorf("default NVIDIADriver %q cannot use nodeSelector", d.Name)
554+
}
555+
if _, ok := d.Spec.NodeSelector[consts.NVIDIADriverOwnerLabel]; ok {
556+
return fmt.Errorf("NVIDIADriver %q nodeSelector cannot use reserved label %q", d.Name, consts.NVIDIADriverOwnerLabel)
557+
}
558+
return nil
559+
}
560+
527561
// UsePrecompiledDrivers returns true if usePrecompiled option is enabled in spec
528562
func (d *NVIDIADriverSpec) UsePrecompiledDrivers() bool {
529563
if d.UsePrecompiled == nil {

api/nvidia/v1alpha1/nvidiadriver_types_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,26 @@ import (
2020
"testing"
2121

2222
"github.com/stretchr/testify/require"
23+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2324
)
2425

2526
const (
2627
testDigest = "10d1df8034373061366d4fb17b364b3b28d766b54d5a0b700c1a5a75378cf125"
2728
)
2829

30+
func TestNVIDIADriverHasDeletionTimestamp(t *testing.T) {
31+
var nilDriver *NVIDIADriver
32+
require.False(t, nilDriver.HasDeletionTimestamp())
33+
require.False(t, (&NVIDIADriver{}).HasDeletionTimestamp())
34+
35+
deletionTimestamp := metav1.Now()
36+
require.True(t, (&NVIDIADriver{
37+
ObjectMeta: metav1.ObjectMeta{
38+
DeletionTimestamp: &deletionTimestamp,
39+
},
40+
}).HasDeletionTimestamp())
41+
}
42+
2943
func TestGetImagePath(t *testing.T) {
3044
testCases := []struct {
3145
description string

bundle/manifests/nvidia.com_nvidiadrivers.yaml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ spec:
2222
- jsonPath: .status.state
2323
name: Status
2424
type: string
25+
- jsonPath: .spec.default
26+
name: Default
27+
type: boolean
2528
- jsonPath: .metadata.creationTimestamp
2629
name: Age
2730
type: string
@@ -48,7 +51,10 @@ spec:
4851
metadata:
4952
type: object
5053
spec:
51-
description: NVIDIADriverSpec defines the desired state of NVIDIADriver
54+
description: |-
55+
NVIDIADriverSpec defines the desired state of NVIDIADriver.
56+
The CEL validation allows non-default drivers to use nodeSelector, but requires
57+
default drivers to leave nodeSelector unset or empty.
5258
properties:
5359
annotations:
5460
additionalProperties:
@@ -70,6 +76,12 @@ spec:
7076
name:
7177
type: string
7278
type: object
79+
default:
80+
default: false
81+
description: |-
82+
Default indicates that this NVIDIADriver acts as the fallback driver daemon set manager for GPU nodes
83+
that do not match any non-default NVIDIADriver nodeSelector.
84+
type: boolean
7385
driverType:
7486
default: gpu
7587
description: DriverType defines NVIDIA driver type
@@ -968,9 +980,14 @@ spec:
968980
type: string
969981
type: object
970982
required:
983+
- default
971984
- driverType
972985
- image
973986
type: object
987+
x-kubernetes-validations:
988+
- message: default NVIDIADriver must not use nodeSelector
989+
rule: 'has(self.default) && self.default ? !has(self.nodeSelector) ||
990+
size(self.nodeSelector) == 0 : true'
974991
status:
975992
description: NVIDIADriverStatus defines the observed state of NVIDIADriver
976993
properties:

config/crd/bases/nvidia.com_nvidiadrivers.yaml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ spec:
2222
- jsonPath: .status.state
2323
name: Status
2424
type: string
25+
- jsonPath: .spec.default
26+
name: Default
27+
type: boolean
2528
- jsonPath: .metadata.creationTimestamp
2629
name: Age
2730
type: string
@@ -48,7 +51,10 @@ spec:
4851
metadata:
4952
type: object
5053
spec:
51-
description: NVIDIADriverSpec defines the desired state of NVIDIADriver
54+
description: |-
55+
NVIDIADriverSpec defines the desired state of NVIDIADriver.
56+
The CEL validation allows non-default drivers to use nodeSelector, but requires
57+
default drivers to leave nodeSelector unset or empty.
5258
properties:
5359
annotations:
5460
additionalProperties:
@@ -70,6 +76,12 @@ spec:
7076
name:
7177
type: string
7278
type: object
79+
default:
80+
default: false
81+
description: |-
82+
Default indicates that this NVIDIADriver acts as the fallback driver daemon set manager for GPU nodes
83+
that do not match any non-default NVIDIADriver nodeSelector.
84+
type: boolean
7385
driverType:
7486
default: gpu
7587
description: DriverType defines NVIDIA driver type
@@ -968,9 +980,14 @@ spec:
968980
type: string
969981
type: object
970982
required:
983+
- default
971984
- driverType
972985
- image
973986
type: object
987+
x-kubernetes-validations:
988+
- message: default NVIDIADriver must not use nodeSelector
989+
rule: 'has(self.default) && self.default ? !has(self.nodeSelector) ||
990+
size(self.nodeSelector) == 0 : true'
974991
status:
975992
description: NVIDIADriverStatus defines the observed state of NVIDIADriver
976993
properties:

0 commit comments

Comments
 (0)