Skip to content

Commit 87fa6c0

Browse files
Add DRA driver operand to GPUClusterConfig controller
Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>
1 parent ccc0f7a commit 87fa6c0

36 files changed

Lines changed: 2304 additions & 89 deletions

api/nvidia/v1alpha1/gpuclusterconfig_types.go

Lines changed: 35 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,13 @@ const (
3939
// or the device-plugin; the driver is installed separately (host-installed or via an
4040
// NVIDIADriver CR) and GPUClusterConfig waits for driver readiness before proceeding.
4141
type GPUClusterConfigSpec struct {
42-
// DraDriver defines the spec for the NVIDIA DRA driver stack (gpus + computeDomains).
43-
DraDriver DraDriverSpec `json:"draDriver"`
42+
// DRADriver defines the spec for the NVIDIA DRA driver stack (gpus + computeDomains).
43+
DRADriver DRADriverSpec `json:"draDriver"`
4444

4545
// DCGM defines the spec for the standalone NVIDIA DCGM hostengine. Disabled by default;
46-
// when disabled, dcgm-exporter uses its embedded nv-hostengine.
47-
// NOTE: the reused enabled field carries no server-side default, so an omitted enabled
48-
// is nil here; the controller is responsible for interpreting nil (see defaults handling).
46+
// when disabled, dcgm-exporter uses its embedded nv-hostengine. NOTE: the reused enabled
47+
// field carries no server-side default and its IsEnabled() treats nil as enabled, so the
48+
// controller must default nil enabled to disabled here.
4949
DCGM *nvidiav1.DCGMSpec `json:"dcgm,omitempty"`
5050

5151
// DCGMExporter defines the spec for NVIDIA DCGM Exporter. Enabled by default, but the
@@ -64,9 +64,9 @@ type GPUClusterConfigSpec struct {
6464
Daemonsets nvidiav1.DaemonsetsSpec `json:"daemonsets,omitempty"`
6565
}
6666

67-
// DraDriverSpec defines the spec for the NVIDIA DRA driver stack. There is no top-level
67+
// DRADriverSpec defines the spec for the NVIDIA DRA driver stack. There is no top-level
6868
// enabled toggle; enablement is per capability (gpus / computeDomains).
69-
type DraDriverSpec struct {
69+
type DRADriverSpec struct {
7070
// NVIDIA DRA driver image repository
7171
// +kubebuilder:validation:Optional
7272
Repository string `json:"repository,omitempty"`
@@ -87,45 +87,60 @@ type DraDriverSpec struct {
8787
// +kubebuilder:validation:Optional
8888
ImagePullSecrets []string `json:"imagePullSecrets,omitempty"`
8989

90+
// FeatureGates is a map of feature gate names to a boolean enabling or disabling each.
91+
// It is rendered as the FEATURE_GATES environment variable on the DRA driver containers.
92+
// +kubebuilder:validation:Optional
93+
FeatureGates map[string]bool `json:"featureGates,omitempty"`
94+
9095
// GPUs configures the gpu.nvidia.com capability of the DRA driver.
91-
GPUs DraDriverGPUsSpec `json:"gpus,omitempty"`
96+
GPUs DRADriverGPUsSpec `json:"gpus,omitempty"`
9297

9398
// ComputeDomains configures the compute-domain capability of the DRA driver.
94-
ComputeDomains DraDriverComputeDomainsSpec `json:"computeDomains,omitempty"`
99+
ComputeDomains DRADriverComputeDomainsSpec `json:"computeDomains,omitempty"`
100+
}
101+
102+
// IsGPUsEnabled reports whether the gpus capability of the DRA driver is enabled; a nil Enabled is treated as disabled.
103+
func (d *DRADriverSpec) IsGPUsEnabled() bool {
104+
return d.GPUs.Enabled != nil && *d.GPUs.Enabled
105+
}
106+
107+
// IsComputeDomainsEnabled reports whether the computeDomains capability of the DRA driver is enabled; a nil Enabled is treated as disabled.
108+
func (d *DRADriverSpec) IsComputeDomainsEnabled() bool {
109+
return d.ComputeDomains.Enabled != nil && *d.ComputeDomains.Enabled
95110
}
96111

97-
// DraDriverGPUsSpec configures the gpus capability of the DRA driver. It maps onto the
112+
// DRADriverGPUsSpec configures the gpus capability of the DRA driver. It maps onto the
98113
// gpus container of the upstream kubelet-plugin DaemonSet.
99-
type DraDriverGPUsSpec struct {
114+
type DRADriverGPUsSpec struct {
100115
// Enabled indicates if the gpus capability of the DRA driver is enabled.
101116
// +kubebuilder:default=true
102117
Enabled *bool `json:"enabled,omitempty"`
103118

104119
// KubeletPlugin configures the kubelet-plugin workload for the gpus capability.
105-
KubeletPlugin DraDriverKubeletPluginSpec `json:"kubeletPlugin,omitempty"`
120+
KubeletPlugin DRADriverKubeletPluginSpec `json:"kubeletPlugin,omitempty"`
106121
}
107122

108-
// DraDriverComputeDomainsSpec configures the computeDomains capability of the DRA driver.
123+
// DRADriverComputeDomainsSpec configures the computeDomains capability of the DRA driver.
109124
// The kubeletPlugin maps onto the computeDomains container of the upstream kubelet-plugin
110125
// DaemonSet; the controller is a separate Deployment.
111-
type DraDriverComputeDomainsSpec struct {
126+
type DRADriverComputeDomainsSpec struct {
112127
// Enabled indicates if the computeDomains capability of the DRA driver is enabled.
113128
// +kubebuilder:default=true
114129
Enabled *bool `json:"enabled,omitempty"`
115130

116131
// Controller configures the compute-domain controller Deployment.
117-
Controller DraDriverControllerSpec `json:"controller,omitempty"`
132+
Controller DRADriverControllerSpec `json:"controller,omitempty"`
118133

119134
// KubeletPlugin configures the kubelet-plugin workload for the computeDomains capability.
120-
KubeletPlugin DraDriverKubeletPluginSpec `json:"kubeletPlugin,omitempty"`
135+
KubeletPlugin DRADriverKubeletPluginSpec `json:"kubeletPlugin,omitempty"`
121136
}
122137

123-
// DraDriverKubeletPluginSpec defines configuration for a DRA driver kubelet-plugin container.
138+
// DRADriverKubeletPluginSpec defines configuration for a DRA driver kubelet-plugin container.
124139
// Per-component scheduling fields augment/override the shared daemonsets defaults for this
125140
// workload. The gpus and computeDomains kubelet-plugin blocks map onto the two containers of
126141
// a single kubelet-plugin DaemonSet, so the renderer reconciles pod-level scheduling when
127142
// both blocks set it.
128-
type DraDriverKubeletPluginSpec struct {
143+
type DRADriverKubeletPluginSpec struct {
129144
// Optional: List of environment variables
130145
// +kubebuilder:validation:Optional
131146
Env []nvidiav1.EnvVar `json:"env,omitempty"`
@@ -156,10 +171,10 @@ type DraDriverKubeletPluginSpec struct {
156171
Affinity *corev1.Affinity `json:"affinity,omitempty"`
157172
}
158173

159-
// DraDriverControllerSpec defines configuration for the compute-domain controller Deployment.
174+
// DRADriverControllerSpec defines configuration for the compute-domain controller Deployment.
160175
// As a Deployment (not a DaemonSet) it carries its own scheduling configuration rather than
161176
// inheriting the shared daemonsets defaults.
162-
type DraDriverControllerSpec struct {
177+
type DRADriverControllerSpec struct {
163178
// Optional: List of environment variables
164179
// +kubebuilder:validation:Optional
165180
Env []nvidiav1.EnvVar `json:"env,omitempty"`

api/nvidia/v1alpha1/zz_generated.deepcopy.go

Lines changed: 28 additions & 21 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/nvidia.com_gpuclusterconfigs.yaml

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -365,9 +365,9 @@ spec:
365365
dcgm:
366366
description: |-
367367
DCGM defines the spec for the standalone NVIDIA DCGM hostengine. Disabled by default;
368-
when disabled, dcgm-exporter uses its embedded nv-hostengine.
369-
NOTE: the reused enabled field carries no server-side default, so an omitted enabled
370-
is nil here; the controller is responsible for interpreting nil (see defaults handling).
368+
when disabled, dcgm-exporter uses its embedded nv-hostengine. NOTE: the reused enabled
369+
field carries no server-side default and its IsEnabled() treats nil as enabled, so the
370+
controller must default nil enabled to disabled here.
371371
properties:
372372
args:
373373
description: 'Optional: List of arguments'
@@ -716,7 +716,7 @@ spec:
716716
type: string
717717
type: object
718718
draDriver:
719-
description: DraDriver defines the spec for the NVIDIA DRA driver
719+
description: DRADriver defines the spec for the NVIDIA DRA driver
720720
stack (gpus + computeDomains).
721721
properties:
722722
computeDomains:
@@ -2795,6 +2795,13 @@ spec:
27952795
type: array
27962796
type: object
27972797
type: object
2798+
featureGates:
2799+
additionalProperties:
2800+
type: boolean
2801+
description: |-
2802+
FeatureGates is a map of feature gate names to a boolean enabling or disabling each.
2803+
It is rendered as the FEATURE_GATES environment variable on the DRA driver containers.
2804+
type: object
27982805
gpus:
27992806
description: GPUs configures the gpu.nvidia.com capability of
28002807
the DRA driver.
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
---
2+
apiVersion: apiextensions.k8s.io/v1
3+
kind: CustomResourceDefinition
4+
metadata:
5+
annotations:
6+
controller-gen.kubebuilder.io/version: v0.17.1
7+
name: computedomaincliques.resource.nvidia.com
8+
spec:
9+
group: resource.nvidia.com
10+
names:
11+
kind: ComputeDomainClique
12+
listKind: ComputeDomainCliqueList
13+
plural: computedomaincliques
14+
singular: computedomainclique
15+
scope: Namespaced
16+
versions:
17+
- name: v1beta1
18+
schema:
19+
openAPIV3Schema:
20+
description: |-
21+
ComputeDomainClique holds information about a specific clique within a ComputeDomain.
22+
It is created in the driver namespace and named as "<computeDomainUID>.<cliqueID>".
23+
properties:
24+
apiVersion:
25+
description: |-
26+
APIVersion defines the versioned schema of this representation of an object.
27+
Servers should convert recognized schemas to the latest internal value, and
28+
may reject unrecognized values.
29+
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
30+
type: string
31+
daemons:
32+
items:
33+
description: ComputeDomainDaemonInfo provides information about each
34+
daemon in a ComputeDomainClique.
35+
properties:
36+
cliqueID:
37+
type: string
38+
index:
39+
description: |-
40+
The Index field is used to ensure a consistent IP-to-DNS name
41+
mapping across all machines within an IMEX domain. Each node's index
42+
directly determines its DNS name within a given NVLink partition
43+
(i.e. clique). In other words, the 2-tuple of (CliqueID, Index) will
44+
always be unique. This field is marked as optional (but not
45+
omitempty) in order to support downgrades and avoid an API bump.
46+
type: integer
47+
ipAddress:
48+
type: string
49+
nodeName:
50+
type: string
51+
status:
52+
default: NotReady
53+
description: |-
54+
The Status field tracks the readiness of the IMEX daemon running on
55+
this node. It gets switched to Ready whenever the IMEX daemon is
56+
ready to broker GPU memory exchanges and switches to NotReady when
57+
it is not. It is marked as optional in order to support downgrades
58+
and avoid an API bump.
59+
enum:
60+
- Ready
61+
- NotReady
62+
type: string
63+
required:
64+
- cliqueID
65+
- ipAddress
66+
- nodeName
67+
type: object
68+
type: array
69+
x-kubernetes-list-map-keys:
70+
- nodeName
71+
x-kubernetes-list-type: map
72+
kind:
73+
description: |-
74+
Kind is a string value representing the REST resource this object represents.
75+
Servers may infer this from the endpoint the client submits requests to.
76+
Cannot be updated.
77+
In CamelCase.
78+
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
79+
type: string
80+
metadata:
81+
type: object
82+
type: object
83+
served: true
84+
storage: true

0 commit comments

Comments
 (0)