Skip to content

Commit 9f08bec

Browse files
Add GPUClusterConfig v1alpha1 CRD types and scaffolding
Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>
1 parent 7f01855 commit 9f08bec

15 files changed

Lines changed: 12896 additions & 6 deletions

PROJECT

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,12 @@ resources:
2323
kind: NVIDIADriver
2424
path: github.com/NVIDIA/gpu-operator/api/v1alpha1
2525
version: v1alpha1
26+
- api:
27+
crdVersion: v1
28+
controller: true
29+
domain: com
30+
group: nvidia
31+
kind: GPUClusterConfig
32+
path: github.com/NVIDIA/gpu-operator/api/v1alpha1
33+
version: v1alpha1
2634
version: "3"
Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
/**
2+
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
**/
16+
17+
package v1alpha1
18+
19+
import (
20+
corev1 "k8s.io/api/core/v1"
21+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
22+
23+
nvidiav1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
24+
)
25+
26+
const (
27+
// GPUClusterConfigCRDName holds the string "GPUClusterConfig", which matches the kind
// registered for this API.
// NOTE(review): despite the CRDName suffix this is the Kind, not a CRD object name of
// the form <plural>.<group>; confirm callers expect the Kind.
GPUClusterConfigCRDName = "GPUClusterConfig"
28+
)
29+
30+
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
31+
32+
// GPUClusterConfigSpec defines the desired state of GPUClusterConfig, the DRA-based
33+
// software-enablement stack. Unlike ClusterPolicy, it does not manage the NVIDIA driver
34+
// or the device-plugin; the driver is installed separately (on the host or via an
35+
// NVIDIADriver CR), and GPUClusterConfig waits for driver readiness before proceeding.
36+
type GPUClusterConfigSpec struct {
37+
// DraDriver defines the spec for the NVIDIA DRA driver stack (gpus + computeDomains).
// It is the only field of this struct whose json tag does not set omitempty.
38+
DraDriver DraDriverSpec `json:"draDriver"`
39+
40+
// DCGM defines the spec for the standalone NVIDIA DCGM hostengine. Disabled by default;
41+
// when disabled, dcgm-exporter uses its embedded nv-hostengine.
42+
// NOTE: the enabled field of the reused DCGMSpec carries no server-side default, so an
43+
// omitted enabled is nil here; the controller is responsible for interpreting nil.
44+
DCGM *nvidiav1.DCGMSpec `json:"dcgm,omitempty"`
45+
46+
// DCGMExporter defines the spec for NVIDIA DCGM Exporter. Enabled by default. The reused
47+
// enabled field carries no server-side default, so the controller defaults a nil enabled.
48+
DCGMExporter *nvidiav1.DCGMExporterSpec `json:"dcgmExporter,omitempty"`
49+
50+
// GFD defines the spec for the standalone GPU Feature Discovery operand. Enabled by default.
51+
// The reused enabled field carries no server-side default, so the controller defaults a nil enabled.
52+
GFD *nvidiav1.GPUFeatureDiscoverySpec `json:"gfd,omitempty"`
53+
54+
// HostPaths defines the host paths used in host-path volumes for various components.
55+
HostPaths nvidiav1.HostPathsSpec `json:"hostPaths,omitempty"`
56+
57+
// Daemonsets defines the common configuration applied to all DaemonSets deployed
58+
// by the GPUClusterConfig controller.
59+
Daemonsets nvidiav1.DaemonsetsSpec `json:"daemonsets,omitempty"`
60+
}
61+
62+
// DraDriverSpec defines the spec for the NVIDIA DRA driver stack. There is no top-level
63+
// enabled toggle; enablement is per capability (gpus / computeDomains).
64+
type DraDriverSpec struct {
65+
// Repository is the NVIDIA DRA driver image repository.
66+
// +kubebuilder:validation:Optional
67+
Repository string `json:"repository,omitempty"`
68+
69+
// Image is the NVIDIA DRA driver image name.
// NOTE(review): the pattern below is not anchored with ^ and $; confirm whether the
// whole name is meant to be constrained.
70+
// +kubebuilder:validation:Pattern=[a-zA-Z0-9\-]+
71+
Image string `json:"image,omitempty"`
72+
73+
// Version is the NVIDIA DRA driver image tag.
74+
// +kubebuilder:validation:Optional
75+
Version string `json:"version,omitempty"`
76+
77+
// ImagePullPolicy is the image pull policy.
78+
// +kubebuilder:validation:Optional
79+
ImagePullPolicy string `json:"imagePullPolicy,omitempty"`
80+
81+
// ImagePullSecrets is the list of image pull secrets.
82+
// +kubebuilder:validation:Optional
83+
ImagePullSecrets []string `json:"imagePullSecrets,omitempty"`
84+
85+
// GPUs configures the gpu.nvidia.com capability of the DRA driver.
86+
GPUs DraDriverGPUsSpec `json:"gpus,omitempty"`
87+
88+
// ComputeDomains configures the compute-domain capability of the DRA driver.
89+
ComputeDomains DraDriverComputeDomainsSpec `json:"computeDomains,omitempty"`
90+
}
91+
92+
// DraDriverGPUsSpec configures the gpus capability of the DRA driver. It maps onto the
93+
// gpus container of the upstream kubelet-plugin DaemonSet.
94+
type DraDriverGPUsSpec struct {
95+
// Enabled indicates if the gpus capability of the DRA driver is enabled.
// NOTE(review): the enclosing gpus field is optional and declares no default, so the
// default below presumably only takes effect when a gpus object is supplied; confirm
// the controller handles a nil Enabled.
96+
// +kubebuilder:default=true
97+
Enabled *bool `json:"enabled,omitempty"`
98+
99+
// KubeletPlugin configures the kubelet-plugin workload for the gpus capability.
100+
KubeletPlugin DraDriverKubeletPluginSpec `json:"kubeletPlugin,omitempty"`
101+
}
102+
103+
// DraDriverComputeDomainsSpec configures the computeDomains capability of the DRA driver.
104+
// The kubeletPlugin maps onto the computeDomains container of the upstream kubelet-plugin
105+
// DaemonSet; the controller is a separate Deployment.
106+
type DraDriverComputeDomainsSpec struct {
107+
// Enabled indicates if the computeDomains capability of the DRA driver is enabled.
// NOTE(review): the enclosing computeDomains field is optional and declares no default,
// so the default below presumably only takes effect when a computeDomains object is
// supplied; confirm the controller handles a nil Enabled.
108+
// +kubebuilder:default=true
109+
Enabled *bool `json:"enabled,omitempty"`
110+
111+
// Controller configures the compute-domain controller Deployment.
112+
Controller DraDriverControllerSpec `json:"controller,omitempty"`
113+
114+
// KubeletPlugin configures the kubelet-plugin workload for the computeDomains capability.
115+
KubeletPlugin DraDriverKubeletPluginSpec `json:"kubeletPlugin,omitempty"`
116+
}
117+
118+
// DraDriverKubeletPluginSpec defines configuration for a DRA driver kubelet-plugin container.
119+
// Per-component scheduling fields augment/override the shared daemonsets defaults for this
120+
// workload. The gpus and computeDomains kubelet-plugin blocks map onto the two containers of
121+
// a single kubelet-plugin DaemonSet, so the renderer reconciles pod-level scheduling
122+
// (priority class, node selector, tolerations, affinity) when both blocks set it.
123+
type DraDriverKubeletPluginSpec struct {
124+
// Env is an optional list of environment variables.
125+
// +kubebuilder:validation:Optional
126+
Env []nvidiav1.EnvVar `json:"env,omitempty"`
127+
128+
// Resources optionally defines resource requests and limits for the kubelet-plugin container.
129+
// +kubebuilder:validation:Optional
130+
Resources *nvidiav1.ResourceRequirements `json:"resources,omitempty"`
131+
132+
// HealthcheckPort is the port running a gRPC health service checked by a livenessProbe.
133+
// Set to a negative value to disable the service and the probe.
134+
// +kubebuilder:validation:Optional
135+
HealthcheckPort *int32 `json:"healthcheckPort,omitempty"`
136+
137+
// +kubebuilder:validation:Optional
138+
// PriorityClassName is the priority class name for the kubelet-plugin DaemonSet pods.
139+
PriorityClassName string `json:"priorityClassName,omitempty"`
140+
141+
// +kubebuilder:validation:Optional
142+
// NodeSelector is the node selector for the kubelet-plugin DaemonSet pods.
143+
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
144+
145+
// +kubebuilder:validation:Optional
146+
// Tolerations are the tolerations for the kubelet-plugin DaemonSet pods.
147+
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
148+
149+
// +kubebuilder:validation:Optional
150+
// Affinity holds the affinity rules for the kubelet-plugin DaemonSet pods.
151+
Affinity *corev1.Affinity `json:"affinity,omitempty"`
152+
}
153+
154+
// DraDriverControllerSpec defines configuration for the compute-domain controller Deployment.
155+
// As a Deployment (not a DaemonSet) it carries its own scheduling configuration rather than
156+
// inheriting the shared daemonsets defaults.
157+
type DraDriverControllerSpec struct {
158+
// Env is an optional list of environment variables.
159+
// +kubebuilder:validation:Optional
160+
Env []nvidiav1.EnvVar `json:"env,omitempty"`
161+
162+
// Resources optionally defines resource requests and limits for the controller container.
163+
// +kubebuilder:validation:Optional
164+
Resources *nvidiav1.ResourceRequirements `json:"resources,omitempty"`
165+
166+
// +kubebuilder:validation:Optional
167+
// PriorityClassName is the priority class name for the controller Deployment pods.
168+
PriorityClassName string `json:"priorityClassName,omitempty"`
169+
170+
// +kubebuilder:validation:Optional
171+
// NodeSelector is the node selector for the controller Deployment pods.
172+
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
173+
174+
// +kubebuilder:validation:Optional
175+
// Tolerations are the tolerations for the controller Deployment pods.
176+
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
177+
178+
// +kubebuilder:validation:Optional
179+
// Affinity holds the affinity rules for the controller Deployment pods.
180+
Affinity *corev1.Affinity `json:"affinity,omitempty"`
181+
}
182+
183+
// GPUClusterConfigStatus defines the observed state of GPUClusterConfig.
184+
type GPUClusterConfigStatus struct {
185+
// +kubebuilder:validation:Enum=ignored;ready;notReady;disabled
186+
// State indicates the status of the GPUClusterConfig instance: one of ignored, ready,
// notReady or disabled. Its json tag does not set omitempty.
187+
State State `json:"state"`
188+
// Namespace indicates the namespace in which the operator and operands are installed.
189+
Namespace string `json:"namespace,omitempty"`
190+
// Conditions is a list of conditions representing the GPUClusterConfig's current state.
191+
Conditions []metav1.Condition `json:"conditions,omitempty"`
192+
}
193+
194+
// +genclient
195+
// +genclient:nonNamespaced
196+
//+kubebuilder:object:root=true
197+
//+kubebuilder:subresource:status
198+
//+kubebuilder:resource:scope=Cluster,shortName={"gcc"}
199+
//+kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.state`,priority=0
200+
// The Age column uses type=date so that it is rendered as a relative age instead of the
// raw creation timestamp.
//+kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp`,priority=0
201+
202+
// GPUClusterConfig is the Schema for the gpuclusterconfigs API. It is a cluster-scoped
// resource (short name "gcc") with a status subresource.
203+
type GPUClusterConfig struct {
204+
metav1.TypeMeta `json:",inline"`
205+
metav1.ObjectMeta `json:"metadata,omitempty"`
206+
207+
// Spec is the desired state of the GPUClusterConfig.
Spec GPUClusterConfigSpec `json:"spec,omitempty"`
208+
// Status is the observed state of the GPUClusterConfig.
Status GPUClusterConfigStatus `json:"status,omitempty"`
209+
}
210+
211+
//+kubebuilder:object:root=true
212+
213+
// GPUClusterConfigList contains a list of GPUClusterConfig objects.
214+
type GPUClusterConfigList struct {
215+
metav1.TypeMeta `json:",inline"`
216+
metav1.ListMeta `json:"metadata,omitempty"`
217+
// Items holds the GPUClusterConfig objects in the list.
Items []GPUClusterConfig `json:"items"`
218+
}

api/nvidia/v1alpha1/groupversion_info.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ var (
3838

3939
// addKnownTypes registers NVIDIADriver and GPUClusterConfig, together with their list
// types, under SchemeGroupVersion, then adds the group version to the scheme via
// metav1.AddToGroupVersion. It always returns nil.
func addKnownTypes(scheme *runtime.Scheme) error {
4040
scheme.AddKnownTypes(SchemeGroupVersion, &NVIDIADriver{}, &NVIDIADriverList{})
41+
scheme.AddKnownTypes(SchemeGroupVersion, &GPUClusterConfig{}, &GPUClusterConfigList{})
4142
metav1.AddToGroupVersion(scheme, SchemeGroupVersion)
4243
return nil
4344
}

0 commit comments

Comments
 (0)