|
| 1 | +/** |
| 2 | +# Copyright (c) NVIDIA CORPORATION. All rights reserved. |
| 3 | +# |
| 4 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +# you may not use this file except in compliance with the License. |
| 6 | +# You may obtain a copy of the License at |
| 7 | +# |
| 8 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +# |
| 10 | +# Unless required by applicable law or agreed to in writing, software |
| 11 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +# See the License for the specific language governing permissions and |
| 14 | +# limitations under the License. |
| 15 | +**/ |
| 16 | + |
| 17 | +package v1alpha1 |
| 18 | + |
| 19 | +import ( |
| 20 | + corev1 "k8s.io/api/core/v1" |
| 21 | + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" |
| 22 | + |
| 23 | + nvidiav1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" |
| 24 | +) |
| 25 | + |
// GPUClusterConfigCRDName is the string "GPUClusterConfig", which matches the name of
// the GPUClusterConfig type declared in this file.
const GPUClusterConfigCRDName = "GPUClusterConfig"
| 29 | + |
| 30 | +// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. |
| 31 | + |
// GPUClusterConfigSpec defines the desired state of GPUClusterConfig, the DRA-based
// software-enablement stack. Unlike ClusterPolicy, it does not manage the NVIDIA driver
// or the device-plugin; the driver is installed separately (host-installed or via an
// NVIDIADriver CR) and GPUClusterConfig waits for driver readiness before proceeding.
type GPUClusterConfigSpec struct {
	// DraDriver defines the spec for the NVIDIA DRA driver stack (gpus + computeDomains).
	// It is the only field of this spec whose JSON tag does not carry omitempty.
	DraDriver DraDriverSpec `json:"draDriver"`

	// DCGM defines the spec for the standalone NVIDIA DCGM hostengine. Disabled by default;
	// when disabled, dcgm-exporter uses its embedded nv-hostengine.
	// NOTE: the reused enabled field carries no server-side default, so an omitted enabled
	// is nil here; the controller is responsible for interpreting nil.
	DCGM *nvidiav1.DCGMSpec `json:"dcgm,omitempty"`

	// DCGMExporter defines the spec for NVIDIA DCGM Exporter. Enabled by default.
	// NOTE: the reused enabled field carries no server-side default, so an omitted enabled
	// is nil here; the controller defaults a nil enabled.
	DCGMExporter *nvidiav1.DCGMExporterSpec `json:"dcgmExporter,omitempty"`

	// GFD defines the spec for the standalone GPU Feature Discovery operand. Enabled by default.
	// NOTE: the reused enabled field carries no server-side default, so an omitted enabled
	// is nil here; the controller defaults a nil enabled.
	GFD *nvidiav1.GPUFeatureDiscoverySpec `json:"gfd,omitempty"`

	// HostPaths defines the host paths used in host-path volumes for various components.
	HostPaths nvidiav1.HostPathsSpec `json:"hostPaths,omitempty"`

	// Daemonsets defines the common configuration applied to all DaemonSets deployed
	// by the GPUClusterConfig controller.
	Daemonsets nvidiav1.DaemonsetsSpec `json:"daemonsets,omitempty"`
}
| 61 | + |
// DraDriverSpec defines the spec for the NVIDIA DRA driver stack. There is no top-level
// enabled toggle; enablement is per capability (gpus / computeDomains).
//
// The image-related fields (Repository, Image, Version, ImagePullPolicy, ImagePullSecrets)
// are declared once here, alongside the per-capability GPUs and ComputeDomains blocks.
type DraDriverSpec struct {
	// Repository is the NVIDIA DRA driver image repository.
	// +kubebuilder:validation:Optional
	Repository string `json:"repository,omitempty"`

	// Image is the NVIDIA DRA driver image name.
	// +kubebuilder:validation:Pattern=[a-zA-Z0-9\-]+
	Image string `json:"image,omitempty"`

	// Version is the NVIDIA DRA driver image tag.
	// +kubebuilder:validation:Optional
	Version string `json:"version,omitempty"`

	// ImagePullPolicy is the image pull policy.
	// +kubebuilder:validation:Optional
	ImagePullPolicy string `json:"imagePullPolicy,omitempty"`

	// ImagePullSecrets is the list of image pull secrets.
	// +kubebuilder:validation:Optional
	ImagePullSecrets []string `json:"imagePullSecrets,omitempty"`

	// GPUs configures the gpu.nvidia.com capability of the DRA driver.
	GPUs DraDriverGPUsSpec `json:"gpus,omitempty"`

	// ComputeDomains configures the compute-domain capability of the DRA driver.
	ComputeDomains DraDriverComputeDomainsSpec `json:"computeDomains,omitempty"`
}
| 91 | + |
// DraDriverGPUsSpec configures the gpus capability of the DRA driver. It maps onto the
// gpus container of the upstream kubelet-plugin DaemonSet.
//
// NOTE(review): Enabled is a pointer with a schema default of true; confirm how the
// controller treats a nil Enabled when the whole gpus block is omitted.
type DraDriverGPUsSpec struct {
	// Enabled indicates if the gpus capability of the DRA driver is enabled.
	// +kubebuilder:default=true
	Enabled *bool `json:"enabled,omitempty"`

	// KubeletPlugin configures the kubelet-plugin workload for the gpus capability.
	KubeletPlugin DraDriverKubeletPluginSpec `json:"kubeletPlugin,omitempty"`
}
| 102 | + |
// DraDriverComputeDomainsSpec configures the computeDomains capability of the DRA driver.
// The kubeletPlugin maps onto the computeDomains container of the upstream kubelet-plugin
// DaemonSet; the controller is a separate Deployment.
//
// NOTE(review): Enabled is a pointer with a schema default of true; confirm how the
// controller treats a nil Enabled when the whole computeDomains block is omitted.
type DraDriverComputeDomainsSpec struct {
	// Enabled indicates if the computeDomains capability of the DRA driver is enabled.
	// +kubebuilder:default=true
	Enabled *bool `json:"enabled,omitempty"`

	// Controller configures the compute-domain controller Deployment.
	Controller DraDriverControllerSpec `json:"controller,omitempty"`

	// KubeletPlugin configures the kubelet-plugin workload for the computeDomains capability.
	KubeletPlugin DraDriverKubeletPluginSpec `json:"kubeletPlugin,omitempty"`
}
| 117 | + |
// DraDriverKubeletPluginSpec defines configuration for a DRA driver kubelet-plugin container.
// Per-component scheduling fields augment/override the shared daemonsets defaults for this
// workload. The gpus and computeDomains kubelet-plugin blocks map onto the two containers of
// a single kubelet-plugin DaemonSet, so the renderer reconciles pod-level scheduling when
// both blocks set it.
//
// Env, Resources and HealthcheckPort are the container-level settings; PriorityClassName,
// NodeSelector, Tolerations and Affinity are the pod-level scheduling settings referred
// to above.
type DraDriverKubeletPluginSpec struct {
	// Env is an optional list of environment variables.
	// +kubebuilder:validation:Optional
	Env []nvidiav1.EnvVar `json:"env,omitempty"`

	// Resources optionally defines resources requests and limits for the kubelet-plugin container.
	// +kubebuilder:validation:Optional
	Resources *nvidiav1.ResourceRequirements `json:"resources,omitempty"`

	// HealthcheckPort is the port running a gRPC health service checked by a livenessProbe.
	// Set to a negative value to disable the service and the probe.
	// +kubebuilder:validation:Optional
	HealthcheckPort *int32 `json:"healthcheckPort,omitempty"`

	// PriorityClassName for the kubelet-plugin DaemonSet pods.
	// +kubebuilder:validation:Optional
	PriorityClassName string `json:"priorityClassName,omitempty"`

	// NodeSelector for the kubelet-plugin DaemonSet pods.
	// +kubebuilder:validation:Optional
	NodeSelector map[string]string `json:"nodeSelector,omitempty"`

	// Tolerations for the kubelet-plugin DaemonSet pods.
	// +kubebuilder:validation:Optional
	Tolerations []corev1.Toleration `json:"tolerations,omitempty"`

	// Affinity rules for the kubelet-plugin DaemonSet pods.
	// +kubebuilder:validation:Optional
	Affinity *corev1.Affinity `json:"affinity,omitempty"`
}
| 153 | + |
// DraDriverControllerSpec defines configuration for the compute-domain controller Deployment.
// As a Deployment (not a DaemonSet) it carries its own scheduling configuration rather than
// inheriting the shared daemonsets defaults.
//
// The fields mirror DraDriverKubeletPluginSpec except that there is no HealthcheckPort.
type DraDriverControllerSpec struct {
	// Env is an optional list of environment variables.
	// +kubebuilder:validation:Optional
	Env []nvidiav1.EnvVar `json:"env,omitempty"`

	// Resources optionally defines resources requests and limits for the controller container.
	// +kubebuilder:validation:Optional
	Resources *nvidiav1.ResourceRequirements `json:"resources,omitempty"`

	// PriorityClassName for the controller Deployment pods.
	// +kubebuilder:validation:Optional
	PriorityClassName string `json:"priorityClassName,omitempty"`

	// NodeSelector for the controller Deployment pods.
	// +kubebuilder:validation:Optional
	NodeSelector map[string]string `json:"nodeSelector,omitempty"`

	// Tolerations for the controller Deployment pods.
	// +kubebuilder:validation:Optional
	Tolerations []corev1.Toleration `json:"tolerations,omitempty"`

	// Affinity rules for the controller Deployment pods.
	// +kubebuilder:validation:Optional
	Affinity *corev1.Affinity `json:"affinity,omitempty"`
}
| 182 | + |
// GPUClusterConfigStatus defines the observed state of GPUClusterConfig.
type GPUClusterConfigStatus struct {
	// State indicates the status of the GPUClusterConfig instance. The enum marker
	// restricts it to ignored, ready, notReady or disabled. Its JSON tag has no
	// omitempty, so the key is always serialized.
	// +kubebuilder:validation:Enum=ignored;ready;notReady;disabled
	State State `json:"state"`
	// Namespace indicates the namespace in which the operator and operands are installed.
	Namespace string `json:"namespace,omitempty"`
	// Conditions is a list of conditions representing the GPUClusterConfig's current state.
	Conditions []metav1.Condition `json:"conditions,omitempty"`
}
| 193 | + |
| 194 | +// +genclient |
| 195 | +// +genclient:nonNamespaced |
| 196 | +//+kubebuilder:object:root=true |
| 197 | +//+kubebuilder:subresource:status |
| 198 | +//+kubebuilder:resource:scope=Cluster,shortName={"gcc"} |
| 199 | +//+kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.state`,priority=0 |
| 200 | +//+kubebuilder:printcolumn:name="Age",type=string,JSONPath=`.metadata.creationTimestamp`,priority=0 |
| 201 | + |
| 202 | +// GPUClusterConfig is the Schema for the gpuclusterconfigs API |
| 203 | +type GPUClusterConfig struct { |
| 204 | + metav1.TypeMeta `json:",inline"` |
| 205 | + metav1.ObjectMeta `json:"metadata,omitempty"` |
| 206 | + |
| 207 | + Spec GPUClusterConfigSpec `json:"spec,omitempty"` |
| 208 | + Status GPUClusterConfigStatus `json:"status,omitempty"` |
| 209 | +} |
| 210 | + |
//+kubebuilder:object:root=true

// GPUClusterConfigList contains a list of GPUClusterConfig. Items carries the
// GPUClusterConfig objects; its JSON tag has no omitempty, so the "items" key is
// always serialized.
type GPUClusterConfigList struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ListMeta `json:"metadata,omitempty"`
	Items []GPUClusterConfig `json:"items"`
}
0 commit comments