Skip to content

Commit 4db7a7a

Browse files
authored
feat: enforce singleton Gpu CR
1 parent 9629d50 commit 4db7a7a

5 files changed

Lines changed: 59 additions & 13 deletions

File tree

api/v1beta1/gpu_types.go

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -68,15 +68,16 @@ type GpuStatus struct {
6868
// +kubebuilder:object:root=true
6969
// +kubebuilder:subresource:status
7070
// +kubebuilder:resource:scope=Cluster
71+
// +kubebuilder:validation:XValidation:rule="self.metadata.name == 'gpu'",message="only a singleton Gpu resource named 'gpu' is allowed per cluster"
7172
// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=`.status.conditions[?(@.type=="Ready")].status`
7273
// +kubebuilder:printcolumn:name="Reason",type="string",JSONPath=`.status.conditions[?(@.type=="Ready")].reason`
7374
// +kubebuilder:printcolumn:name="Driver Version",type="string",JSONPath=".status.driver.version"
7475
// +kubebuilder:printcolumn:name="Nodes Ready",type="integer",JSONPath=".status.driver.nodesReady"
7576
// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
7677

7778
// Gpu is the user-facing resource for managing GPU support in a Kyma cluster.
78-
// One Gpu resource exists per cluster. It configures the NVIDIA GPU Operator
79-
// and reports GPU health status.
79+
// One Gpu resource exists per cluster, named "gpu". It configures the NVIDIA
80+
// GPU Operator and reports GPU health status.
8081
type Gpu struct {
8182
metav1.TypeMeta `json:",inline"`
8283

config/crd/bases/gpu.kyma-project.io_gpus.yaml

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -35,8 +35,8 @@ spec:
3535
openAPIV3Schema:
3636
description: |-
3737
Gpu is the user-facing resource for managing GPU support in a Kyma cluster.
38-
One Gpu resource exists per cluster. It configures the NVIDIA GPU Operator
39-
and reports GPU health status.
38+
One Gpu resource exists per cluster, named "gpu". It configures the NVIDIA
39+
GPU Operator and reports GPU health status.
4040
properties:
4141
apiVersion:
4242
description: |-
@@ -155,6 +155,9 @@ spec:
155155
type: string
156156
type: object
157157
type: object
158+
x-kubernetes-validations:
159+
- message: only a singleton Gpu resource named 'gpu' is allowed per cluster
160+
rule: self.metadata.name == 'gpu'
158161
served: true
159162
storage: true
160163
subresources:

internal/controller/conditions.go

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -28,14 +28,15 @@ const (
2828
condValidatorPassed = "ValidatorPassed"
2929

3030
// Condition reasons.
31-
reasonWaiting = "Waiting" // outcome not yet determined; controller is still watching
32-
reasonProgressing = "Progressing" // resource exists and is converging toward the desired state
33-
reasonReady = "Ready" // condition is fully met
34-
reasonPassed = "Passed" // one-shot check succeeded (e.g. preflight)
35-
reasonFailed = "Failed" // definitively failed; requires user action
36-
reasonInstalled = "Installed" // Helm release applied successfully
37-
reasonUninstalling = "Uninstalling" // Helm release removal in progress
38-
reasonReadError = "ReadError" // Kubernetes API read failed
31+
reasonWaiting = "Waiting" // outcome not yet determined; controller is still watching
32+
reasonProgressing = "Progressing" // resource exists and is converging toward the desired state
33+
reasonReady = "Ready" // condition is fully met
34+
reasonPassed = "Passed" // one-shot check succeeded (e.g. preflight)
35+
reasonFailed = "Failed" // definitively failed; requires user action
36+
reasonInstalled = "Installed" // Helm release applied successfully
37+
reasonUninstalling = "Uninstalling" // Helm release removal in progress
38+
reasonReadError = "ReadError" // Kubernetes API read failed
39+
reasonForbiddenName = "ForbiddenCRName" // Gpu CR has a name other than the expected singleton name
3940
)
4041

4142
// computeReadySummary derives the Ready summary condition from the four managed inputs.

internal/controller/gpu_controller.go

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,7 @@ const (
5151
gpuOperatorNamespace = "gpu-operator"
5252
driverAppLabel = "nvidia-driver-daemonset"
5353
clusterPolicyName = "cluster-policy"
54+
expectedCRName = "gpu"
5455
)
5556

5657
var clusterPolicyGVK = schema.GroupVersionKind{
@@ -131,6 +132,21 @@ func (r *GpuReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
131132
return r.reconcileDelete(ctx, gpu)
132133
}
133134

135+
// Singleton enforcement (defense-in-depth; CEL is the primary gate).
136+
if gpu.Name != expectedCRName {
137+
if err := r.applyStatus(ctx, gpu.Name, statusUpdate{
138+
conditions: []metav1.Condition{{
139+
Type: condReady,
140+
Status: metav1.ConditionFalse,
141+
Reason: reasonForbiddenName,
142+
Message: fmt.Sprintf("only a singleton Gpu CR named %q is reconciled; this CR is ignored", expectedCRName),
143+
}},
144+
}); err != nil {
145+
return ctrl.Result{}, err
146+
}
147+
return ctrl.Result{}, nil
148+
}
149+
134150
if !controllerutil.ContainsFinalizer(gpu, finalizer) {
135151
controllerutil.AddFinalizer(gpu, finalizer)
136152
if err := r.Update(ctx, gpu); err != nil {
@@ -279,6 +295,16 @@ func (r *GpuReconciler) reconcileDelete(ctx context.Context, gpu *gpuv1beta1.Gpu
279295
return ctrl.Result{}, nil
280296
}
281297

298+
// Rogue CR (name != expectedCRName) somehow has our finalizer. Remove the
299+
// finalizer without calling Helm, because Uninstall would target the real release.
300+
if gpu.Name != expectedCRName {
301+
controllerutil.RemoveFinalizer(gpu, finalizer)
302+
if err := r.Update(ctx, gpu); err != nil {
303+
return ctrl.Result{}, fmt.Errorf("removing finalizer from rogue CR: %w", err)
304+
}
305+
return ctrl.Result{}, nil
306+
}
307+
282308
logger.Info("Gpu CR deleted, uninstalling GPU Operator")
283309

284310
// Best-effort status update - do not block deletion if this fails.

internal/controller/gpu_controller_test.go

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,7 @@ func (f *fakeInstaller) Uninstall(_ context.Context) error {
5555
var _ helm.Installer = &fakeInstaller{}
5656

5757
var _ = Describe("GpuReconciler", func() {
58-
const gpuName = "test-gpu"
58+
const gpuName = "gpu"
5959

6060
var (
6161
reconciler *GpuReconciler
@@ -93,6 +93,21 @@ var _ = Describe("GpuReconciler", func() {
9393
})
9494
})
9595

96+
Describe("singleton enforcement", func() {
97+
It("rejects creation of any Gpu CR whose name is not 'gpu' via CEL on the CRD", func() {
98+
// CEL rejects at admission; the reconciler-side check is defense-in-depth
99+
// for the case where CEL is not in effect, which envtest cannot simulate.
100+
rogue := &gpuv1beta1.Gpu{
101+
ObjectMeta: metav1.ObjectMeta{Name: "rogue-gpu"},
102+
}
103+
err := k8sClient.Create(ctx, rogue)
104+
Expect(err).To(HaveOccurred())
105+
Expect(err.Error()).To(ContainSubstring("singleton Gpu resource named 'gpu'"))
106+
107+
Expect(installer.installCalls).To(Equal(0))
108+
})
109+
})
110+
96111
Describe("preflight", func() {
97112
BeforeEach(func() {
98113
newGpu(gpuName)

0 commit comments

Comments
 (0)