Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions api/v1beta1/gpu_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,15 +68,16 @@ type GpuStatus struct {
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:resource:scope=Cluster
// +kubebuilder:validation:XValidation:rule="self.metadata.name == 'gpu'",message="only a singleton Gpu resource named 'gpu' is allowed per cluster"
// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=`.status.conditions[?(@.type=="Ready")].status`
// +kubebuilder:printcolumn:name="Reason",type="string",JSONPath=`.status.conditions[?(@.type=="Ready")].reason`
// +kubebuilder:printcolumn:name="Driver Version",type="string",JSONPath=".status.driver.version"
// +kubebuilder:printcolumn:name="Nodes Ready",type="integer",JSONPath=".status.driver.nodesReady"
// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"

// Gpu is the user-facing resource for managing GPU support in a Kyma cluster.
// One Gpu resource exists per cluster. It configures the NVIDIA GPU Operator
// and reports GPU health status.
// One Gpu resource exists per cluster, named "gpu". It configures the NVIDIA
// GPU Operator and reports GPU health status.
type Gpu struct {
metav1.TypeMeta `json:",inline"`

Expand Down
7 changes: 5 additions & 2 deletions config/crd/bases/gpu.kyma-project.io_gpus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ spec:
openAPIV3Schema:
description: |-
Gpu is the user-facing resource for managing GPU support in a Kyma cluster.
One Gpu resource exists per cluster. It configures the NVIDIA GPU Operator
and reports GPU health status.
One Gpu resource exists per cluster, named "gpu". It configures the NVIDIA
GPU Operator and reports GPU health status.
properties:
apiVersion:
description: |-
Expand Down Expand Up @@ -155,6 +155,9 @@ spec:
type: string
type: object
type: object
x-kubernetes-validations:
- message: only a singleton Gpu resource named 'gpu' is allowed per cluster
rule: self.metadata.name == 'gpu'
served: true
storage: true
subresources:
Expand Down
17 changes: 9 additions & 8 deletions internal/controller/conditions.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,15 @@ const (
condValidatorPassed = "ValidatorPassed"

// Condition reasons.
reasonWaiting = "Waiting" // outcome not yet determined; controller is still watching
reasonProgressing = "Progressing" // resource exists and is converging toward the desired state
reasonReady = "Ready" // condition is fully met
reasonPassed = "Passed" // one-shot check succeeded (e.g. preflight)
reasonFailed = "Failed" // definitively failed; requires user action
reasonInstalled = "Installed" // Helm release applied successfully
reasonUninstalling = "Uninstalling" // Helm release removal in progress
reasonReadError = "ReadError" // Kubernetes API read failed
reasonWaiting = "Waiting" // outcome not yet determined; controller is still watching
reasonProgressing = "Progressing" // resource exists and is converging toward the desired state
reasonReady = "Ready" // condition is fully met
reasonPassed = "Passed" // one-shot check succeeded (e.g. preflight)
reasonFailed = "Failed" // definitively failed; requires user action
reasonInstalled = "Installed" // Helm release applied successfully
reasonUninstalling = "Uninstalling" // Helm release removal in progress
reasonReadError = "ReadError" // Kubernetes API read failed
reasonForbiddenName = "ForbiddenCRName" // Gpu CR has a name other than the expected singleton name
)

// computeReadySummary derives the Ready summary condition from the four managed inputs.
Expand Down
26 changes: 26 additions & 0 deletions internal/controller/gpu_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ const (
gpuOperatorNamespace = "gpu-operator"
driverAppLabel = "nvidia-driver-daemonset"
clusterPolicyName = "cluster-policy"
expectedCRName = "gpu"
)

var clusterPolicyGVK = schema.GroupVersionKind{
Expand Down Expand Up @@ -131,6 +132,21 @@ func (r *GpuReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
return r.reconcileDelete(ctx, gpu)
}

// Singleton enforcement (defense-in-depth; CEL is the primary gate).
// The CRD's validation rule is expected to reject any name other than
// expectedCRName at admission, so this branch only matters if such an object
// exists anyway. It is placed ahead of the finalizer handling that follows,
// so this path never adds our finalizer to a misnamed CR.
if gpu.Name != expectedCRName {
// Record Ready=False with reasonForbiddenName on the misnamed CR itself so
// the reason it is being ignored is visible in its status.
if err := r.applyStatus(ctx, gpu.Name, statusUpdate{
conditions: []metav1.Condition{{
Type: condReady,
Status: metav1.ConditionFalse,
Reason: reasonForbiddenName,
Message: fmt.Sprintf("only a singleton Gpu CR named %q is reconciled; this CR is ignored", expectedCRName),
}},
}); err != nil {
// The status could not be written; hand the error back to the caller.
return ctrl.Result{}, err
}
// Status recorded; nothing further is done for this CR.
return ctrl.Result{}, nil
}
Comment thread
vrdc-sap marked this conversation as resolved.

if !controllerutil.ContainsFinalizer(gpu, finalizer) {
controllerutil.AddFinalizer(gpu, finalizer)
if err := r.Update(ctx, gpu); err != nil {
Expand Down Expand Up @@ -279,6 +295,16 @@ func (r *GpuReconciler) reconcileDelete(ctx context.Context, gpu *gpuv1beta1.Gpu
return ctrl.Result{}, nil
}

// Rogue CR (name != expectedCRName) somehow has our finalizer. Drop it
// without calling Helm - Uninstall would target the real release.
if gpu.Name != expectedCRName {
// Strip the finalizer from the in-memory object, then persist that change
// with Update so the deletion is no longer held back by our finalizer.
controllerutil.RemoveFinalizer(gpu, finalizer)
if err := r.Update(ctx, gpu); err != nil {
return ctrl.Result{}, fmt.Errorf("removing finalizer from rogue CR: %w", err)
}
// Finalizer dropped; return before the uninstall path that follows.
return ctrl.Result{}, nil
}

logger.Info("Gpu CR deleted, uninstalling GPU Operator")

// Best-effort status update - do not block deletion if this fails.
Expand Down
17 changes: 16 additions & 1 deletion internal/controller/gpu_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ func (f *fakeInstaller) Uninstall(_ context.Context) error {
var _ helm.Installer = &fakeInstaller{}

var _ = Describe("GpuReconciler", func() {
const gpuName = "test-gpu"
const gpuName = "gpu"

var (
reconciler *GpuReconciler
Expand Down Expand Up @@ -93,6 +93,21 @@ var _ = Describe("GpuReconciler", func() {
})
})

// Verifies the name restriction on Gpu resources: a CR whose name is not
// "gpu" must be refused when it is created.
Describe("singleton enforcement", func() {
It("rejects creation of any Gpu CR whose name is not 'gpu' via CEL on the CRD", func() {
// CEL rejects at admission; the reconciler-side check is defense-in-depth
// for the case where CEL is not in effect, which envtest cannot simulate.
rogue := &gpuv1beta1.Gpu{
ObjectMeta: metav1.ObjectMeta{Name: "rogue-gpu"},
}
// Create must fail, and the error text must carry the validation message
// declared on the CRD.
err := k8sClient.Create(ctx, rogue)
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("singleton Gpu resource named 'gpu'"))

// No install call may have been recorded for the rejected CR.
Expect(installer.installCalls).To(Equal(0))
})
})

Describe("preflight", func() {
BeforeEach(func() {
newGpu(gpuName)
Expand Down
Loading