Skip to content

Commit f374249

Browse files
authored
feat: harden deletion with timeout, namespace cleanup
1 parent 8ad4cee commit f374249

5 files changed

Lines changed: 140 additions & 15 deletions

File tree

config/rbac/role.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ rules:
2424
- namespaces
2525
verbs:
2626
- create
27+
- delete
2728
- get
2829
- list
2930
- watch

internal/controller/gpu_controller.go

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ package controller
1616

1717
import (
1818
"context"
19+
"errors"
1920
"fmt"
2021
"strings"
2122
"time"
@@ -47,6 +48,7 @@ import (
4748

4849
const (
4950
requeueWarn = 30 * time.Second
51+
deleteTimeout = 3 * time.Minute
5052
finalizer = "gpu.kyma-project.io/gpu-operator"
5153
gpuOperatorNamespace = "gpu-operator"
5254
driverAppLabel = "nvidia-driver-daemonset"
@@ -92,7 +94,7 @@ type GpuReconciler struct {
9294
// +kubebuilder:rbac:groups=gpu.kyma-project.io,resources=gpus/status,verbs=get;update;patch
9395
// +kubebuilder:rbac:groups=gpu.kyma-project.io,resources=gpus/finalizers,verbs=update
9496
// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch
95-
// +kubebuilder:rbac:groups="",resources=namespaces,verbs=get;list;watch;create
97+
// +kubebuilder:rbac:groups="",resources=namespaces,verbs=get;list;watch;create;delete
9698
// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch;create;update;patch;delete
9799
// +kubebuilder:rbac:groups="",resources=serviceaccounts;configmaps,verbs=get;list;watch;create;update;patch;delete
98100
// +kubebuilder:rbac:groups=apps,resources=deployments;daemonsets,verbs=get;list;watch;create;update;patch;delete
@@ -318,13 +320,43 @@ func (r *GpuReconciler) reconcileDelete(ctx context.Context, gpu *gpuv1beta1.Gpu
318320
}
319321

320322
// Uninstall is idempotent - returns nil if the release is already gone.
321-
if err := r.Installer.Uninstall(ctx); err != nil {
323+
if err := r.Installer.Uninstall(ctx, deleteTimeout); err != nil {
324+
if errors.Is(err, context.DeadlineExceeded) {
325+
logger.Error(err, "helm uninstall timed out, forcing finalizer removal; manual cleanup of gpu-operator namespace may be required")
326+
if nsErr := r.deleteNamespace(ctx, gpuOperatorNamespace); nsErr != nil {
327+
logger.Error(nsErr, "failed to delete namespace after helm timeout")
328+
}
329+
return r.removeFinalizer(ctx, gpu.Name)
330+
}
322331
return ctrl.Result{}, fmt.Errorf("helm uninstall: %w", err)
323332
}
324333

325-
// Re-read after the status apply above bumped ResourceVersion.
334+
// Helm never deletes namespaces; clean it up explicitly with foreground propagation
335+
// so all child resources are gone before the namespace itself disappears.
336+
if err := r.deleteNamespace(ctx, gpuOperatorNamespace); err != nil {
337+
return ctrl.Result{}, err
338+
}
339+
340+
return r.removeFinalizer(ctx, gpu.Name)
341+
}
342+
343+
func (r *GpuReconciler) deleteNamespace(ctx context.Context, name string) error {
344+
ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: name}}
345+
if err := r.Delete(ctx, ns, client.PropagationPolicy(metav1.DeletePropagationForeground)); err != nil {
346+
if apierrors.IsNotFound(err) {
347+
return nil
348+
}
349+
return fmt.Errorf("deleting namespace %s: %w", name, err)
350+
}
351+
return nil
352+
}
353+
354+
func (r *GpuReconciler) removeFinalizer(ctx context.Context, name string) (ctrl.Result, error) {
326355
live := &gpuv1beta1.Gpu{}
327-
if err := r.Get(ctx, types.NamespacedName{Name: gpu.Name}, live); err != nil {
356+
if err := r.Get(ctx, types.NamespacedName{Name: name}, live); err != nil {
357+
if apierrors.IsNotFound(err) {
358+
return ctrl.Result{}, nil
359+
}
328360
return ctrl.Result{}, fmt.Errorf("re-fetching Gpu CR for finalizer removal: %w", err)
329361
}
330362
controllerutil.RemoveFinalizer(live, finalizer)

internal/controller/gpu_controller_test.go

Lines changed: 89 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ package controller
1717
import (
1818
"context"
1919
"errors"
20+
"fmt"
21+
"time"
2022

2123
. "github.com/onsi/ginkgo/v2"
2224
. "github.com/onsi/gomega"
@@ -47,7 +49,7 @@ func (f *fakeInstaller) InstallOrUpgrade(_ context.Context, _ []byte, _ map[stri
4749
return f.installErr
4850
}
4951

50-
func (f *fakeInstaller) Uninstall(_ context.Context) error {
52+
func (f *fakeInstaller) Uninstall(_ context.Context, _ time.Duration) error {
5153
f.uninstallCalled = true
5254
return f.uninstallErr
5355
}
@@ -464,22 +466,22 @@ var _ = Describe("GpuReconciler", func() {
464466
})
465467

466468
Describe("deletion", func() {
467-
It("calls Helm uninstall and removes the finalizer", func() {
469+
BeforeEach(func() {
468470
newGpu(gpuName)
469471
_, _ = reconciler.Reconcile(ctx, req) // adds finalizer
470472
newGpuNode("gpu-node-del", "g4dn.xlarge", "Garden Linux 1312.3")
471473
DeferCleanup(deleteNode, "gpu-node-del")
472474
_, err := reconciler.Reconcile(ctx, req)
473475
Expect(err).NotTo(HaveOccurred())
474476
Expect(installer.installCalls).To(Equal(1))
477+
})
475478

476-
By("deleting the CR")
479+
It("calls Helm uninstall and removes the finalizer on success", func() {
477480
gpu := &gpuv1beta1.Gpu{}
478481
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: gpuName}, gpu)).To(Succeed())
479482
Expect(k8sClient.Delete(ctx, gpu)).To(Succeed())
480483

481-
By("reconciling the deletion")
482-
_, err = reconciler.Reconcile(ctx, req)
484+
_, err := reconciler.Reconcile(ctx, req)
483485
Expect(err).NotTo(HaveOccurred())
484486
Expect(installer.uninstallCalled).To(BeTrue())
485487

@@ -489,6 +491,88 @@ var _ = Describe("GpuReconciler", func() {
489491
Expect(gpu.Finalizers).NotTo(ContainElement(finalizer))
490492
}
491493
})
494+
495+
It("returns an error and keeps the finalizer on non-timeout Helm failure", func() {
496+
installer.uninstallErr = errors.New("simulated uninstall failure")
497+
498+
gpu := &gpuv1beta1.Gpu{}
499+
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: gpuName}, gpu)).To(Succeed())
500+
Expect(k8sClient.Delete(ctx, gpu)).To(Succeed())
501+
502+
_, err := reconciler.Reconcile(ctx, req)
503+
Expect(err).To(HaveOccurred())
504+
Expect(err.Error()).To(ContainSubstring("helm uninstall"))
505+
506+
// Finalizer must still be present so the CR is not lost.
507+
gpu = &gpuv1beta1.Gpu{}
508+
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: gpuName}, gpu)).To(Succeed())
509+
Expect(gpu.Finalizers).To(ContainElement(finalizer))
510+
})
511+
512+
It("force-removes the finalizer when Helm uninstall times out", func() {
513+
installer.uninstallErr = fmt.Errorf("uninstalling gpu-operator: %w", context.DeadlineExceeded)
514+
515+
ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: gpuOperatorNamespace}}
516+
err := k8sClient.Create(ctx, ns)
517+
if err != nil {
518+
Expect(err.Error()).To(ContainSubstring("already exists"))
519+
}
520+
521+
gpu := &gpuv1beta1.Gpu{}
522+
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: gpuName}, gpu)).To(Succeed())
523+
Expect(k8sClient.Delete(ctx, gpu)).To(Succeed())
524+
525+
_, err = reconciler.Reconcile(ctx, req)
526+
Expect(err).NotTo(HaveOccurred(), "timeout must force-remove finalizer, not block the CR forever")
527+
528+
// Namespace cleanup must be attempted even on timeout.
529+
liveNs := &corev1.Namespace{}
530+
err = k8sClient.Get(ctx, types.NamespacedName{Name: gpuOperatorNamespace}, liveNs)
531+
if err == nil {
532+
Expect(liveNs.DeletionTimestamp).NotTo(BeNil(), "namespace should be terminating even after timeout")
533+
}
534+
535+
gpu = &gpuv1beta1.Gpu{}
536+
err = k8sClient.Get(ctx, types.NamespacedName{Name: gpuName}, gpu)
537+
if err == nil {
538+
Expect(gpu.Finalizers).NotTo(ContainElement(finalizer))
539+
}
540+
})
541+
542+
It("deletes the gpu-operator namespace after successful Helm uninstall", func() {
543+
ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: gpuOperatorNamespace}}
544+
err := k8sClient.Create(ctx, ns)
545+
if err != nil {
546+
// Namespace may already exist (e.g. terminating from a prior test) - that's fine,
547+
// we just need it to be present so deleteNamespace can act on it.
548+
Expect(err.Error()).To(ContainSubstring("already exists"))
549+
}
550+
551+
gpu := &gpuv1beta1.Gpu{}
552+
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: gpuName}, gpu)).To(Succeed())
553+
Expect(k8sClient.Delete(ctx, gpu)).To(Succeed())
554+
555+
_, err = reconciler.Reconcile(ctx, req)
556+
Expect(err).NotTo(HaveOccurred())
557+
Expect(installer.uninstallCalled).To(BeTrue(), "Uninstall must be called before namespace cleanup")
558+
559+
liveNs := &corev1.Namespace{}
560+
err = k8sClient.Get(ctx, types.NamespacedName{Name: gpuOperatorNamespace}, liveNs)
561+
// Either deleted (NotFound) or marked for deletion (DeletionTimestamp set).
562+
if err == nil {
563+
Expect(liveNs.DeletionTimestamp).NotTo(BeNil(), "namespace should be terminating")
564+
}
565+
})
566+
567+
It("ignores NotFound when deleting the gpu-operator namespace", func() {
568+
// Namespace does not exist - deleteNamespace must not return an error.
569+
gpu := &gpuv1beta1.Gpu{}
570+
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: gpuName}, gpu)).To(Succeed())
571+
Expect(k8sClient.Delete(ctx, gpu)).To(Succeed())
572+
573+
_, err := reconciler.Reconcile(ctx, req)
574+
Expect(err).NotTo(HaveOccurred())
575+
})
492576
})
493577
})
494578

internal/helm/installer.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"bytes"
1919
"context"
2020
"fmt"
21+
"time"
2122

2223
"helm.sh/helm/v3/pkg/action"
2324
"helm.sh/helm/v3/pkg/chart"
@@ -71,8 +72,11 @@ func (c *Client) InstallOrUpgrade(ctx context.Context, chartData []byte, values
7172
return upgrade(ctx, cfg, chrt, values)
7273
}
7374

74-
// Uninstall removes the NVIDIA GPU Operator Helm release.
75-
func (c *Client) Uninstall(_ context.Context) error {
75+
// Uninstall removes the NVIDIA GPU Operator Helm release. It waits for pods to
76+
// terminate up to timeout so a rapid reinstall does not race with the old driver unloading.
77+
// ctx cancellation is not honored: Helm's action.Uninstall.Run does not accept a context,
78+
// so the call blocks until the driver pods are gone or u.Timeout expires.
79+
func (c *Client) Uninstall(_ context.Context, timeout time.Duration) error {
7680
cfg, err := c.actionConfig()
7781
if err != nil {
7882
return err
@@ -87,7 +91,8 @@ func (c *Client) Uninstall(_ context.Context) error {
8791
}
8892

8993
u := action.NewUninstall(cfg)
90-
u.Wait = false
94+
u.Wait = true
95+
u.Timeout = timeout
9196
if _, err := u.Run(releaseName); err != nil {
9297
return fmt.Errorf("uninstalling gpu-operator: %w", err)
9398
}

internal/helm/interface.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@ limitations under the License.
1414

1515
package helm
1616

17-
import "context"
17+
import (
18+
"context"
19+
"time"
20+
)
1821

1922
// Installer is the contract the controller uses to drive Helm operations.
2023
// The concrete *Client talks to a real cluster; tests inject a fake.
@@ -23,6 +26,6 @@ type Installer interface {
2326
InstallOrUpgrade(ctx context.Context, chartData []byte, values map[string]any) error
2427

2528
// Uninstall removes the GPU Operator Helm release. It is idempotent: returns nil
26-
// if no release exists.
27-
Uninstall(ctx context.Context) error
29+
// if no release exists. timeout bounds how long the uninstall waits for pods to terminate.
30+
Uninstall(ctx context.Context, timeout time.Duration) error
2831
}

0 commit comments

Comments
 (0)