Skip to content

Commit 142dfdd

Browse files
committed
fix: remove GC-owned/namespace-scoped resources from CleanOrphanedResources
Signed-off-by: Liam Beckman <lbeckman314@gmail.com> Assisted-by: Claude:claude-sonnet-4-5
1 parent 7c134e7 commit 142dfdd

2 files changed

Lines changed: 32 additions & 83 deletions

File tree

compute/kubernetes/backend.go

Lines changed: 14 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ import (
55
"context"
66
"encoding/json"
77
"fmt"
8-
"strings"
98
"time"
109

1110
"dario.cat/mergo"
@@ -596,81 +595,35 @@ func (b *Backend) CleanOrphanedResources(ctx context.Context) {
596595
namespace := b.conf.Kubernetes.JobsNamespace
597596
taskIDs := make(map[string]struct{})
598597

599-
// Collect task IDs from each resource type
600-
if pvcs, err := b.client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{LabelSelector: "app=funnel"}); err == nil {
601-
if err != nil {
602-
b.log.Error("backlog cleanup: listing PVCs", err)
603-
}
604-
for _, r := range pvcs.Items {
605-
if id, ok := r.Labels["taskId"]; ok {
606-
taskIDs[id] = struct{}{}
607-
}
608-
}
609-
}
598+
// Collect task IDs from resources that cleanResources manages directly.
599+
// ConfigMaps, PVCs, Roles, and RoleBindings are now owned by the Job via ownerReferences
600+
// and are garbage-collected by Kubernetes automatically — they are intentionally excluded here.
610601

611-
// PVs
612-
if pvs, err := b.client.CoreV1().PersistentVolumes().List(ctx, metav1.ListOptions{LabelSelector: fmt.Sprintf("app=funnel,namespace=%s", namespace)}); err == nil {
613-
if err != nil {
614-
b.log.Error("backlog cleanup: listing PVs", err)
615-
}
602+
// PVs (cluster-scoped; cannot be owned by a namespaced Job, so must be cleaned explicitly)
603+
pvs, err := b.client.CoreV1().PersistentVolumes().List(ctx, metav1.ListOptions{LabelSelector: fmt.Sprintf("app=funnel,namespace=%s", namespace)})
604+
if err != nil {
605+
b.log.Error("backlog cleanup: listing PVs", err)
606+
} else {
616607
for _, r := range pvs.Items {
617608
if id, ok := r.Labels["taskId"]; ok {
618609
taskIDs[id] = struct{}{}
619610
}
620611
}
621612
}
622613

623-
// ConfigMaps
624-
if cms, err := b.client.CoreV1().ConfigMaps(namespace).List(ctx, metav1.ListOptions{LabelSelector: "app=funnel"}); err == nil {
625-
if err != nil {
626-
b.log.Error("backlog cleanup: listing ConfigMaps", err)
627-
}
628-
const cmPrefix = "funnel-worker-config-"
629-
for _, r := range cms.Items {
630-
if id, ok := r.Labels["taskId"]; ok {
631-
taskIDs[id] = struct{}{}
632-
} else if strings.HasPrefix(r.Name, cmPrefix) {
633-
taskIDs[strings.TrimPrefix(r.Name, cmPrefix)] = struct{}{}
634-
}
635-
}
636-
}
637-
638-
// ServiceAccounts
639-
if sas, err := b.client.CoreV1().ServiceAccounts(namespace).List(ctx, metav1.ListOptions{LabelSelector: "app=funnel"}); err == nil {
640-
if err != nil {
641-
b.log.Error("backlog cleanup: listing ServiceAccounts", err)
642-
}
614+
// ServiceAccounts (shared SAs are not owned by a Job; task-scoped SAs may also be orphaned
615+
// if they were created before ownerRef support was added)
616+
sas, err := b.client.CoreV1().ServiceAccounts(namespace).List(ctx, metav1.ListOptions{LabelSelector: "app=funnel"})
617+
if err != nil {
618+
b.log.Error("backlog cleanup: listing ServiceAccounts", err)
619+
} else {
643620
for _, r := range sas.Items {
644621
if id, ok := r.Labels["taskId"]; ok {
645622
taskIDs[id] = struct{}{}
646623
}
647624
}
648625
}
649626

650-
// Roles
651-
if roles, err := b.client.RbacV1().Roles(namespace).List(ctx, metav1.ListOptions{LabelSelector: "app=funnel"}); err == nil {
652-
if err != nil {
653-
b.log.Error("backlog cleanup: listing Roles", err)
654-
}
655-
for _, r := range roles.Items {
656-
if id, ok := r.Labels["taskId"]; ok {
657-
taskIDs[id] = struct{}{}
658-
}
659-
}
660-
}
661-
662-
// RoleBindings
663-
if rbs, err := b.client.RbacV1().RoleBindings(namespace).List(ctx, metav1.ListOptions{LabelSelector: "app=funnel"}); err == nil {
664-
if err != nil {
665-
b.log.Error("backlog cleanup: listing RoleBindings", err)
666-
}
667-
for _, r := range rbs.Items {
668-
if id, ok := r.Labels["taskId"]; ok {
669-
taskIDs[id] = struct{}{}
670-
}
671-
}
672-
}
673-
674627
// TODO: Add Executor Jobs here beacause orphaned tasks can result in orphaned jobs
675628

676629
for taskID := range taskIDs {

compute/kubernetes/backend_test.go

Lines changed: 18 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -86,14 +86,6 @@ func TestTaskSubmission(t *testing.T) {
8686
// Create a fake Kubernetes client
8787
fakeClient := fake.NewSimpleClientset()
8888

89-
// Inject a deterministic UID on every Job create so ownerRef propagation can be verified.
90-
const testJobUID = "test-job-uid-1234"
91-
fakeClient.PrependReactor("create", "jobs", func(action k8stesting.Action) (bool, runtime.Object, error) {
92-
obj := action.(k8stesting.CreateAction).GetObject().(*batchv1.Job)
93-
obj.UID = testJobUID
94-
return false, obj, nil
95-
})
96-
9789
// Create a mock configuration
9890
conf := config.DefaultConfig()
9991
conf.Kubernetes.Namespace = "test-namespace"
@@ -163,17 +155,20 @@ spec:
163155
t.Errorf("expected Job name '%s', got '%s'", task.Id, job.Name)
164156
}
165157

166-
// Verify that the ConfigMap was created with the Job's UID in its ownerRef.
167-
configMapName := "funnel-worker-config-" + task.Id
168-
cm, err := fakeClient.CoreV1().ConfigMaps(conf.Kubernetes.JobsNamespace).Get(context.Background(), configMapName, metav1.GetOptions{})
158+
// Seed a PV so we can verify cleanResources deletes it.
159+
pvName := "funnel-worker-pv-" + task.Id
160+
_, err = fakeClient.CoreV1().PersistentVolumes().Create(context.Background(), &corev1.PersistentVolume{
161+
ObjectMeta: metav1.ObjectMeta{
162+
Name: pvName,
163+
Labels: map[string]string{
164+
"app": "funnel",
165+
"taskId": task.Id,
166+
"namespace": conf.Kubernetes.JobsNamespace,
167+
},
168+
},
169+
}, metav1.CreateOptions{})
169170
if err != nil {
170-
t.Fatalf("failed to get ConfigMap: %v", err)
171-
}
172-
if len(cm.OwnerReferences) == 0 {
173-
t.Fatal("expected ConfigMap to have an ownerReference, but got none")
174-
}
175-
if got := cm.OwnerReferences[0].UID; got != testJobUID {
176-
t.Errorf("expected ConfigMap ownerRef UID %q, got %q", testJobUID, got)
171+
t.Fatalf("failed to create test PV: %v", err)
177172
}
178173

179174
// Clean up resources
@@ -188,10 +183,11 @@ spec:
188183
t.Error("expected Job to be deleted, but it still exists")
189184
}
190185

191-
// ConfigMap deletion is handled by Kubernetes garbage collection via ownerReferences,
192-
// not explicitly by cleanResources. The fake clientset does not simulate cascading GC,
193-
// so we only verify the ownerRef is set correctly (asserted above).
194-
186+
// Verify that the PV was deleted
187+
_, err = fakeClient.CoreV1().PersistentVolumes().Get(context.Background(), pvName, metav1.GetOptions{})
188+
if err == nil {
189+
t.Error("expected PV to be deleted, but it still exists")
190+
}
195191
}
196192

197193
func TestSubmit_AppliesNodeSelectorAndTolerationsToWorkerJob(t *testing.T) {

0 commit comments

Comments
 (0)