Skip to content

Commit 6286073

Browse files
fix(redis): quiet skip on Regrade when namespace is missing (#36)
The reconciler iterates every active `resources` row of type=redis and calls provisioner Regrade per row. If a tenant namespace was force-deleted or a cluster wipe raced ahead of platform-DB cleanup, the namespace is gone — but the resource row is still status='active' so the reconciler keeps trying every ~5min tick. Pre-fix path: Secrets.Get → IsNotFound (b/c namespace missing) → exec fallback → Pods.List → empty → WARN `no Redis pod found — soft skip`. Twice per orphan per tick, indistinguishable in logs from a real legacy pod missing its Secret. Verified live in prod (commit eeb33ab) on 2026-05-30: two orphaned namespaces emitting ~576 WARN/day combined, drowning real signals (`maxmemory_policy CONFIG REWRITE failed`, `non-integer maxmemory`). Fix: at Regrade entry, do a single Namespaces.Get. If IsNotFound, return SkipReason="namespace not found (resource orphaned)" with an INFO log (distinct from exec-fallback WARNs so operators can differentiate). Saves the Secrets.Get + Pods.List round-trip per orphan per tick. Tests: * TestRegrade_NamespaceMissing_QuietSkip — new orphan path: zero exec calls, exact SkipReason match. * TestRegrade_NamespaceLookupTransientError_PropagatesError — non- IsNotFound errors from namespace lookup MUST propagate so the reconciler retries (paid customers' regrades not silently dropped). * TestRegrade_NoPodsAtAll_SoftSkip rewritten: now asserts namespace EXISTS but is pod-less (the genuine soft-skip case) and that the SkipReason is NOT the orphan sentinel. * Updated all 7 existing Regrade tests in coverage_test.go + k8s_test.go that previously relied on the empty-cluster path to add the corev1.Namespace fixture (via a new nsObject helper). Co-authored-by: Manas Srivastava <[email protected]> Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent eeb33ab commit 6286073

3 files changed

Lines changed: 140 additions & 13 deletions

File tree

internal/backend/redis/coverage_test.go

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1037,6 +1037,11 @@ func TestK8sBackend_Regrade_NoServiceSoftSkip(t *testing.T) {
10371037
b := newFakeK8sBackend(t)
10381038
ctx := context.Background()
10391039
const ns = "no-svc-regrade-ns"
1040+
// Namespace present (pre-orphan-check fix requires this) + secret present
1041+
// + service absent → soft-skip with the legacy-resource SkipReason.
1042+
_, _ = b.cs.CoreV1().Namespaces().Create(ctx, &corev1.Namespace{
1043+
ObjectMeta: metav1.ObjectMeta{Name: ns},
1044+
}, metav1.CreateOptions{})
10401045
_, _ = b.cs.CoreV1().Secrets(ns).Create(ctx, &corev1.Secret{
10411046
ObjectMeta: metav1.ObjectMeta{Name: "redis-auth", Namespace: ns},
10421047
Data: map[string][]byte{"REDIS_PASSWORD": []byte("x")},
@@ -1054,8 +1059,9 @@ func TestK8sBackend_Regrade_NoServiceSoftSkip(t *testing.T) {
10541059
}
10551060

10561061
// TestK8sBackend_Regrade_DefaultProviderResourceID — when providerResourceID is
1057-
// empty, the namespace is derived from the token. Secret missing → exec path
1058-
// (no pods → soft-skip), but the namespace lookup still happens.
1062+
// empty, the namespace is derived from the token (redisK8sNsPrefix+token). With
1063+
// an empty cluster the derived namespace is missing → orphan short-circuit fires
1064+
// → quiet skip with SkipReason="namespace not found (resource orphaned)".
10591065
func TestK8sBackend_Regrade_DefaultProviderResourceID(t *testing.T) {
10601066
b := newFakeK8sBackend(t)
10611067
result, err := b.Regrade(context.Background(), "tok-derive", "", 50)
@@ -1065,6 +1071,9 @@ func TestK8sBackend_Regrade_DefaultProviderResourceID(t *testing.T) {
10651071
if result.Applied {
10661072
t.Error("Applied should be false for empty cluster")
10671073
}
1074+
if !strings.Contains(result.SkipReason, "namespace not found") {
1075+
t.Errorf("SkipReason = %q; want 'namespace not found ...' (derived ns missing → orphan path)", result.SkipReason)
1076+
}
10681077
}
10691078

10701079
// TestK8sBackend_StorageBytes_DefaultNamespace exercises the empty-PRID branch
@@ -1263,6 +1272,9 @@ func TestK8sBackend_Regrade_DirectPath_HappyAndIdempotent(t *testing.T) {
12631272

12641273
b := newFakeK8sBackend(t)
12651274
const ns = "regrade-real-ns"
1275+
_, _ = b.cs.CoreV1().Namespaces().Create(ctx, &corev1.Namespace{
1276+
ObjectMeta: metav1.ObjectMeta{Name: ns},
1277+
}, metav1.CreateOptions{})
12661278
_, _ = b.cs.CoreV1().Secrets(ns).Create(ctx, &corev1.Secret{
12671279
ObjectMeta: metav1.ObjectMeta{Name: "redis-auth", Namespace: ns},
12681280
Data: map[string][]byte{"REDIS_PASSWORD": []byte("")},
@@ -1307,6 +1319,9 @@ func TestK8sBackend_Regrade_DirectPath_Unlimited(t *testing.T) {
13071319

13081320
b := newFakeK8sBackend(t)
13091321
const ns = "regrade-unlim-ns"
1322+
_, _ = b.cs.CoreV1().Namespaces().Create(ctx, &corev1.Namespace{
1323+
ObjectMeta: metav1.ObjectMeta{Name: ns},
1324+
}, metav1.CreateOptions{})
13101325
_, _ = b.cs.CoreV1().Secrets(ns).Create(ctx, &corev1.Secret{
13111326
ObjectMeta: metav1.ObjectMeta{Name: "redis-auth", Namespace: ns},
13121327
Data: map[string][]byte{"REDIS_PASSWORD": []byte("")},
@@ -1516,14 +1531,16 @@ func TestK8sBackend_StorageBytes_GetSecretError(t *testing.T) {
15161531
}
15171532

15181533
// TestK8sBackend_Regrade_GetSecretError covers the non-NotFound secret error
1519-
// branch on Regrade.
1534+
// branch on Regrade. The namespace fixture is required so the new orphan
1535+
// short-circuit does not fire before the secret reactor runs.
15201536
func TestK8sBackend_Regrade_GetSecretError(t *testing.T) {
1521-
cs := fake.NewClientset()
1537+
const ns = "ns"
1538+
cs := fake.NewClientset(&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: ns}})
15221539
cs.PrependReactor("get", "secrets", func(_ k8stesting.Action) (bool, runtime.Object, error) {
15231540
return true, nil, fmt.Errorf("apiserver unavailable")
15241541
})
15251542
b := &K8sBackend{cs: cs}
1526-
_, err := b.Regrade(context.Background(), "tok", "ns", 50)
1543+
_, err := b.Regrade(context.Background(), "tok", ns, 50)
15271544
if err == nil {
15281545
t.Error("Regrade should propagate non-NotFound secret errors")
15291546
}
@@ -1532,8 +1549,8 @@ func TestK8sBackend_Regrade_GetSecretError(t *testing.T) {
15321549
// TestK8sBackend_Regrade_GetServiceError covers the non-NotFound service error
15331550
// branch on Regrade.
15341551
func TestK8sBackend_Regrade_GetServiceError(t *testing.T) {
1535-
cs := fake.NewClientset()
15361552
const ns = "svcerr-ns"
1553+
cs := fake.NewClientset(&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: ns}})
15371554
_, _ = cs.CoreV1().Secrets(ns).Create(context.Background(), &corev1.Secret{
15381555
ObjectMeta: metav1.ObjectMeta{Name: "redis-auth", Namespace: ns},
15391556
Data: map[string][]byte{"REDIS_PASSWORD": []byte("x")},
@@ -1858,6 +1875,9 @@ func TestK8sBackend_Regrade_ConfigGetError(t *testing.T) {
18581875
b := newFakeK8sBackend(t)
18591876
ctx := context.Background()
18601877
const ns = "cgerr-ns"
1878+
_, _ = b.cs.CoreV1().Namespaces().Create(ctx, &corev1.Namespace{
1879+
ObjectMeta: metav1.ObjectMeta{Name: ns},
1880+
}, metav1.CreateOptions{})
18611881
_, _ = b.cs.CoreV1().Secrets(ns).Create(ctx, &corev1.Secret{
18621882
ObjectMeta: metav1.ObjectMeta{Name: "redis-auth", Namespace: ns},
18631883
Data: map[string][]byte{"REDIS_PASSWORD": []byte("x")},

internal/backend/redis/k8s.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -623,6 +623,30 @@ func (b *K8sBackend) Regrade(ctx context.Context, token, providerResourceID stri
623623
ns = redisK8sNsPrefix + token
624624
}
625625

626+
// ── Orphaned-resource short-circuit ───────────────────────────────────────
627+
//
628+
// If the namespace itself is gone, the resource row is orphaned: deprovision
629+
// raced ahead of platform-DB cleanup, the namespace was force-deleted, or a
630+
// test cluster was wiped. Without this guard the reconciler hits the
631+
// Secrets.Get → IsNotFound → exec-fallback → Pods.List → empty path on every
632+
// sweep forever, logging WARN twice per orphan per ~5min tick (verified
633+
// 2026-05-30 in prod: 2 orphaned namespaces emitting 576 WARN/day combined).
634+
//
635+
// Quiet skip: log at INFO (one line per tick is acceptable; debug-only would
636+
// hide a real cluster-wide namespace outage) and return a distinct SkipReason
637+
// so operators can differentiate orphaned rows ("namespace not found") from
638+
// genuine legacy-pod drift ("exec fallback: no pod found"). Also saves two
639+
// kube-apiserver round-trips (Secrets.Get + Pods.List) per orphaned row.
640+
if _, err := b.cs.CoreV1().Namespaces().Get(ctx, ns, metav1.GetOptions{}); err != nil {
641+
if k8serrors.IsNotFound(err) {
642+
slog.Info("k8s.redis.Regrade: namespace not found — orphaned resource, skip",
643+
"namespace", ns, "token", token)
644+
return RegradeResult{Applied: false, SkipReason: "namespace not found (resource orphaned)"}, nil
645+
}
646+
// Other errors (permission, transport): surface so the caller can retry.
647+
return RegradeResult{Applied: false}, fmt.Errorf("k8s redis.Regrade: get namespace: %w", err)
648+
}
649+
626650
// ── Secret-based path (modern resources) ──────────────────────────────────
627651
//
628652
// Modern resources (provisioned after the redis-auth Secret convention) store

internal/backend/redis/k8s_test.go

Lines changed: 90 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ import (
4545
"k8s.io/apimachinery/pkg/runtime"
4646
"k8s.io/apimachinery/pkg/runtime/schema"
4747
"k8s.io/client-go/kubernetes/fake"
48+
ktesting "k8s.io/client-go/testing"
4849
)
4950

5051
// ─── Tier sizing regression guards ──────────────────────────────────────────
@@ -489,6 +490,14 @@ func TestExecCommandConstruction_AlreadyCorrect(t *testing.T) {
489490

490491
// ─── Integration-style tests: Regrade routing ────────────────────────────────
491492

493+
// nsObject returns a minimal corev1.Namespace fixture for use with fake.NewClientset.
494+
// Required after the orphaned-resource short-circuit (k8s.go) — Regrade now returns
495+
// SkipReason="namespace not found (resource orphaned)" if the namespace is absent,
496+
// so every Regrade test that exercises the existing code paths must include it.
497+
func nsObject(name string) *corev1.Namespace {
498+
return &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: name}}
499+
}
500+
492501
// TestRegrade_SecretPresent_DoesNotUseExec asserts that when the redis-auth
493502
// Secret IS present, Regrade uses the direct-connection path and never calls
494503
// the exec fallback.
@@ -512,7 +521,7 @@ func TestRegrade_SecretPresent_DoesNotUseExec(t *testing.T) {
512521
ObjectMeta: metav1.ObjectMeta{Name: "redis", Namespace: ns},
513522
Spec: corev1.ServiceSpec{ClusterIP: "127.0.0.1"},
514523
}
515-
cs := fake.NewClientset(secret, svc)
524+
cs := fake.NewClientset(nsObject(ns), secret, svc)
516525

517526
fe := &fakeExecor{}
518527
b := &K8sBackend{cs: cs, execor: fe}
@@ -545,7 +554,7 @@ func TestRegrade_SecretAbsent_UsesExecPath(t *testing.T) {
545554
// No secret — secret lookup returns NotFound.
546555
// Pod is present and Running so exec can proceed.
547556
pod := runningPod(ns, "redis-aaa", "redis")
548-
cs := fake.NewClientset(pod)
557+
cs := fake.NewClientset(nsObject(ns), pod)
549558

550559
fe := &fakeExecor{
551560
responses: []fakeExecResponse{
@@ -596,7 +605,7 @@ func TestRegrade_ExecFails_SoftSkip(t *testing.T) {
596605
const token = "tok"
597606

598607
pod := runningPod(ns, "redis-aaa", "redis")
599-
cs := fake.NewClientset(pod)
608+
cs := fake.NewClientset(nsObject(ns), pod)
600609

601610
fe := &fakeExecor{
602611
responses: []fakeExecResponse{
@@ -633,7 +642,7 @@ func TestRegrade_NoPodRunning_SoftSkip(t *testing.T) {
633642
},
634643
Status: corev1.PodStatus{Phase: corev1.PodPending},
635644
}
636-
cs := fake.NewClientset(pod)
645+
cs := fake.NewClientset(nsObject(ns), pod)
637646

638647
fe := &fakeExecor{}
639648
b := &K8sBackend{cs: cs, execor: fe}
@@ -650,13 +659,15 @@ func TestRegrade_NoPodRunning_SoftSkip(t *testing.T) {
650659
}
651660
}
652661

653-
// TestRegrade_NoPodsAtAll_SoftSkip asserts that when the namespace has no pods
654-
// at all, the exec fallback returns a soft-skip.
662+
// TestRegrade_NoPodsAtAll_SoftSkip asserts that when the namespace EXISTS but has
663+
// no pods at all (e.g. a legacy pod was deleted but the namespace lingers), the
664+
// exec fallback returns a soft-skip.
655665
func TestRegrade_NoPodsAtAll_SoftSkip(t *testing.T) {
656666
const ns = "instant-customer-tok"
657667
const token = "tok"
658668

659-
cs := fake.NewClientset() // empty cluster
669+
// Namespace exists, but no secret + no pods → exec fallback → no pod found.
670+
cs := fake.NewClientset(nsObject(ns))
660671

661672
fe := &fakeExecor{}
662673
b := &K8sBackend{cs: cs, execor: fe}
@@ -668,6 +679,78 @@ func TestRegrade_NoPodsAtAll_SoftSkip(t *testing.T) {
668679
if result.Applied {
669680
t.Errorf("Applied should be false when namespace has no pods")
670681
}
682+
// Distinct skip reason: not the orphan-namespace path.
683+
if result.SkipReason == "namespace not found (resource orphaned)" {
684+
t.Errorf("SkipReason should be exec-fallback reason, not orphan-ns reason; got %q", result.SkipReason)
685+
}
686+
}
687+
688+
// TestRegrade_NamespaceMissing_QuietSkip asserts the orphaned-resource short-circuit:
689+
// when the namespace is gone (deprovisioned tenant, force-deleted ns, wiped cluster),
690+
// Regrade returns a quiet skip with the orphan SkipReason and makes NO Secrets.Get
691+
// or Pods.List call — verified by checking the fakeExecor.calls count stays zero AND
692+
// the SkipReason exactly matches the orphan sentinel string.
693+
//
694+
// Regression guard: prevents the WARN-spam-per-tick behaviour observed in prod on
695+
// 2026-05-30 (2 orphaned namespaces emitting ~576 WARN/day combined).
696+
func TestRegrade_NamespaceMissing_QuietSkip(t *testing.T) {
697+
const ns = "instant-customer-gone"
698+
const token = "gone"
699+
700+
// Empty cluster — no namespace exists. The pre-fix code would have
701+
// hit Secrets.Get → IsNotFound → exec-fallback → Pods.List → WARN.
702+
cs := fake.NewClientset()
703+
704+
fe := &fakeExecor{}
705+
b := &K8sBackend{cs: cs, execor: fe}
706+
707+
result, err := b.Regrade(context.Background(), token, ns, 50)
708+
if err != nil {
709+
t.Fatalf("Regrade must not propagate errors for missing namespace; got: %v", err)
710+
}
711+
if result.Applied {
712+
t.Errorf("Applied should be false when namespace is missing")
713+
}
714+
const wantSkip = "namespace not found (resource orphaned)"
715+
if result.SkipReason != wantSkip {
716+
t.Errorf("SkipReason = %q; want %q (distinct from exec-fallback reasons so operators can differentiate orphans from real legacy drift)",
717+
result.SkipReason, wantSkip)
718+
}
719+
// Critical: the exec path MUST NOT be entered for orphans. That was the
720+
// pre-fix bug — fanout to Pods.List per orphan per tick.
721+
if len(fe.calls) != 0 {
722+
t.Errorf("fakeExecor must not be called when namespace is missing; got %d calls", len(fe.calls))
723+
}
724+
}
725+
726+
// TestRegrade_NamespaceLookupTransientError_PropagatesError asserts that a
727+
// non-IsNotFound error from the namespace lookup (e.g. permission denied,
728+
// apiserver transport failure) is surfaced to the caller — NOT swallowed as a
729+
// silent skip. The reconciler needs the error to schedule a retry; treating it
730+
// as "orphaned" would cause skipped grades the customer pays for.
731+
func TestRegrade_NamespaceLookupTransientError_PropagatesError(t *testing.T) {
732+
const ns = "instant-customer-tok"
733+
const token = "tok"
734+
735+
cs := fake.NewClientset(nsObject(ns))
736+
// Inject a transient error on the namespace GET reactor.
737+
cs.PrependReactor("get", "namespaces", func(_ ktesting.Action) (bool, runtime.Object, error) {
738+
return true, nil, fmt.Errorf("etcd: leader changed")
739+
})
740+
741+
fe := &fakeExecor{}
742+
b := &K8sBackend{cs: cs, execor: fe}
743+
744+
result, err := b.Regrade(context.Background(), token, ns, 50)
745+
if err == nil {
746+
t.Fatalf("Regrade must propagate non-IsNotFound errors so the reconciler retries; got nil error, result=%+v", result)
747+
}
748+
if result.Applied {
749+
t.Errorf("Applied should be false on namespace lookup error")
750+
}
751+
if len(fe.calls) != 0 {
752+
t.Errorf("fakeExecor must not be called on namespace lookup error; got %d calls", len(fe.calls))
753+
}
671754
}
672755

673756
// TestRegrade_TeamTier_Unlimited verifies that the team tier (unlimited, targetMB=-1)

0 commit comments

Comments
 (0)