Skip to content

Commit 7d2ff0d

Browse files
feat(jobs): orphan_sweep PASS 3 enhanced reasons + PASS 6 stuck-build
PROBLEM. The prod orphan instant-deploy-04dc0b31 (2026-05-14) sat in ImagePullBackOff for 9h+. PASS 3 left it alone because the deployments row was status='deploying', not 'deleted', and its team was active. deploy_status_reconcile flips a row to 'failed' only when k8s reports DeploymentReplicaFailure=True — a stuck pod never trips that. PASS 3 ENHANCEMENTS - Per-namespace reason labels (team_tombstoned, no_db_row, failed_old_deployment) drive new Prometheus metric instant_orphan_sweep_reaped_total{reason}. - no_db_row now applies a 1h grace via GetNamespaceAge to avoid racing with in-flight provisions. - failed_old_deployment reaps instant-deploy-* whose row is status='failed' AND created_at > 6h ago (autopsy stays in deployment_events). - Proposed-reap structured log lands BEFORE every delete with full evidence (constraint #3: operator must see what is about to happen). PASS 6 (NEW) — STUCK-BUILD DETECTION Catches deployments stuck in 'building'/'deploying' for >30min whose only pod is in ImagePullBackOff/ErrImagePull/CrashLoopBackOff. Flips the row to 'failed' + sets error_message. The autopsy is captured by the next deploy_status_reconcile tick (one source of truth). SAFETY - Whitelist on the three prefixes (instant-deploy-*, instant-customer-*, instant-stack-*) enforced by the ListNamespacesWithPrefix seam. - All grace thresholds (1h/6h/30min) conservative on purpose. - Per-namespace fail-open posture matches the existing passes. - Pure classifier extracted (classifyDeployOrphan) for direct table-driven testing. TESTS - TestOrphanSweep_NamespaceWithoutDBRow_ReapsAfterGrace - TestOrphanSweep_FailedDeployment_ReapedAfter6h - TestOrphanSweep_Pass6_StuckBuild_FlipsToFailed - TestOrphanSweep_Pass6_RunningPod_DoesNotFlip - TestOrphanSweep_PrefixWhitelist_RefusesUnknownNamespace - TestOrphanSweep_ClassifyDeployOrphan_TableDriven (10 row shapes) - TestOrphanSweep_StuckBuildWaitingReasons_Registry (registry-iterating per CLAUDE.md rule 18) METRICS - instant_orphan_sweep_reaped_total{reason} — PASS 3/4/5/6 reaps - instant_orphan_sweep_reap_failed_total{reason} — k8s/DB failures Companion infra PR adds the Prom alerts (no_db_row > 0 over 1h → P0). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent b3f093c commit 7d2ff0d

5 files changed

Lines changed: 1052 additions & 53 deletions

File tree

internal/jobs/k8s_namespace_client.go

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@ package jobs
3232
import (
3333
"context"
3434
"fmt"
35+
"time"
3536

37+
corev1 "k8s.io/api/core/v1"
3638
apierrors "k8s.io/apimachinery/pkg/api/errors"
3739
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3840
"k8s.io/client-go/kubernetes"
@@ -138,3 +140,85 @@ func (c *k8sNamespaceClient) listNamespacesWithPrefix(ctx context.Context, prefi
138140
}
139141
return out, nil
140142
}
143+
144+
// GetNamespaceAge returns time.Since(namespace.CreationTimestamp). NotFound
145+
// maps to (0, nil) — the orphan_sweep PASS 3 caller treats that as "the
146+
// namespace was reaped by another path; nothing to do this tick".
147+
func (c *k8sNamespaceClient) GetNamespaceAge(ctx context.Context, namespace string) (time.Duration, error) {
148+
ns, err := c.cs.CoreV1().Namespaces().Get(ctx, namespace, metav1.GetOptions{})
149+
if err != nil {
150+
if apierrors.IsNotFound(err) {
151+
return 0, nil
152+
}
153+
return 0, fmt.Errorf("k8sNamespaceClient.GetNamespaceAge %q: %w", namespace, err)
154+
}
155+
created := ns.CreationTimestamp.Time
156+
if created.IsZero() {
157+
// A namespace with no CreationTimestamp is anomalous — be
158+
// conservative and report it as freshly created so the
159+
// no_db_row reap path skips it this tick.
160+
return 0, nil
161+
}
162+
return time.Since(created), nil
163+
}
164+
165+
// k8sPodStateClient is the production PodStateProvider used by PASS 6.
166+
// Wraps the same kubernetes.Clientset used by k8sNamespaceClient — both
167+
// are constructed in StartWorkers from a single newDeployK8sClientset()
168+
// call so they share a TCP connection pool to the k8s API.
169+
type k8sPodStateClient struct {
170+
cs *kubernetes.Clientset
171+
}
172+
173+
// NewK8sPodStateClient builds the PASS 6 pod-state seam. Returns (nil, err)
174+
// when no cluster is reachable — caller passes nil to
175+
// (*OrphanSweepReconciler).WithPodStateProvider and PASS 6 stays disabled.
176+
func NewK8sPodStateClient() (PodStateProvider, error) {
177+
cs, err := newDeployK8sClientset()
178+
if err != nil {
179+
return nil, err
180+
}
181+
return &k8sPodStateClient{cs: cs}, nil
182+
}
183+
184+
// ListPodWaitingReasons returns the waiting-state reason of every pod's
185+
// primary container in `namespace`. A pod whose primary container is NOT
186+
// in Waiting (Running, ContainerCreating that has progressed, Terminated)
187+
// contributes "" to the slice — the PASS 6 caller treats any "" as
188+
// "build is progressing; leave alone".
189+
//
190+
// A NotFound on the namespace yields (nil, nil) — the namespace was reaped
191+
// by another path before PASS 6 could check it.
192+
func (c *k8sPodStateClient) ListPodWaitingReasons(ctx context.Context, namespace string) ([]string, error) {
193+
list, err := c.cs.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{})
194+
if err != nil {
195+
if apierrors.IsNotFound(err) {
196+
return nil, nil
197+
}
198+
return nil, fmt.Errorf("k8sPodStateClient.ListPodWaitingReasons %q: %w", namespace, err)
199+
}
200+
out := make([]string, 0, len(list.Items))
201+
for i := range list.Items {
202+
pod := &list.Items[i]
203+
out = append(out, podPrimaryContainerWaitingReason(pod))
204+
}
205+
return out, nil
206+
}
207+
208+
// podPrimaryContainerWaitingReason returns the Waiting.Reason of the
209+
// pod's first container, or "" when not in Waiting state. Returning the
210+
// first container's state is sufficient for PASS 6: instant-deploy-*
211+
// pods are single-container by construction (the api's k8s provider
212+
// creates exactly one app container per Deployment).
213+
func podPrimaryContainerWaitingReason(pod *corev1.Pod) string {
214+
if len(pod.Status.ContainerStatuses) == 0 {
215+
// Pod hasn't reached the container status step yet (Pending +
216+
// scheduling). Report empty — caller treats as progressing.
217+
return ""
218+
}
219+
cs := pod.Status.ContainerStatuses[0]
220+
if cs.State.Waiting != nil {
221+
return cs.State.Waiting.Reason
222+
}
223+
return ""
224+
}

0 commit comments

Comments
 (0)