Skip to content

Commit 4eb1207

Browse files
fix(deploy): mark broken-image deploys failed on ProgressDeadlineExceeded (#101)
A deploy whose built image cannot start (CreateContainerError "no command specified" from an empty image, ImagePullBackOff, CrashLoopBackOff) was reported "deploying" forever: deploymentStatusFromK8s only checked DeploymentReplicaFailure + replica counts, so a created-but-unstartable pod (UnavailableReplicas>0) mapped to "deploying". The failure-autopsy is gated on newStatus==failed, so it never fired — no autopsy event, no deploy.failed audit, no failure email. This is the runtime twin of the build-Job-failed override.

- deploymentStatusFromK8s: Progressing=False/ProgressDeadlineExceeded with no available replica -> failed (checked after the healthy branch so a partially-failed redeploy whose old ReplicaSet still serves stays healthy). Kept in sync with the api's deploymentStatus.
- extractPodFailure: classify CreateContainerError / CreateContainerConfigError / RunContainerError -> new StartFailed reason (precise hint instead of Unknown).
- new metric instant_deploy_runtime_failed_detected_total{reason} (twin of instant_deploy_job_failed_detected_total); alert + dashboard tile + catalog row land in the infra repo (rule 25).

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent ecd26c5 commit 4eb1207

7 files changed

Lines changed: 267 additions & 1 deletion

internal/jobs/deploy_failure_autopsy.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ const (
121121
workerFailureReasonCrashLoopBackOff = "CrashLoopBackOff"
122122
workerFailureReasonBuildFailed = "BuildFailed"
123123
workerFailureReasonDeadlineExceeded = "DeadlineExceeded"
124+
workerFailureReasonStartFailed = "StartFailed"
124125
workerFailureReasonError = "Error"
125126
workerFailureReasonUnknown = "Unknown"
126127
)
@@ -158,6 +159,11 @@ var workerFailureHint = map[string]string{
158159
"Large base images or slow package installs can cause this. " +
159160
"Try a smaller base image (e.g. alpine) and pre-install dependencies in the Dockerfile.",
160161

162+
workerFailureReasonStartFailed: "Kubernetes created your app's pod but the container could not start. " +
163+
"The most common cause is a built image with no CMD/ENTRYPOINT (nothing to run) " +
164+
"or an invalid container configuration. Make sure your Dockerfile ends with a " +
165+
"CMD or ENTRYPOINT instruction, then re-deploy.",
166+
161167
workerFailureReasonError: "A Kubernetes replica failure was detected. " +
162168
"This is often a transient scheduling or resource constraint. " +
163169
"Re-deploy to retry; if it persists, check your Dockerfile for correct CMD/ENTRYPOINT.",
@@ -708,6 +714,14 @@ func extractPodFailure(pod *corev1.Pod, result *autopsyResult) {
708714
result.event = fmt.Sprintf("ImagePullBackOff: %s", w.Message)
709715
case "CrashLoopBackOff":
710716
result.reason = workerFailureReasonCrashLoopBackOff
717+
case "CreateContainerError", "CreateContainerConfigError", "RunContainerError":
718+
// The pod was created but its container can't start — modal
719+
// cause is a built image with no CMD/ENTRYPOINT ("no command
720+
// specified") or an invalid container config. The container
721+
// never runs, so there are no app logs; the waiting Message is
722+
// the most useful diagnostic we can surface.
723+
result.reason = workerFailureReasonStartFailed
724+
result.event = fmt.Sprintf("%s: %s", w.Reason, w.Message)
711725
}
712726
}
713727
// lastState gives us the terminated exit code even for CrashLoopBackOff.

internal/jobs/deploy_failure_autopsy_test.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ var workerKnownReasons = []string{
3939
workerFailureReasonCrashLoopBackOff,
4040
workerFailureReasonBuildFailed,
4141
workerFailureReasonDeadlineExceeded,
42+
workerFailureReasonStartFailed,
4243
workerFailureReasonError,
4344
workerFailureReasonUnknown,
4445
}
@@ -193,6 +194,28 @@ func TestExtractPodFailure_ImagePullBackOff(t *testing.T) {
193194
}
194195
}
195196

197+
// TestExtractPodFailure_StartFailed covers the broken-image runtime case: the
198+
// pod is created but the container can't start (CreateContainerError "no command
199+
// specified" from a 474-byte empty image, CreateContainerConfigError, or
200+
// RunContainerError). The reason must classify as StartFailed and the waiting
201+
// message must be surfaced in event (the only diagnostic — there are no logs).
202+
func TestExtractPodFailure_StartFailed(t *testing.T) {
203+
for _, waitReason := range []string{"CreateContainerError", "CreateContainerConfigError", "RunContainerError"} {
204+
t.Run(waitReason, func(t *testing.T) {
205+
pod := buildPodWithWaiting(waitReason, "failed to generate spec: no command specified")
206+
result := &autopsyResult{reason: workerFailureReasonUnknown}
207+
extractPodFailure(pod, result)
208+
209+
if result.reason != workerFailureReasonStartFailed {
210+
t.Errorf("reason = %q, want StartFailed", result.reason)
211+
}
212+
if result.event == "" {
213+
t.Error("expected non-empty event carrying the waiting message")
214+
}
215+
})
216+
}
217+
}
218+
196219
func TestExtractPodFailure_Evicted(t *testing.T) {
197220
pod := &corev1.Pod{
198221
ObjectMeta: metav1.ObjectMeta{Name: "evicted-pod"},

internal/jobs/deploy_lifecycle_coverage_test.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,54 @@ func TestDeploymentStatusFromK8s_Matrix(t *testing.T) {
214214
d: &appsv1.Deployment{Status: appsv1.DeploymentStatus{UnavailableReplicas: 1}},
215215
want: deployStatusDeploying,
216216
},
217+
{
218+
// Broken-image runtime silent-failure: progress deadline exceeded,
219+
// no available replica → failed (pre-fix this was "deploying" forever).
220+
name: "ProgressDeadlineExceeded with zero available is failed",
221+
d: &appsv1.Deployment{
222+
Status: appsv1.DeploymentStatus{
223+
UnavailableReplicas: 1,
224+
Conditions: []appsv1.DeploymentCondition{{
225+
Type: appsv1.DeploymentProgressing,
226+
Status: corev1.ConditionFalse,
227+
Reason: progressDeadlineExceededReason,
228+
}},
229+
},
230+
},
231+
want: deployStatusFailed,
232+
},
233+
{
234+
// A serving deploy whose newest rollout timed out (failed redeploy,
235+
// previous ReplicaSet still serving) stays healthy — available-replica
236+
// check precedes the deadline check.
237+
name: "ProgressDeadlineExceeded but a replica is available stays healthy",
238+
d: &appsv1.Deployment{
239+
Status: appsv1.DeploymentStatus{
240+
AvailableReplicas: 1,
241+
Conditions: []appsv1.DeploymentCondition{{
242+
Type: appsv1.DeploymentProgressing,
243+
Status: corev1.ConditionFalse,
244+
Reason: progressDeadlineExceededReason,
245+
}},
246+
},
247+
},
248+
want: deployStatusHealthy,
249+
},
250+
{
251+
// Progressing=True (rollout within deadline) is NOT a deadline failure.
252+
name: "Progressing True with zero available is deploying",
253+
d: &appsv1.Deployment{
254+
Status: appsv1.DeploymentStatus{
255+
UnavailableReplicas: 1,
256+
Conditions: []appsv1.DeploymentCondition{{
257+
Type: appsv1.DeploymentProgressing,
258+
Status: corev1.ConditionTrue,
259+
Reason: "ReplicaSetUpdated",
260+
}},
261+
},
262+
},
263+
want: deployStatusDeploying,
264+
},
217265
{
218266
name: "all zeros is building",
219267
d: &appsv1.Deployment{Status: appsv1.DeploymentStatus{}},
internal/jobs/deploy_runtime_failed_metric_test.go

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
package jobs
2+
3+
// deploy_runtime_failed_metric_test.go — covers the runtime rollout-failure
4+
// detector wired into computeNewStatus (broken-image silent-failure fix,
5+
// 2026-06-08). Asserts BOTH that a ProgressDeadlineExceeded rollout maps to
6+
// "failed" AND that instant_deploy_runtime_failed_detected_total increments for
7+
// that path only (and NOT for a generic DeploymentReplicaFailure, which is a
8+
// distinct cause out of this counter's scope).
9+
10+
import (
11+
"context"
12+
"testing"
13+
14+
sqlmock "github.com/DATA-DOG/go-sqlmock"
15+
"github.com/prometheus/client_golang/prometheus/testutil"
16+
appsv1 "k8s.io/api/apps/v1"
17+
corev1 "k8s.io/api/core/v1"
18+
19+
"instant.dev/worker/internal/metrics"
20+
)
21+
22+
func TestComputeNewStatus_ProgressDeadlineExceeded_FailsAndCountsMetric(t *testing.T) {
23+
db, _, err := sqlmock.New()
24+
if err != nil {
25+
t.Fatalf("sqlmock.New: %v", err)
26+
}
27+
defer db.Close()
28+
29+
k8s := newFakeDeployStatusK8s()
30+
// Rollout exceeded its progress deadline with no available replica —
31+
// the broken-image runtime failure (container can't start).
32+
k8s.objs["instant-deploy-pde|app-pde"] = &appsv1.Deployment{
33+
Status: appsv1.DeploymentStatus{
34+
UnavailableReplicas: 1,
35+
Conditions: []appsv1.DeploymentCondition{{
36+
Type: appsv1.DeploymentProgressing,
37+
Status: corev1.ConditionFalse,
38+
Reason: progressDeadlineExceededReason,
39+
}},
40+
},
41+
}
42+
w := NewDeployStatusReconciler(db, k8s)
43+
44+
before := testutil.ToFloat64(
45+
metrics.DeployRuntimeFailedDetectedTotal.WithLabelValues(runtimeFailReasonProgressDeadline))
46+
47+
status, err := w.computeNewStatus(context.Background(), "app-pde")
48+
if err != nil {
49+
t.Fatalf("computeNewStatus: %v", err)
50+
}
51+
if status != deployStatusFailed {
52+
t.Errorf("status = %q, want failed", status)
53+
}
54+
55+
after := testutil.ToFloat64(
56+
metrics.DeployRuntimeFailedDetectedTotal.WithLabelValues(runtimeFailReasonProgressDeadline))
57+
if after-before != 1 {
58+
t.Errorf("DeployRuntimeFailedDetectedTotal delta = %v, want 1", after-before)
59+
}
60+
}
61+
62+
// TestComputeNewStatus_ReplicaFailure_DoesNotCountRuntimeMetric pins the
63+
// attribution boundary: a DeploymentReplicaFailure also maps to "failed" but
64+
// must NOT increment the runtime-progress-deadline counter (distinct cause).
65+
func TestComputeNewStatus_ReplicaFailure_DoesNotCountRuntimeMetric(t *testing.T) {
66+
db, _, err := sqlmock.New()
67+
if err != nil {
68+
t.Fatalf("sqlmock.New: %v", err)
69+
}
70+
defer db.Close()
71+
72+
k8s := newFakeDeployStatusK8s()
73+
k8s.objs["instant-deploy-rf|app-rf"] = &appsv1.Deployment{
74+
Status: appsv1.DeploymentStatus{
75+
Conditions: []appsv1.DeploymentCondition{{
76+
Type: appsv1.DeploymentReplicaFailure,
77+
Status: corev1.ConditionTrue,
78+
}},
79+
},
80+
}
81+
w := NewDeployStatusReconciler(db, k8s)
82+
83+
before := testutil.ToFloat64(
84+
metrics.DeployRuntimeFailedDetectedTotal.WithLabelValues(runtimeFailReasonProgressDeadline))
85+
86+
status, err := w.computeNewStatus(context.Background(), "app-rf")
87+
if err != nil {
88+
t.Fatalf("computeNewStatus: %v", err)
89+
}
90+
if status != deployStatusFailed {
91+
t.Errorf("status = %q, want failed", status)
92+
}
93+
94+
after := testutil.ToFloat64(
95+
metrics.DeployRuntimeFailedDetectedTotal.WithLabelValues(runtimeFailReasonProgressDeadline))
96+
if after != before {
97+
t.Errorf("replica-failure must not bump runtime counter: delta = %v", after-before)
98+
}
99+
}

internal/jobs/deploy_status_reconcile.go

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,20 @@ const (
142142
deployStatusFailed = "failed"
143143
deployStatusStopped = "stopped"
144144

145+
// progressDeadlineExceededReason is the Reason k8s stamps on a Deployment's
146+
// Progressing condition (status=False) when a rollout fails to make progress
147+
// within spec.progressDeadlineSeconds (default 600s). k8s does not export it
148+
// as a typed constant (deploymentutil.TimedOutReason internally), so it is
149+
// named here per the no-hardcoded-strings rule. Kept verbatim in sync with
150+
// the api's k8s provider (progressDeadlineExceededReason in client.go).
151+
progressDeadlineExceededReason = "ProgressDeadlineExceeded"
152+
153+
// runtimeFailReasonProgressDeadline is the bounded `reason` label on
154+
// instant_deploy_runtime_failed_detected_total for a rollout that exceeded
155+
// its progress deadline with no available replica (the broken-image runtime
156+
// silent-failure class).
157+
runtimeFailReasonProgressDeadline = "progress_deadline_exceeded"
158+
145159
// stuckBuildingReapMessage is stamped onto a reaped row's error_message
146160
// (only when the api hadn't already written one) so the user-facing
147161
// failure surface explains why the build never produced an app.
@@ -607,7 +621,15 @@ func (w *DeployStatusReconciler) computeNewStatus(ctx context.Context, providerI
607621
return deployStatusBuilding, nil
608622
}
609623

610-
return deploymentStatusFromK8s(deploy), nil
624+
status := deploymentStatusFromK8s(deploy)
625+
if status == deployStatusFailed && deploymentProgressDeadlineExceeded(deploy) {
626+
// Runtime rollout-failure detection (broken-image silent-failure fix,
627+
// 2026-06-08). Attribute ONLY the progress-deadline path —
628+
// DeploymentReplicaFailure also maps to failed but is a distinct cause
629+
// (the ReplicaSet could not create pods) and is not this counter's scope.
630+
metrics.DeployRuntimeFailedDetectedTotal.WithLabelValues(runtimeFailReasonProgressDeadline).Inc()
631+
}
632+
return status, nil
611633
}
612634

613635
// jobIsFailed reports whether a kaniko build Job has reached a terminal
@@ -674,12 +696,40 @@ func deploymentStatusFromK8s(deploy *appsv1.Deployment) string {
674696
if deploy.Status.AvailableReplicas >= 1 {
675697
return deployStatusHealthy
676698
}
699+
// Rollout exceeded its progress deadline with NO available replica: the
700+
// pods were created but their containers cannot start — the modal cause is
701+
// a broken built image (CreateContainerError "no command specified",
702+
// ImagePullBackOff, or CrashLoopBackOff). Past the deadline k8s takes no
703+
// corrective action (it only reports this condition), so we treat it as terminal. Without this branch such a deploy reports
704+
// "deploying" forever (UnavailableReplicas>0 below) and never transitions to
705+
// failed — so the failure-autopsy (gated on newStatus==failed) never fires
706+
// and the user gets no failure email. This is the runtime twin of the
707+
// build-Job-failed override (jobIsFailed). Checked AFTER the healthy branch
708+
// so a partially-failed redeploy whose previous ReplicaSet still serves is
709+
// reported healthy, not failed. Kept in sync with the api's deploymentStatus.
710+
if deploymentProgressDeadlineExceeded(deploy) {
711+
return deployStatusFailed
712+
}
677713
if deploy.Status.UpdatedReplicas > 0 || deploy.Status.UnavailableReplicas > 0 {
678714
return deployStatusDeploying
679715
}
680716
return deployStatusBuilding
681717
}
682718

719+
// deploymentProgressDeadlineExceeded reports whether the Deployment's
720+
// Progressing condition is False with reason ProgressDeadlineExceeded — k8s's
721+
// definitive "this rollout will not make progress" verdict.
722+
func deploymentProgressDeadlineExceeded(deploy *appsv1.Deployment) bool {
723+
for _, cond := range deploy.Status.Conditions {
724+
if cond.Type == appsv1.DeploymentProgressing &&
725+
cond.Status == corev1.ConditionFalse &&
726+
cond.Reason == progressDeadlineExceededReason {
727+
return true
728+
}
729+
}
730+
return false
731+
}
732+
683733
// deployNamespaceFromProviderID derives the per-deployment namespace from the
684734
// provider_id stored on a deployments row. provider_id = "app-<appID>";
685735
// namespace = "instant-deploy-<appID>". Returns "" for rows whose provider_id

internal/metrics/metrics.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -665,6 +665,34 @@ var (
665665
Help: "Kaniko build Jobs detected in Failed state by deploy_status_reconcile (silent-deploy-failure fix, 2026-05-30). Labelled by Job Failed-condition reason.",
666666
}, []string{"reason"})
667667

668+
// ── deploy_status_reconcile — runtime rollout-failure detector (2026-06-08) ──
669+
//
670+
// The runtime twin of the build-side DeployJobFailedDetectedTotal. Increments when the
671+
// reconciler flips a deployment to "failed" because the runtime k8s
672+
// Deployment exceeded its progress deadline with NO available replica
673+
// (Progressing=False, reason=ProgressDeadlineExceeded). This catches the
674+
// silent RUNTIME failure class the build-Job detector misses: the build
675+
// SUCCEEDED but the produced image cannot start (CreateContainerError "no
676+
// command specified", ImagePullBackOff, CrashLoopBackOff). Pre-fix these
677+
// deploys reported "deploying" forever and never autopsied or emailed.
678+
//
679+
// Label `reason`: bounded — currently only "progress_deadline_exceeded".
680+
//
681+
// NR alert (infra/newrelic/alerts/deploy-runtime-failed.json):
682+
// sum(rate(instant_deploy_runtime_failed_detected_total[15m])) > 0
683+
// for 15m → P1 page (user-visible recoverable: deploys are failing to
684+
// start at runtime — likely a broken base image, a registry/pull-secret
685+
// regression, or a platform image-build defect producing empty images).
686+
//
687+
// Catalog row (infra/observability/METRICS-CATALOG.md):
688+
// instant_deploy_runtime_failed_detected_total | counter | reason | lazy
689+
// (first observation is a real runtime-failure detection; the label is
690+
// primed in metrics_test.go — test binary only, so production /metrics shows it after the first detection).
691+
DeployRuntimeFailedDetectedTotal = promauto.NewCounterVec(prometheus.CounterOpts{
692+
Name: "instant_deploy_runtime_failed_detected_total",
693+
Help: "Runtime Deployments detected as failed-to-progress by deploy_status_reconcile (ProgressDeadlineExceeded with no available replica — broken-image silent-failure fix, 2026-06-08). Labelled by detection reason.",
694+
}, []string{"reason"})
695+
668696
// ── deploy_failure_autopsy — capture outcome counter (PR 2, 2026-05-30) ──
669697
//
670698
// Increments once per captureDeploymentAutopsy call, labelled by outcome.

internal/metrics/metrics_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,10 @@ func TestAllMetrics_AreRegistered(t *testing.T) {
101101
E2ECohortSweptTotal.WithLabelValues("failed").Add(0)
102102
E2ECohortSweptTotal.WithLabelValues("skipped_not_cohort").Add(0)
103103
DeployJobFailedDetectedTotal.WithLabelValues("BackoffLimitExceeded").Add(0)
104+
// Prime the runtime-failure detector label so /metrics exposes it from
105+
// process start (lazy *Vec; first real observation is a ProgressDeadlineExceeded
106+
// detection in deploy_status_reconcile).
107+
DeployRuntimeFailedDetectedTotal.WithLabelValues("progress_deadline_exceeded").Add(0)
104108
// Prime all four DeployAutopsyCapturedTotal outcome label values so
105109
// /metrics exposes them from process start (lazy emit otherwise leaves
106110
// the panel empty until the first real autopsy fires).

0 commit comments

Comments
 (0)