fix(deploy): mark broken-image deploys failed on ProgressDeadlineExceeded (#101)

mastermanas805 · claude · web-flow · commit 4eb120707cdc · 2026-06-08T22:28:34.000+05:30
A deploy whose built image cannot start (CreateContainerError "no command specified" from an empty image, ImagePullBackOff, CrashLoopBackOff) was reported "deploying" forever: deploymentStatusFromK8s only checked DeploymentReplicaFailure + replica counts, so a created-but-unstartable pod (UnavailableReplicas>0) mapped to "deploying". The failure-autopsy is gated on newStatus==failed, so it never fired — no autopsy event, no deploy.failed audit, no failure email. This is the runtime twin of the build-Job-failed override.

- deploymentStatusFromK8s: Progressing=False/ProgressDeadlineExceeded with no available replica -> failed (checked after the healthy branch so a partially-failed redeploy whose old ReplicaSet still serves stays healthy). Kept in sync with the api's deploymentStatus.

- extractPodFailure: classify CreateContainerError / CreateContainerConfigError / RunContainerError -> new StartFailed reason (precise hint instead of Unknown).

- new metric instant_deploy_runtime_failed_detected_total{reason} (twin of instant_deploy_job_failed_detected_total); alert + dashboard tile + catalog row land in the infra repo (rule 25).

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/internal/jobs/deploy_failure_autopsy.go b/internal/jobs/deploy_failure_autopsy.go
@@ -121,6 +121,7 @@ const (
 	workerFailureReasonCrashLoopBackOff = "CrashLoopBackOff"
 	workerFailureReasonBuildFailed      = "BuildFailed"
 	workerFailureReasonDeadlineExceeded = "DeadlineExceeded"
+	workerFailureReasonStartFailed      = "StartFailed"
 	workerFailureReasonError            = "Error"
 	workerFailureReasonUnknown          = "Unknown"
 )
@@ -158,6 +159,11 @@ var workerFailureHint = map[string]string{
 		"Large base images or slow package installs can cause this. " +
 		"Try a smaller base image (e.g. alpine) and pre-install dependencies in the Dockerfile.",
 
+	workerFailureReasonStartFailed: "Kubernetes created your app's pod but the container could not start. " +
+		"The most common cause is a built image with no CMD/ENTRYPOINT (nothing to run) " +
+		"or an invalid container configuration. Make sure your Dockerfile ends with a " +
+		"CMD or ENTRYPOINT instruction, then re-deploy.",
+
 	workerFailureReasonError: "A Kubernetes replica failure was detected. " +
 		"This is often a transient scheduling or resource constraint. " +
 		"Re-deploy to retry; if it persists, check your Dockerfile for correct CMD/ENTRYPOINT.",
@@ -708,6 +714,14 @@ func extractPodFailure(pod *corev1.Pod, result *autopsyResult) {
 				result.event = fmt.Sprintf("ImagePullBackOff: %s", w.Message)
 			case "CrashLoopBackOff":
 				result.reason = workerFailureReasonCrashLoopBackOff
+			case "CreateContainerError", "CreateContainerConfigError", "RunContainerError":
+				// The pod was created but its container can't start — modal
+				// cause is a built image with no CMD/ENTRYPOINT ("no command
+				// specified") or an invalid container config. The container
+				// never runs, so there are no app logs; the waiting Message is
+				// the most useful diagnostic we can surface.
+				result.reason = workerFailureReasonStartFailed
+				result.event = fmt.Sprintf("%s: %s", w.Reason, w.Message)
 			}
 		}
 		// lastState gives us the terminated exit code even for CrashLoopBackOff.
diff --git a/internal/jobs/deploy_failure_autopsy_test.go b/internal/jobs/deploy_failure_autopsy_test.go
@@ -39,6 +39,7 @@ var workerKnownReasons = []string{
 	workerFailureReasonCrashLoopBackOff,
 	workerFailureReasonBuildFailed,
 	workerFailureReasonDeadlineExceeded,
+	workerFailureReasonStartFailed,
 	workerFailureReasonError,
 	workerFailureReasonUnknown,
 }
@@ -193,6 +194,28 @@ func TestExtractPodFailure_ImagePullBackOff(t *testing.T) {
 	}
 }
 
+// TestExtractPodFailure_StartFailed covers the broken-image runtime case: the
+// pod is created but the container can't start (CreateContainerError "no command
+// specified" from a 474-byte empty image, CreateContainerConfigError, or
+// RunContainerError). Each waiting reason runs as a subtest and must classify as
+// StartFailed with a non-empty event (only non-emptiness is asserted here).
+func TestExtractPodFailure_StartFailed(t *testing.T) {
+	for _, waitReason := range []string{"CreateContainerError", "CreateContainerConfigError", "RunContainerError"} {
+		t.Run(waitReason, func(t *testing.T) {
+			pod := buildPodWithWaiting(waitReason, "failed to generate spec: no command specified")
+			result := &autopsyResult{reason: workerFailureReasonUnknown} // start from Unknown so a pass proves the reason was overwritten
+			extractPodFailure(pod, result)
+
+			if result.reason != workerFailureReasonStartFailed {
+				t.Errorf("reason = %q, want StartFailed", result.reason)
+			}
+			if result.event == "" { // the event content itself is not checked, only that something was recorded
+				t.Error("expected non-empty event carrying the waiting message")
+			}
+		})
+	}
+}
+
 func TestExtractPodFailure_Evicted(t *testing.T) {
 	pod := &corev1.Pod{
 		ObjectMeta: metav1.ObjectMeta{Name: "evicted-pod"},
diff --git a/internal/jobs/deploy_lifecycle_coverage_test.go b/internal/jobs/deploy_lifecycle_coverage_test.go
@@ -214,6 +214,54 @@ func TestDeploymentStatusFromK8s_Matrix(t *testing.T) {
 			d:    &appsv1.Deployment{Status: appsv1.DeploymentStatus{UnavailableReplicas: 1}},
 			want: deployStatusDeploying,
 		},
+		{
+			// Broken-image runtime silent-failure: progress deadline exceeded,
+			// no available replica → failed (pre-fix this was "deploying" forever).
+			name: "ProgressDeadlineExceeded with zero available is failed",
+			d: &appsv1.Deployment{
+				Status: appsv1.DeploymentStatus{
+					UnavailableReplicas: 1,
+					Conditions: []appsv1.DeploymentCondition{{
+						Type:   appsv1.DeploymentProgressing,
+						Status: corev1.ConditionFalse,
+						Reason: progressDeadlineExceededReason,
+					}},
+				},
+			},
+			want: deployStatusFailed,
+		},
+		{
+			// A serving deploy whose newest rollout timed out (failed redeploy,
+			// previous ReplicaSet still serving) stays healthy — available-replica
+			// check precedes the deadline check.
+			name: "ProgressDeadlineExceeded but a replica is available stays healthy",
+			d: &appsv1.Deployment{
+				Status: appsv1.DeploymentStatus{
+					AvailableReplicas: 1,
+					Conditions: []appsv1.DeploymentCondition{{
+						Type:   appsv1.DeploymentProgressing,
+						Status: corev1.ConditionFalse,
+						Reason: progressDeadlineExceededReason,
+					}},
+				},
+			},
+			want: deployStatusHealthy,
+		},
+		{
+			// Progressing=True (rollout within deadline) is NOT a deadline failure.
+			name: "Progressing True with zero available is deploying",
+			d: &appsv1.Deployment{
+				Status: appsv1.DeploymentStatus{
+					UnavailableReplicas: 1,
+					Conditions: []appsv1.DeploymentCondition{{
+						Type:   appsv1.DeploymentProgressing,
+						Status: corev1.ConditionTrue,
+						Reason: "ReplicaSetUpdated",
+					}},
+				},
+			},
+			want: deployStatusDeploying,
+		},
 		{
 			name: "all zeros is building",
 			d:    &appsv1.Deployment{Status: appsv1.DeploymentStatus{}},
diff --git a/internal/jobs/deploy_runtime_failed_metric_test.go b/internal/jobs/deploy_runtime_failed_metric_test.go
@@ -0,0 +1,99 @@
+package jobs
+
+// deploy_runtime_failed_metric_test.go — covers the runtime rollout-failure
+// detector wired into computeNewStatus (broken-image silent-failure fix,
+// 2026-06-08). Asserts BOTH that a ProgressDeadlineExceeded rollout maps to
+// "failed" AND that instant_deploy_runtime_failed_detected_total increments for
+// that path only (and NOT for a generic DeploymentReplicaFailure, which is a
+// distinct cause out of this counter's scope).
+
+import (
+	"context"
+	"testing"
+
+	sqlmock "github.com/DATA-DOG/go-sqlmock"
+	"github.com/prometheus/client_golang/prometheus/testutil"
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+
+	"instant.dev/worker/internal/metrics"
+)
+
+func TestComputeNewStatus_ProgressDeadlineExceeded_FailsAndCountsMetric(t *testing.T) {
+	db, _, err := sqlmock.New()
+	if err != nil {
+		t.Fatalf("sqlmock.New: %v", err)
+	}
+	defer db.Close()
+
+	k8s := newFakeDeployStatusK8s()
+	// Fixture: Progressing=False/ProgressDeadlineExceeded with one unavailable
+	// replica and none available — the broken-image runtime failure shape.
+	k8s.objs["instant-deploy-pde|app-pde"] = &appsv1.Deployment{
+		Status: appsv1.DeploymentStatus{
+			UnavailableReplicas: 1,
+			Conditions: []appsv1.DeploymentCondition{{
+				Type:   appsv1.DeploymentProgressing,
+				Status: corev1.ConditionFalse,
+				Reason: progressDeadlineExceededReason,
+			}},
+		},
+	}
+	w := NewDeployStatusReconciler(db, k8s)
+
+	before := testutil.ToFloat64(
+		metrics.DeployRuntimeFailedDetectedTotal.WithLabelValues(runtimeFailReasonProgressDeadline))
+
+	status, err := w.computeNewStatus(context.Background(), "app-pde")
+	if err != nil {
+		t.Fatalf("computeNewStatus: %v", err)
+	}
+	if status != deployStatusFailed {
+		t.Errorf("status = %q, want failed", status)
+	}
+
+	after := testutil.ToFloat64(
+		metrics.DeployRuntimeFailedDetectedTotal.WithLabelValues(runtimeFailReasonProgressDeadline))
+	if after-before != 1 { // the counter is a package-level var: assert the delta, not the absolute value
+		t.Errorf("DeployRuntimeFailedDetectedTotal delta = %v, want 1", after-before)
+	}
+}
+
+// TestComputeNewStatus_ReplicaFailure_DoesNotCountRuntimeMetric pins the
+// attribution boundary: a Deployment whose only condition is ReplicaFailure=True
+// still maps to "failed" but must leave the progress_deadline_exceeded series unchanged.
+func TestComputeNewStatus_ReplicaFailure_DoesNotCountRuntimeMetric(t *testing.T) {
+	db, _, err := sqlmock.New()
+	if err != nil {
+		t.Fatalf("sqlmock.New: %v", err)
+	}
+	defer db.Close()
+
+	k8s := newFakeDeployStatusK8s()
+	k8s.objs["instant-deploy-rf|app-rf"] = &appsv1.Deployment{
+		Status: appsv1.DeploymentStatus{
+			Conditions: []appsv1.DeploymentCondition{{
+				Type:   appsv1.DeploymentReplicaFailure,
+				Status: corev1.ConditionTrue,
+			}},
+		},
+	}
+	w := NewDeployStatusReconciler(db, k8s)
+
+	before := testutil.ToFloat64(
+		metrics.DeployRuntimeFailedDetectedTotal.WithLabelValues(runtimeFailReasonProgressDeadline))
+
+	status, err := w.computeNewStatus(context.Background(), "app-rf")
+	if err != nil {
+		t.Fatalf("computeNewStatus: %v", err)
+	}
+	if status != deployStatusFailed {
+		t.Errorf("status = %q, want failed", status)
+	}
+
+	after := testutil.ToFloat64(
+		metrics.DeployRuntimeFailedDetectedTotal.WithLabelValues(runtimeFailReasonProgressDeadline))
+	if after != before { // compare before/after: the counter is a package-level var
+		t.Errorf("replica-failure must not bump runtime counter: delta = %v", after-before)
+	}
+}
diff --git a/internal/jobs/deploy_status_reconcile.go b/internal/jobs/deploy_status_reconcile.go
@@ -142,6 +142,20 @@ const (
 	deployStatusFailed    = "failed"
 	deployStatusStopped   = "stopped"
 
+	// progressDeadlineExceededReason is the Reason k8s stamps on a Deployment's
+	// Progressing condition (status=False) when a rollout fails to make progress
+	// within spec.progressDeadlineSeconds (default 600s). k8s does not export it
+	// as a typed constant (deploymentutil.TimedOutReason internally), so it is
+	// named here per the no-hardcoded-strings rule. Kept verbatim in sync with
+	// the api's k8s provider (progressDeadlineExceededReason in client.go).
+	progressDeadlineExceededReason = "ProgressDeadlineExceeded"
+
+	// runtimeFailReasonProgressDeadline is the bounded `reason` label on
+	// instant_deploy_runtime_failed_detected_total for a rollout that exceeded
+	// its progress deadline with no available replica (the broken-image runtime
+	// silent-failure class).
+	runtimeFailReasonProgressDeadline = "progress_deadline_exceeded"
+
 	// stuckBuildingReapMessage is stamped onto a reaped row's error_message
 	// (only when the api hadn't already written one) so the user-facing
 	// failure surface explains why the build never produced an app.
@@ -607,7 +621,15 @@ func (w *DeployStatusReconciler) computeNewStatus(ctx context.Context, providerI
 		return deployStatusBuilding, nil
 	}
 
-	return deploymentStatusFromK8s(deploy), nil
+	status := deploymentStatusFromK8s(deploy)
+	if status == deployStatusFailed && deploymentProgressDeadlineExceeded(deploy) {
+		// Runtime rollout-failure detection (broken-image silent-failure fix,
+		// 2026-06-08). Attribute ONLY the progress-deadline path —
+		// DeploymentReplicaFailure also maps to failed but is a distinct cause
+		// (the ReplicaSet could not create pods) and is not this counter's scope.
+		metrics.DeployRuntimeFailedDetectedTotal.WithLabelValues(runtimeFailReasonProgressDeadline).Inc()
+	}
+	return status, nil
 }
 
 // jobIsFailed reports whether a kaniko build Job has reached a terminal
@@ -674,12 +696,40 @@ func deploymentStatusFromK8s(deploy *appsv1.Deployment) string {
 	if deploy.Status.AvailableReplicas >= 1 {
 		return deployStatusHealthy
 	}
+	// Rollout exceeded its progress deadline with NO available replica: the
+	// pods were created but their containers cannot start — the modal cause is
+	// a broken built image (CreateContainerError "no command specified",
+	// ImagePullBackOff, or CrashLoopBackOff). k8s takes no corrective action on
+	// a stalled rollout, so we treat it as terminal. Without this branch it reports
+	// "deploying" forever (UnavailableReplicas>0 below) and never transitions to
+	// failed — so the failure-autopsy (gated on newStatus==failed) never fires
+	// and the user gets no failure email. This is the runtime twin of the
+	// build-Job-failed override (jobIsFailed). Checked AFTER the healthy branch
+	// so a partially-failed redeploy whose previous ReplicaSet still serves is
+	// reported healthy, not failed. Kept in sync with the api's deploymentStatus.
+	if deploymentProgressDeadlineExceeded(deploy) {
+		return deployStatusFailed
+	}
 	if deploy.Status.UpdatedReplicas > 0 || deploy.Status.UnavailableReplicas > 0 {
 		return deployStatusDeploying
 	}
 	return deployStatusBuilding
 }
 
+// deploymentProgressDeadlineExceeded reports whether any status condition is
+// Progressing=False with reason ProgressDeadlineExceeded. It inspects conditions
+// only; replica availability is not considered here (callers check it).
+func deploymentProgressDeadlineExceeded(deploy *appsv1.Deployment) bool {
+	for _, cond := range deploy.Status.Conditions {
+		if cond.Type == appsv1.DeploymentProgressing &&
+			cond.Status == corev1.ConditionFalse &&
+			cond.Reason == progressDeadlineExceededReason {
+			return true
+		}
+	}
+	return false // no matching condition (including an empty condition list)
+}
+
 // deployNamespaceFromProviderID derives the per-deployment namespace from the
 // provider_id stored on a deployments row. provider_id = "app-<appID>";
 // namespace = "instant-deploy-<appID>". Returns "" for rows whose provider_id
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
@@ -665,6 +665,34 @@ var (
 		Help: "Kaniko build Jobs detected in Failed state by deploy_status_reconcile (silent-deploy-failure fix, 2026-05-30). Labelled by Job Failed-condition reason.",
 	}, []string{"reason"})
 
+	// ── deploy_status_reconcile — runtime rollout-failure detector (2026-06-08) ──
+	//
+	// The runtime twin of the build-side DeployJobFailedDetectedTotal. Increments when the
+	// reconciler flips a deployment to "failed" because the runtime k8s
+	// Deployment exceeded its progress deadline with NO available replica
+	// (Progressing=False, reason=ProgressDeadlineExceeded). This catches the
+	// silent RUNTIME failure class the build-Job detector misses: the build
+	// SUCCEEDED but the produced image cannot start (CreateContainerError "no
+	// command specified", ImagePullBackOff, CrashLoopBackOff). Pre-fix these
+	// deploys reported "deploying" forever and never autopsied or emailed.
+	//
+	// Label `reason`: bounded — currently only "progress_deadline_exceeded".
+	//
+	// NR alert (infra/newrelic/alerts/deploy-runtime-failed.json):
+	//   sum(rate(instant_deploy_runtime_failed_detected_total[15m])) > 0
+	//     for 15m → P1 page (user-visible recoverable: deploys are failing to
+	//     start at runtime — likely a broken base image, a registry/pull-secret
+	//     regression, or a platform image-build defect producing empty images).
+	//
+	// Catalog row (infra/observability/METRICS-CATALOG.md):
+	//   instant_deploy_runtime_failed_detected_total | counter | reason | lazy
+	//   (first observation is a real runtime-failure detection; the priming in
+	//   metrics_test.go is test-only and does not pre-expose it on /metrics).
+	DeployRuntimeFailedDetectedTotal = promauto.NewCounterVec(prometheus.CounterOpts{
+		Name: "instant_deploy_runtime_failed_detected_total",
+		Help: "Runtime Deployments detected as failed-to-progress by deploy_status_reconcile (ProgressDeadlineExceeded with no available replica — broken-image silent-failure fix, 2026-06-08). Labelled by detection reason.",
+	}, []string{"reason"})
+
 	// ── deploy_failure_autopsy — capture outcome counter (PR 2, 2026-05-30) ──
 	//
 	// Increments once per captureDeploymentAutopsy call, labelled by outcome.
diff --git a/internal/metrics/metrics_test.go b/internal/metrics/metrics_test.go
@@ -101,6 +101,10 @@ func TestAllMetrics_AreRegistered(t *testing.T) {
 	E2ECohortSweptTotal.WithLabelValues("failed").Add(0)
 	E2ECohortSweptTotal.WithLabelValues("skipped_not_cohort").Add(0)
 	DeployJobFailedDetectedTotal.WithLabelValues("BackoffLimitExceeded").Add(0)
+	// Prime the runtime-failure detector label so the series exists for this
+	// test (lazy *Vec; this priming is test-only — in production the first real
+	// observation is a ProgressDeadlineExceeded detection in deploy_status_reconcile).
+	DeployRuntimeFailedDetectedTotal.WithLabelValues("progress_deadline_exceeded").Add(0)
 	// Prime all four DeployAutopsyCapturedTotal outcome label values so
 	// /metrics exposes them from process start (lazy emit otherwise leaves
 	// the panel empty until the first real autopsy fires).