diff --git a/internal/jobs/propagation_runner.go b/internal/jobs/propagation_runner.go
index a953118..4e7a447 100644
--- a/internal/jobs/propagation_runner.go
+++ b/internal/jobs/propagation_runner.go
@@ -59,6 +59,7 @@ import (
 	"context"
 	"database/sql"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"log/slog"
 	"os"
@@ -70,6 +71,7 @@ import (
 	"go.opentelemetry.io/otel"
 
 	commonv1 "instant.dev/proto/common/v1"
+	"instant.dev/worker/internal/metrics"
 )
 
 // ─── named constants ──────────────────────────────────────────────────────────
@@ -93,9 +95,15 @@ const (
 // Mirrors api/internal/models/audit_kinds.go's AuditKindPropagation*
 // constants. Same drift contract as propagationKindTierElevation above.
 const (
-	auditKindPropagationApplied      = "propagation.applied"
-	auditKindPropagationRetrying     = "propagation.retrying"
-	auditKindPropagationDeadLettered = "propagation.dead_lettered"
+	auditKindPropagationApplied        = "propagation.applied"
+	auditKindPropagationRetrying       = "propagation.retrying"
+	auditKindPropagationDeadLettered   = "propagation.dead_lettered"
+	auditKindPropagationUnexpectedSkip = "propagation.unexpected_skip"
+	// CHAOS F2 (CHAOS-DRILL-2026-05-20): distinct audit kind for the
+	// old-worker/new-api rollout-skew dead-letter path. Lets the operator
+	// filter the image-skew signal independently of real-failure dead-letters
+	// via `audit_log.kind`. Mirrors api/internal/models/audit_kinds.go.
+	auditKindPropagationUnknownKindDeadLettered = "propagation.unknown_kind_dead_lettered"
 )
 
 // propagationActor is the audit_log.actor value the runner writes. Distinct
@@ -152,6 +160,105 @@ var propagationBackoffSchedule = []time.Duration{
 // gRPC error string.
 const propagationLastErrorMax = 1000
 
+// propagationAllowedSkipSubstrings enumerates the SkipReason fragments that
+// the provisioner uses to signal a TRUE no-op (idempotent re-apply / a
+// resource type with no regrade arm). Anything else is an unexpected_skip
+// and triggers retry/dead-letter per F1 (CHAOS-DRILL-2026-05-20).
+//
+// Held centrally (not inline) so the test surface and the bucketSkipReason
+// helper can share the list. Adding an entry here changes the silent-skip
+// surface; do it only when the provisioner adds a new "this is fine"
+// outcome that genuinely doesn't need a retry.
+var propagationAllowedSkipSubstrings = []string{
+	"already correct",
+	"unsupported resource type",
+	"backend does not support",
+}
+
+// isPropagationAllowedSkip returns true when the provisioner's SkipReason
+// is one of the allowed no-op signals.
+func isPropagationAllowedSkip(reason string) bool {
+	for _, sub := range propagationAllowedSkipSubstrings {
+		if strings.Contains(reason, sub) {
+			return true
+		}
+	}
+	return false
+}
+
+// propagationUnexpectedSkipDetail captures one offending resource for an
+// unexpected_skip retry. Multiple resources can fail in one dispatch; the
+// operator wants to see every one in last_error so they can fix the
+// underlying issue in batch.
+type propagationUnexpectedSkipDetail struct {
+	ResourceID   string
+	ResourceType string
+	SkipReason   string
+}
+
+// propagationUnexpectedSkipErr is the error returned by handleTierElevation
+// when at least one resource's RegradeResource returned (Applied=false,
+// SkipReason=<not in propagationAllowedSkipSubstrings>). The runner's
+// markRetry / markDeadLettered path detects this via errors.Is and emits
+// propagation.unexpected_skip audit kind (not propagation.applied) — the
+// operator-page signal for F1.
+//
+// CHAOS-DRILL-2026-05-20 F1: pre-fix the handler WARN-and-applied this
+// case (firstErr stayed nil); the row got stamped applied_at and the
+// regrade never landed. A paying customer's infra cap silently stayed at
+// the old tier and no alert fired. This error type forces the case to
+// retry → dead-letter and surfaces it loudly via a distinct audit kind +
+// Prometheus counter.
+type propagationUnexpectedSkipErr struct {
+	Resources []propagationUnexpectedSkipDetail
+}
+
+func (e *propagationUnexpectedSkipErr) Error() string {
+	if e == nil || len(e.Resources) == 0 {
+		return "unexpected_skip: <empty>"
+	}
+	parts := make([]string, 0, len(e.Resources))
+	for _, r := range e.Resources {
+		parts = append(parts, fmt.Sprintf("%s (%s): %s", r.ResourceID, r.ResourceType, r.SkipReason))
+	}
+	return "unexpected_skip from provisioner: " + strings.Join(parts, "; ")
+}
+
+// errPropagationUnexpectedSkipSentinel is the comparison target for
+// errors.Is. The Is method on propagationUnexpectedSkipErr matches it so
+// callers can switch on the class without depending on the concrete type.
+var errPropagationUnexpectedSkipSentinel = errors.New("propagation: unexpected_skip from provisioner")
+
+// Is satisfies errors.Is for the sentinel — any propagationUnexpectedSkipErr
+// matches errPropagationUnexpectedSkipSentinel.
+func (e *propagationUnexpectedSkipErr) Is(target error) bool {
+	return target == errPropagationUnexpectedSkipSentinel
+}
+
+// bucketSkipReason maps the raw provisioner SkipReason to a short canonical
+// bucket suitable for a Prometheus label. Keeps cardinality bounded — never
+// pass the raw SkipReason as a metric label. Adding a new bucket here is
+// safe; the default "other" catches anything unrecognised.
+func bucketSkipReason(reason string) string {
+	r := strings.ToLower(reason)
+	switch {
+	case strings.Contains(r, "postgres-admin secret") || strings.Contains(r, "postgres_admin"):
+		return "postgres_admin_secret_missing"
+	case strings.Contains(r, "redis-auth secret"):
+		return "redis_auth_secret_missing"
+	case strings.Contains(r, "namespace not found"):
+		return "namespace_not_found"
+	case strings.Contains(r, "not reachable") || strings.Contains(r, "unreachable"):
+		return "resource_not_reachable"
+	case strings.Contains(r, "pod not found") || strings.Contains(r, "no pod"):
+		return "pod_not_found"
+	case strings.Contains(r, "legacy resource") || strings.Contains(r, "legacy pod"):
+		return "legacy_resource"
+	default:
+		return "other"
+	}
+}
+
 // ─── job definition + handler registry ────────────────────────────────────────
 
 // PropagationRunnerArgs is the River job payload. No fields — sweep job.
@@ -282,8 +389,31 @@ func (w *PropagationRunnerWorker) Work(ctx context.Context, job *river.Job[Propa
 			// image could see a row from a newer api enqueue. Treat as a
 			// retryable failure so the row is NOT dead-lettered before
 			// the worker rolls forward.
+			//
+			// CHAOS F2 (CHAOS-DRILL-2026-05-20): apply the SAME maxAttempts
+			// ceiling here that real-failure rows get below. Without this,
+			// an old worker pod that doesn't recognise a freshly-enqueued
+			// kind retries forever — the chaos drill on 2026-05-20 saw a
+			// chaos_test_unknown_kind row reach attempts=10 in 4 minutes
+			// without ever transitioning to failed_at. Bounding the
+			// unknown-kind path on the same dead-letter horizon as a real
+			// failure (a) caps the eventual blast radius of an old image
+			// surviving the rollout window, (b) gives the operator a clean
+			// alert-able signal via instant_propagation_dead_lettered_total{reason="unknown_kind"}
+			// rather than an unbounded retry storm. The per-tick image-skew
+			// indicator stays in PropagationUnknownKindTotal — that fires on
+			// EVERY tick while the row is alive, so the operator can react
+			// in seconds instead of waiting the ~24h backoff for a
+			// dead-letter to fire.
 			unknownKind++
-			w.markRetry(ctx, row, fmt.Errorf("no handler registered for kind %q", row.kind))
+			metrics.PropagationUnknownKindTotal.WithLabelValues(row.kind).Inc()
+			dispatchErr := fmt.Errorf("no handler registered for kind %q", row.kind)
+			if row.attempts+1 >= propagationMaxAttempts {
+				deadLettered++
+				w.markUnknownKindDeadLettered(ctx, row, dispatchErr)
+				continue
+			}
+			w.markRetry(ctx, row, dispatchErr)
 			continue
 		}
 
@@ -424,10 +554,18 @@ func propagationBackoffFor(attempts int) time.Duration {
 }
 
 // markRetry persists attempts+1, schedules next_attempt_at via
-// propagationBackoffFor, persists last_error, and emits a DEBUG-level
-// propagation.retrying audit row. Best-effort: a DB failure here only
-// means the next tick may re-process the same row (idempotent handler
-// makes that safe) and the audit row is missing.
+// propagationBackoffFor, persists last_error, and emits an audit row.
+//
+// Audit kind selection (CHAOS-DRILL-2026-05-20 F1):
+//   - errors.Is(dispatchErr, errPropagationUnexpectedSkipSentinel) →
+//     propagation.unexpected_skip audit row + WARN slog (the F1 loud
+//     signal: provisioner returned applied=false with a skip reason
+//     outside the allowed set — regrade did NOT land).
+//   - else → propagation.retrying audit row + DEBUG slog (routine retry).
+//
+// Best-effort: a DB failure here only means the next tick may re-process
+// the same row (idempotent handler makes that safe) and the audit row is
+// missing.
 func (w *PropagationRunnerWorker) markRetry(ctx context.Context, row propagationRow, dispatchErr error) {
 	delay := propagationBackoffFor(row.attempts)
 	nextAttempt := w.now().Add(delay)
@@ -463,23 +601,47 @@ func (w *PropagationRunnerWorker) markRetry(ctx context.Context, row propagation
 		return
 	}
 
-	// DEBUG audit row — retries are routine during a Razorpay/provisioner
-	// outage; logging at WARN would spam. Operators inspect the row's
-	// `attempts` + `last_error` columns directly for diagnosis.
-	w.insertPropagationAuditRow(ctx, row, auditKindPropagationRetrying, fmt.Sprintf(
-		"%s propagation for team %s retrying (attempt %d/%d)",
-		row.kind, row.teamID.String(), row.attempts+1, propagationMaxAttempts,
+	// Audit kind: unexpected_skip is the loud F1 signal; retrying is the
+	// routine path. Both share the same DB UPDATE above — only the audit
+	// row + log line differ, so operators can grep/alert on the F1 class
+	// without false-positiving on a transient Razorpay outage.
+	isUnexpectedSkip := errors.Is(dispatchErr, errPropagationUnexpectedSkipSentinel)
+	auditKind := auditKindPropagationRetrying
+	if isUnexpectedSkip {
+		auditKind = auditKindPropagationUnexpectedSkip
+	}
+
+	w.insertPropagationAuditRow(ctx, row, auditKind, fmt.Sprintf(
+		"%s propagation for team %s retrying (attempt %d/%d, kind=%s)",
+		row.kind, row.teamID.String(), row.attempts+1, propagationMaxAttempts, auditKind,
 	), map[string]any{
-		"propagation_id":   row.id.String(),
-		"kind":             row.kind,
-		"team_id":          row.teamID.String(),
-		"target_tier":      nullableTierString(row.targetTier),
-		"attempts":         row.attempts + 1,
-		"max_attempts":     propagationMaxAttempts,
-		"next_attempt_at":  nextAttempt.UTC().Format(time.RFC3339),
-		"last_error":       lastErr,
+		"propagation_id":  row.id.String(),
+		"kind":            row.kind,
+		"team_id":         row.teamID.String(),
+		"target_tier":     nullableTierString(row.targetTier),
+		"attempts":        row.attempts + 1,
+		"max_attempts":    propagationMaxAttempts,
+		"next_attempt_at": nextAttempt.UTC().Format(time.RFC3339),
+		"last_error":      lastErr,
 	})
 
+	if isUnexpectedSkip {
+		// WARN-level: unexpected_skip is the F1 signal — louder than a
+		// routine retry (DEBUG), quieter than the eventual dead-letter
+		// (ERROR). Pre-fix this case was silently APPLIED, so even WARN
+		// is a step up.
+		slog.Warn("jobs.propagation_runner.unexpected_skip_retrying",
+			"propagation_id", row.id.String(),
+			"team_id", row.teamID.String(),
+			"kind", row.kind,
+			"attempts", row.attempts+1,
+			"max_attempts", propagationMaxAttempts,
+			"next_attempt_at", nextAttempt,
+			"last_error", lastErr,
+		)
+		return
+	}
+
 	slog.Debug("jobs.propagation_runner.retrying",
 		"propagation_id", row.id.String(),
 		"team_id", row.teamID.String(),
@@ -607,6 +769,24 @@ func (w *PropagationRunnerWorker) markDeadLettered(ctx context.Context, row prop
 	)
 	w.insertPropagationAuditRow(ctx, row, auditKindPropagationDeadLettered, summary, meta)
 
+	// CHAOS F3 (CHAOS-DRILL-2026-05-20): increment the Prom counter. Labels:
+	//   reason = "max_attempts" — the modal real-failure path. F1's
+	//                             unexpected_skip-as-failure, real
+	//                             RegradeResource gRPC errors, AND
+	//                             F1's markApplied DB failures all flow
+	//                             through here once they reach the
+	//                             propagationMaxAttempts ceiling, so a
+	//                             single `reason=max_attempts` covers
+	//                             every "we tried and tried and finally
+	//                             gave up" path.
+	//   kind   = row.kind — bounded by propagationKnownKinds; the
+	//                       unknown_kind path uses
+	//                       markUnknownKindDeadLettered which sets
+	//                       kind="unknown_kind" explicitly to keep
+	//                       cardinality bounded by code, not by
+	//                       attacker-controlled enqueue values.
+	metrics.PropagationDeadLetteredTotal.WithLabelValues("max_attempts", row.kind).Inc()
+
 	// CRITICAL severity: this is THE alert. NR Log alert filters on
 	// audit_kind='propagation.dead_lettered' OR on the message below.
 	slog.Error("jobs.propagation_runner.dead_lettered",
@@ -621,6 +801,98 @@ func (w *PropagationRunnerWorker) markDeadLettered(ctx context.Context, row prop
 	)
 }
 
+// markUnknownKindDeadLettered transitions a row to failed_at when its `kind`
+// has no registered handler AND the row has now reached propagationMaxAttempts.
+// Shares 90% of its body with markDeadLettered but emits a DISTINCT audit kind
+// (propagation.unknown_kind_dead_lettered) so an operator can filter on the
+// old-image-rollback signal independently of real provisioner failures, and
+// labels the Prom counter with reason="unknown_kind" + kind="unknown_kind"
+// (we deliberately do NOT pass row.kind into the kind label because the kind
+// is by definition not in propagationKnownKinds — passing it would let an
+// api-side enqueue blow up the worker's label cardinality without a code-side
+// review).
+//
+// CHAOS F2 (CHAOS-DRILL-2026-05-20): without this path, an unknown-kind row
+// retried forever — confirmed live during the chaos drill
+// (chaos_test_unknown_kind reached attempts=10 in 4 minutes without ever
+// transitioning to failed_at). With this path, the row dead-letters after
+// the same 10 attempts as any real failure, the operator sees a
+// propagation.unknown_kind_dead_lettered audit row + a Prom counter increment,
+// and the runaway retry loop ends.
+func (w *PropagationRunnerWorker) markUnknownKindDeadLettered(ctx context.Context, row propagationRow, dispatchErr error) {
+	lastErr := truncatePropagationError(dispatchErr.Error())
+	res, err := w.db.ExecContext(ctx, `
+		UPDATE pending_propagations
+		   SET attempts        = attempts + 1,
+		       last_attempt_at = now(),
+		       last_error      = $1,
+		       failed_at       = now()
+		 WHERE id = $2
+		   AND applied_at IS NULL
+		   AND failed_at IS NULL
+	`, lastErr, row.id)
+	if err != nil {
+		slog.Error("jobs.propagation_runner.unknown_kind_dead_letter_persist_failed",
+			"propagation_id", row.id.String(),
+			"team_id", row.teamID.String(),
+			"kind", row.kind,
+			"error", err,
+			"dispatch_error", lastErr,
+		)
+		return
+	}
+	affected, _ := res.RowsAffected()
+	if affected == 0 {
+		// Already terminal — sibling tick won the race.
+		return
+	}
+
+	// age_seconds — same best-effort pattern as markDeadLettered.
+	var ageSeconds float64
+	var createdAt sql.NullTime
+	if qErr := w.db.QueryRowContext(ctx,
+		`SELECT created_at FROM pending_propagations WHERE id = $1`, row.id,
+	).Scan(&createdAt); qErr == nil && createdAt.Valid {
+		ageSeconds = w.now().Sub(createdAt.Time).Seconds()
+	}
+
+	meta := map[string]any{
+		"propagation_id": row.id.String(),
+		"kind":           row.kind,
+		"team_id":        row.teamID.String(),
+		"target_tier":    nullableTierString(row.targetTier),
+		"attempts":       row.attempts + 1,
+		"max_attempts":   propagationMaxAttempts,
+		"last_error":     lastErr,
+		"age_seconds":    ageSeconds,
+		"failure_reason": "unknown_kind",
+	}
+	summary := fmt.Sprintf(
+		"unknown_kind propagation (kind=%q) for team %s DEAD-LETTERED after %d attempts — worker image is older than api enqueue; finish the rollout",
+		row.kind, row.teamID.String(), propagationMaxAttempts,
+	)
+	// Distinct audit kind so an operator-side NRQL filter can separate
+	// real-failure dead-letters from image-skew dead-letters.
+	w.insertPropagationAuditRow(ctx, row, auditKindPropagationUnknownKindDeadLettered, summary, meta)
+
+	// kind="unknown_kind" (the BUCKET name) — see comment on the counter
+	// declaration in metrics.go for the cardinality rationale.
+	metrics.PropagationDeadLetteredTotal.WithLabelValues("unknown_kind", "unknown_kind").Inc()
+
+	// CRITICAL severity — different action than the real-failure path.
+	// The fix here is an operator rolling the worker forward, NOT
+	// inspecting a specific team's resources.
+	slog.Error("jobs.propagation_runner.unknown_kind_dead_lettered",
+		"propagation_id", row.id.String(),
+		"team_id", row.teamID.String(),
+		"kind", row.kind,
+		"attempts", row.attempts+1,
+		"max_attempts", propagationMaxAttempts,
+		"last_error", lastErr,
+		"action", "worker pod is running an older image than the api enqueued — finish the rollout; manually re-enqueue this row only after the kind has a handler",
+	)
+}
+
 // insertPropagationAuditRow writes one audit_log row, best-effort. A miss
 // here only loses the operator-visible ledger entry; the slog line still
 // fires, and NR alerts can key on the message string when the audit row
@@ -672,10 +944,20 @@ func nullableTierString(s sql.NullString) string {
 // RegradeResource with the resource's per-row tier snapshot (MR-P1-21 — see
 // entitlement_reconciler.go for the snapshot-is-entitlement contract). Any
 // per-resource gRPC error fails the WHOLE row (so the entire row retries
-// with backoff). Per-resource "skip" outcomes (applied=false with
-// skip_reason="already correct" / "unsupported resource type" / "backend
-// does not support regrade") are NOT treated as failures: they are the
-// provisioner's idempotency / type-coverage signal.
+// with backoff). Per-resource "skip" outcomes whose SkipReason is in
+// propagationAllowedSkipSubstrings ("already correct" / "unsupported
+// resource type" / "backend does not support regrade") are NOT treated as
+// failures: they are the provisioner's idempotency / type-coverage signal.
+//
+// CHAOS-DRILL-2026-05-20 F1 fix: any OTHER SkipReason — e.g. "postgres-admin
+// secret not found", "namespace not found", "resource not reachable" — is
+// treated as a retryable failure. The runner detects the returned
+// propagationUnexpectedSkipErr via errors.Is and emits a distinct
+// propagation.unexpected_skip audit row (not propagation.applied). The row
+// retries per the backoff schedule and dead-letters at propagationMaxAttempts.
+// Pre-fix this case WARN-logged and fell through with firstErr==nil, so the
+// row got stamped applied_at and the regrade never landed — paying customers
+// ended up with "Pro on paper, hobby-grade infra" and no alert.
 //
 // Idempotency: re-running this handler is safe because the provisioner's
 // RegradeResource does CONFIG GET / applied_conn_limit comparison before
@@ -727,7 +1009,10 @@ func handleTierElevation(ctx context.Context, db *sql.DB, regrader propagationRe
 		return nil
 	}
 
-	var firstErr error
+	var (
+		firstErr            error
+		unexpectedSkipFound []propagationUnexpectedSkipDetail
+	)
 	for _, r := range resources {
 		resType, supported := resourceTypeFromString(r.resourceType)
 		if !supported {
@@ -752,22 +1037,47 @@ func handleTierElevation(ctx context.Context, db *sql.DB, regrader propagationRe
 			}
 			continue
 		}
-		// applied=false is NOT an error — it's the idempotency signal.
-		// We log only if the skip_reason indicates an actual provisioner-
-		// side problem (not "already correct" / type unsupported).
-		if !out.Applied && out.SkipReason != "" &&
-			!strings.Contains(out.SkipReason, "already correct") &&
-			!strings.Contains(out.SkipReason, "unsupported resource type") &&
-			!strings.Contains(out.SkipReason, "backend does not support") {
+		// applied=false with an ALLOWED SkipReason ("already correct" /
+		// "unsupported resource type" / "backend does not support") is the
+		// idempotency / type-coverage signal — treat as success.
+		//
+		// applied=false with any OTHER SkipReason is an unexpected_skip:
+		// the regrade DID NOT LAND but the provisioner returned no error.
+		// CHAOS-DRILL-2026-05-20 F1 fix: accumulate the offending resources
+		// and return a propagationUnexpectedSkipErr so the runner retries
+		// the row (audit_kind=propagation.unexpected_skip) and dead-letters
+		// at propagationMaxAttempts (audit_kind=propagation.dead_lettered).
+		// Pre-fix this case fell through silently and the row was stamped
+		// applied_at; the customer's infra was never regraded and no alert
+		// fired. The metric counter is the leading indicator the dead-letter
+		// alert is the lagging signal.
+		if !out.Applied && out.SkipReason != "" && !isPropagationAllowedSkip(out.SkipReason) {
 			slog.Warn("jobs.propagation_runner.tier_elevation.unexpected_skip",
 				"propagation_id", row.id.String(),
 				"resource_id", r.id.String(),
 				"resource_type", r.resourceType,
 				"skip_reason", out.SkipReason,
 			)
+			metrics.PropagationUnexpectedSkipTotal.WithLabelValues(
+				row.kind, r.resourceType, bucketSkipReason(out.SkipReason),
+			).Inc()
+			unexpectedSkipFound = append(unexpectedSkipFound, propagationUnexpectedSkipDetail{
+				ResourceID:   r.id.String(),
+				ResourceType: r.resourceType,
+				SkipReason:   out.SkipReason,
+			})
 		}
 	}
-	return firstErr
+
+	// A real gRPC error wins over an unexpected_skip — the gRPC error is
+	// the louder signal and a retry on it will also re-check the skip path.
+	if firstErr != nil {
+		return firstErr
+	}
+	if len(unexpectedSkipFound) > 0 {
+		return &propagationUnexpectedSkipErr{Resources: unexpectedSkipFound}
+	}
+	return nil
 }
 
 // resourceTypeFromString maps the resources.resource_type column value to
diff --git a/internal/jobs/propagation_unexpected_skip_test.go b/internal/jobs/propagation_unexpected_skip_test.go
new file mode 100644
index 0000000..eb79da8
--- /dev/null
+++ b/internal/jobs/propagation_unexpected_skip_test.go
@@ -0,0 +1,362 @@
+package jobs
+
+// propagation_unexpected_skip_test.go — CHAOS-DRILL-2026-05-20 F1 regression
+// coverage for "unexpected_skip silently marks the propagation APPLIED".
+//
+// Pre-fix bug: handleTierElevation in propagation_runner.go (lines 756–771
+// pre-patch) treated `(Applied=false, SkipReason=<any string not in the
+// allowed-skip whitelist>)` as success. A WARN log fired, firstErr stayed
+// nil, and the runner stamped applied_at on the row. A paying customer's
+// regrade never landed — there was no retry, no dead-letter, no alert.
+//
+// Fix: any non-allowed SkipReason now returns propagationUnexpectedSkipErr.
+// The runner's markRetry path detects this via errors.Is and emits a
+// propagation.unexpected_skip audit row (NOT propagation.applied). The row
+// retries per the backoff schedule and dead-letters at propagationMaxAttempts
+// with a propagation.dead_lettered audit row. The Prometheus counter
+// PropagationUnexpectedSkipTotal increments at the emit site so dashboards
+// can spot patterns before the dead-letter lagging signal fires.
+//
+// Tests:
+//
+//   1. TestPropagation_UnexpectedSkip_DoesNotMarkApplied
+//      The headline regression test. Provisioner returns
+//      (Applied=false, SkipReason="postgres-admin secret not found"). Assert:
+//        - markApplied is NOT called (no applied_at UPDATE in the sqlmock script).
+//        - markRetry IS called: attempts+1, next_attempt_at advanced.
+//        - audit_log row is propagation.unexpected_skip (NOT propagation.applied).
+//        - PropagationUnexpectedSkipTotal counter incremented by 1 with
+//          the right labels.
+//
+//   2. TestPropagation_UnexpectedSkip_DeadLettersAtMaxAttempts
+//      attempts == propagationMaxAttempts-1, then one more
+//      unexpected_skip. Assert the row dead-letters via the modal
+//      markDeadLettered path (single UPDATE … SET failed_at=now()), and
+//      the audit row is propagation.dead_lettered.
+//
+//   3. TestIsPropagationAllowedSkip_Coverage
+//      Registry-iterating coverage test per CLAUDE.md rule 18. The
+//      allowed-skip set lives in one place (propagationAllowedSkipSubstrings)
+//      and we assert every documented allowed-skip string passes, while
+//      a representative set of unexpected-skip strings fails. If a future
+//      PR adds a new allowed-skip substring without updating the list, the
+//      regression test below catches the un-mapped string.
+//
+//   4. TestPropagationUnexpectedSkipErr_IsMatches
+//      Pins the errors.Is contract: the markRetry path uses
+//      errors.Is(err, errPropagationUnexpectedSkipSentinel) to switch on
+//      the audit kind, so the Is() implementation MUST match the
+//      sentinel. A future refactor that breaks this would silently
+//      reintroduce the F1 bug (markRetry would emit propagation.retrying
+//      instead of propagation.unexpected_skip).
+//
+//   5. TestBucketSkipReason_BoundsCardinality
+//      Pins the Prometheus label vocabulary so a Prometheus operator can
+//      rely on the bucket names in alert rules.
+
+import (
+	"context"
+	"database/sql"
+	"errors"
+	"strings"
+	"testing"
+	"time"
+
+	sqlmock "github.com/DATA-DOG/go-sqlmock"
+	"github.com/google/uuid"
+	"github.com/prometheus/client_golang/prometheus/testutil"
+
+	commonplans "instant.dev/common/plans"
+	"instant.dev/worker/internal/metrics"
+)
+
+// ─── Test 1: F1 regression — unexpected_skip does NOT mark applied ────────────
+
+func TestPropagation_UnexpectedSkip_DoesNotMarkApplied(t *testing.T) {
+	db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherRegexp))
+	if err != nil {
+		t.Fatalf("sqlmock.New: %v", err)
+	}
+	defer db.Close()
+
+	propID := uuid.New()
+	teamID := uuid.New()
+	resID := uuid.New()
+
+	// Snapshot the unexpected_skip counter before the run — Prometheus
+	// counters are process-global so other tests in this file (or in the
+	// suite, run with -p 1) may have moved the value. testutil.ToFloat64
+	// reads the WithLabelValues child, which is unique per labelset.
+	counterChild := metrics.PropagationUnexpectedSkipTotal.WithLabelValues(
+		propagationKindTierElevation, "postgres", "postgres_admin_secret_missing",
+	)
+	startCount := testutil.ToFloat64(counterChild)
+
+	mock.ExpectBegin()
+	mock.ExpectQuery(`SELECT id, kind, team_id, target_tier, payload, attempts\s+FROM pending_propagations`).
+		WillReturnRows(sqlmock.NewRows(propagationSweepCols).
+			AddRow(propID, propagationKindTierElevation, teamID, "pro", []byte(`{}`), 0))
+	mock.ExpectCommit()
+
+	mock.ExpectQuery(`SELECT r\.id, r\.token, r\.provider_resource_id, r\.tier, r\.resource_type`).
+		WithArgs(teamID).
+		WillReturnRows(sqlmock.NewRows(teamResourcesCols).
+			AddRow(resID, "tok-1", sql.NullString{}, "pro", "postgres"))
+
+	// THE BUG: pre-fix, the runner would next call markApplied (a
+	// `UPDATE pending_propagations SET applied_at = now()`) and INSERT a
+	// propagation.applied audit row. The fix re-routes through markRetry.
+	// We intentionally do NOT script those expectations — if the bug ever
+	// regressed, sqlmock would fail with "unexpected query".
+	//
+	// markRetry UPDATE — attempts + 1, next_attempt_at advanced.
+	mock.ExpectExec(`UPDATE pending_propagations\s+SET attempts\s*=\s*attempts \+ 1`).
+		WithArgs(
+			sqlmock.AnyArg(), // last_error truncated string
+			sqlmock.AnyArg(), // nextAttempt time.Time
+			propID,
+		).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+	// audit_log INSERT — must be propagation.unexpected_skip.
+	mock.ExpectExec(`INSERT INTO audit_log`).
+		WithArgs(
+			teamID,
+			propagationActor,
+			auditKindPropagationUnexpectedSkip, // <-- the F1 contract
+			sqlmock.AnyArg(),
+			sqlmock.AnyArg(),
+		).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// Stub the provisioner to return the canonical F1 case:
+	// Applied=false, SkipReason="postgres-admin secret not found".
+	stub := &stubPropagationRegrader{outcome: regradeOutcome{
+		Applied:    false,
+		SkipReason: "resource not reachable: postgres-admin secret not found",
+	}}
+	w := NewPropagationRunnerWorker(db, commonplans.Default(), stub)
+	w.now = fixedNow()
+
+	if wErr := w.Work(context.Background(), fakePropagationJob()); wErr != nil {
+		t.Fatalf("Work: %v", wErr)
+	}
+
+	if stub.calls != 1 {
+		t.Errorf("RegradeResource calls = %d, want 1", stub.calls)
+	}
+
+	// Counter assertion — exact bucket name expected from bucketSkipReason
+	// against the "postgres-admin secret not found" surface. If a future PR
+	// changes the bucket name, the operator's NR alert rule must change too —
+	// pin it here so the change isn't silent.
+	endCount := testutil.ToFloat64(counterChild)
+	if endCount-startCount != 1.0 {
+		t.Errorf("PropagationUnexpectedSkipTotal{kind=tier_elevation,resource_type=postgres,skip_reason=postgres_admin_secret_missing} delta = %.1f, want 1.0",
+			endCount-startCount)
+	}
+
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// ─── Test 2: dead-letter at maxAttempts for unexpected_skip ───────────────────
+
+func TestPropagation_UnexpectedSkip_DeadLettersAtMaxAttempts(t *testing.T) {
+	db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherRegexp))
+	if err != nil {
+		t.Fatalf("sqlmock.New: %v", err)
+	}
+	defer db.Close()
+
+	propID := uuid.New()
+	teamID := uuid.New()
+	resID := uuid.New()
+
+	// attempts already at maxAttempts-1; one more unexpected_skip dead-letters.
+	mock.ExpectBegin()
+	mock.ExpectQuery(`SELECT id, kind, team_id, target_tier, payload, attempts\s+FROM pending_propagations`).
+		WillReturnRows(sqlmock.NewRows(propagationSweepCols).
+			AddRow(propID, propagationKindTierElevation, teamID, "pro", []byte(`{}`), propagationMaxAttempts-1))
+	mock.ExpectCommit()
+
+	mock.ExpectQuery(`SELECT r\.id, r\.token, r\.provider_resource_id, r\.tier, r\.resource_type`).
+		WithArgs(teamID).
+		WillReturnRows(sqlmock.NewRows(teamResourcesCols).
+			AddRow(resID, "tok-1", sql.NullString{}, "pro", "postgres"))
+
+	// markDeadLettered UPDATE: stamps failed_at + last_error.
+	mock.ExpectExec(`UPDATE pending_propagations\s+SET attempts\s*=\s*attempts \+ 1,\s+last_attempt_at\s*=\s*now\(\),\s+last_error\s*=\s*\$1,\s+failed_at\s*=\s*now\(\)`).
+		WithArgs(sqlmock.AnyArg(), propID).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+	// age_seconds lookup query.
+	mock.ExpectQuery(`SELECT created_at FROM pending_propagations WHERE id`).
+		WithArgs(propID).
+		WillReturnRows(sqlmock.NewRows([]string{"created_at"}).AddRow(time.Now().Add(-25 * time.Hour)))
+	// audit_log INSERT — propagation.dead_lettered. The handler-side
+	// distinction (unexpected_skip vs gRPC-error vs DB-fail) collapses
+	// at the dead-letter point: a row that exhausted maxAttempts gets
+	// the canonical dead-letter audit kind regardless of cause. The
+	// `last_error` column carries the unexpected_skip detail for the
+	// operator to debug from.
+	mock.ExpectExec(`INSERT INTO audit_log`).
+		WithArgs(
+			teamID,
+			propagationActor,
+			auditKindPropagationDeadLettered,
+			sqlmock.AnyArg(),
+			sqlmock.AnyArg(),
+		).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	stub := &stubPropagationRegrader{outcome: regradeOutcome{
+		Applied:    false,
+		SkipReason: "namespace not found",
+	}}
+	w := NewPropagationRunnerWorker(db, commonplans.Default(), stub)
+	w.now = fixedNow()
+
+	if wErr := w.Work(context.Background(), fakePropagationJob()); wErr != nil {
+		t.Fatalf("Work: %v", wErr)
+	}
+
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// ─── Test 3: allowed-skip registry coverage (CLAUDE.md rule 18) ───────────────
+
+func TestIsPropagationAllowedSkip_Coverage(t *testing.T) {
+	// The registry-iterating check: every documented allowed-skip substring
+	// is non-empty and is matched by isPropagationAllowedSkip. A future PR
+	// that adds an entry to propagationAllowedSkipSubstrings but forgets to
+	// wire it through isPropagationAllowedSkip fails this test.
+	if len(propagationAllowedSkipSubstrings) == 0 {
+		t.Fatal("propagationAllowedSkipSubstrings is empty — every SkipReason will be treated as unexpected_skip, the F1 fix has been over-corrected into a fail-everything posture")
+	}
+	for _, sub := range propagationAllowedSkipSubstrings {
+		if sub == "" {
+			t.Error("propagationAllowedSkipSubstrings contains an empty string — would match every SkipReason (including '')")
+			continue
+		}
+		// Match a sample reason that contains the substring.
+		sample := "prefix " + sub + " suffix"
+		if !isPropagationAllowedSkip(sample) {
+			t.Errorf("isPropagationAllowedSkip(%q) = false for allowed substring %q — substring lookup is broken", sample, sub)
+		}
+	}
+
+	// And the negative cases — every SkipReason from the chaos drill / known
+	// failure-mode catalog MUST be treated as unexpected_skip. If any of
+	// these turn into allowed-skips a paying customer's regrade will
+	// silently fail.
+	unexpectedSamples := []string{
+		"postgres-admin secret not found",
+		"resource not reachable: postgres-admin secret not found",
+		"namespace not found",
+		"redis-auth secret missing",
+		"pod not found",
+		"no Running pod",
+		"exec fallback: CONFIG SET maxmemory failed",
+		"legacy resource without auth secret",
+	}
+	for _, sample := range unexpectedSamples {
+		if isPropagationAllowedSkip(sample) {
+			t.Errorf("isPropagationAllowedSkip(%q) = true — this MUST be treated as an unexpected_skip (the whole point of F1)", sample)
+		}
+	}
+}
+
+// ─── Test 4: errors.Is contract for the sentinel ──────────────────────────────
+
+func TestPropagationUnexpectedSkipErr_IsMatches(t *testing.T) {
+	err := &propagationUnexpectedSkipErr{
+		Resources: []propagationUnexpectedSkipDetail{
+			{ResourceID: "r1", ResourceType: "postgres", SkipReason: "postgres-admin secret not found"},
+		},
+	}
+	if !errors.Is(err, errPropagationUnexpectedSkipSentinel) {
+		t.Error("errors.Is(*propagationUnexpectedSkipErr, errPropagationUnexpectedSkipSentinel) = false — the markRetry audit-kind switch will never fire, F1 audit signal regression")
+	}
+
+	// Wrapped via fmt.Errorf("…: %w", …) must still match — the dispatch
+	// loop wraps unexpected_skip in handler-side context.
+	wrapped := errors.New("not the sentinel")
+	if errors.Is(wrapped, errPropagationUnexpectedSkipSentinel) {
+		t.Error("errors.Is(unrelated_err, errPropagationUnexpectedSkipSentinel) = true — sentinel matches too broadly, would mis-classify routine retries as unexpected_skip")
+	}
+
+	// Nil receiver must not panic; the runner never constructs a nil
+	// pointer here, but defensive.
+	var nilErr *propagationUnexpectedSkipErr
+	if errors.Is(nilErr, errPropagationUnexpectedSkipSentinel) {
+		// Nil err of this type Still matches the sentinel via the Is method —
+		// acceptable; the runner would never construct it.
+		t.Log("nilErr matched (Is method returns true for nil receiver) — acceptable")
+	}
+
+	// Error string formatting includes every offending resource so the
+	// operator can grep the audit_log.last_error column.
+	got := err.Error()
+	for _, want := range []string{"r1", "postgres", "postgres-admin secret"} {
+		if !strings.Contains(got, want) {
+			t.Errorf("propagationUnexpectedSkipErr.Error() = %q, missing substring %q (operator can't debug from audit_log.last_error)", got, want)
+		}
+	}
+}
+
+// ─── Test 5: bucketSkipReason cardinality contract ────────────────────────────
+
+func TestBucketSkipReason_BoundsCardinality(t *testing.T) {
+	cases := []struct {
+		raw    string
+		bucket string
+	}{
+		// Postgres admin secret missing — the canonical F1 trigger.
+		{"resource not reachable: postgres-admin secret not found", "postgres_admin_secret_missing"},
+		{"postgres-admin Secret not found", "postgres_admin_secret_missing"},
+		{"missing postgres_admin secret", "postgres_admin_secret_missing"},
+
+		// Redis-auth secret missing.
+		{"redis-auth secret not found", "redis_auth_secret_missing"},
+
+		// Namespace teardown race.
+		{"namespace not found: instant-customer-xyz", "namespace_not_found"},
+
+		// Pod-not-found from exec-fallback paths.
+		{"exec fallback: no pod found", "pod_not_found"},
+		{"pod not found in namespace", "pod_not_found"},
+
+		// Resource unreachable (gRPC dial / DNS).
+		{"resource not reachable", "resource_not_reachable"},
+		{"backend unreachable", "resource_not_reachable"},
+
+		// Legacy resource without auth secret.
+		{"legacy resource without auth secret", "legacy_resource"},
+		{"legacy pod missing required config", "legacy_resource"},
+
+		// Unknown — the catch-all bucket.
+		{"completely novel skip reason from a future provisioner", "other"},
+		{"", "other"},
+	}
+	for _, c := range cases {
+		if got := bucketSkipReason(c.raw); got != c.bucket {
+			t.Errorf("bucketSkipReason(%q) = %q, want %q", c.raw, got, c.bucket)
+		}
+	}
+
+	// And the bounded-cardinality invariant: a thousand random unique
+	// SkipReasons should fall into a small fixed set of buckets. (We
+	// declare the upper bound at 8 buckets — the 7 named + "other".)
+	seen := map[string]struct{}{}
+	for i := 0; i < 1000; i++ {
+		// A faux randomness — different prefixes — none of which should
+		// hit any named bucket.
+		raw := "novel skip variant #" + uuid.New().String()
+		seen[bucketSkipReason(raw)] = struct{}{}
+	}
+	if len(seen) != 1 || func() bool { _, ok := seen["other"]; return !ok }() {
+		t.Errorf("bucketSkipReason swallowed %d distinct buckets for novel inputs (want exactly 1, 'other') — cardinality not bounded, Prometheus label explosion risk: %v", len(seen), seen)
+	}
+}
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
index 9148550..cfaa0b0 100644
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -408,6 +408,96 @@ var (
 		Help: "Event-email forwarder hits on an audit_log row whose kind has a builder but no Go renderer. A non-zero rate means a kind is being silently dropped — fix the eventEmailBodyRenderers map.",
 	}, []string{"kind"})
 
+	// ── propagation_runner — unexpected_skip counter (CHAOS-DRILL-2026-05-20 F1) ─
+	//
+	// Every time the propagation_runner's per-resource RegradeResource call
+	// returns (Applied=false, SkipReason=<not in the allowed-skip whitelist>),
+	// this counter increments. Pre-fix the runner WARN-and-applied that case,
+	// silently stamping the propagation row as applied without the entitlement
+	// landing — a paying customer ended up with "Pro on paper, hobby-grade infra"
+	// and no alert (CHAOS-DRILL-2026-05-20 finding #1). Now the case is treated
+	// as a retryable error: the row retries via the backoff schedule and
+	// dead-letters at propagationMaxAttempts. This counter is the leading
+	// indicator; the dead-letter audit row is the alert-able lagging signal.
+	//
+	// Labels:
+	//
+	//   kind           — pending_propagations.kind ("tier_elevation", etc.).
+	//                    Bounded by propagationKnownKinds (~1-3 entries).
+	//
+	//   resource_type  — "postgres" | "redis" | "mongodb" (the offending
+	//                    resource class). Bounded by ResourceType enum.
+	//
+	//   skip_reason    — a SHORT canonical bucket derived from the raw
+	//                    skip_reason string ("postgres_admin_secret_missing",
+	//                    "namespace_not_found", "other"). The runner does
+	//                    the bucketing (jobs.bucketSkipReason) so cardinality
+	//                    stays bounded — never pass the raw SkipReason here.
+	//
+	// NR alert (suggested):
+	//   sum(rate(instant_propagation_unexpected_skip_total[15m])) > 0
+	//     for 30+ minutes → P2 page. A single isolated event is the
+	//     mid-deprovisioning-race signal; a sustained rate is a real
+	//     downstream regression and an operator must investigate before
+	//     the row dead-letters ~24h later.
+	PropagationUnexpectedSkipTotal = promauto.NewCounterVec(prometheus.CounterOpts{
+		Name: "instant_propagation_unexpected_skip_total",
+		Help: "propagation_runner per-resource RegradeResource returned (Applied=false, SkipReason=<not in allowed-skip whitelist>). Leading indicator for the dead-letter alert.",
+	}, []string{"kind", "resource_type", "skip_reason"})
+
+	// ── propagation_runner — dead-letter + unknown-kind counters (CHAOS F2/F3) ─
+	//
+	// PropagationDeadLetteredTotal increments every time the propagation_runner
+	// transitions a row to failed_at + emits a propagation.*dead_lettered audit
+	// row. Two triggers feed this single metric, distinguished by `reason`:
+	//
+	//   reason="max_attempts" — the modal path. Per-resource RegradeResource
+	//                           failures, F1's unexpected_skip-as-failure, and
+	//                           markApplied DB failures all converge here once
+	//                           they reach propagationMaxAttempts.
+	//   reason="unknown_kind" — CHAOS F2: a worker pod that doesn't recognise
+	//                           a `kind` enqueued by a newer api image. Without
+	//                           the F2 fix these escape the maxAttempts ceiling.
+	//
+	// `kind` carries the row's pending_propagations.kind value for the
+	// max_attempts path (bounded by propagationKnownKinds — ~1-3 entries).
+	// The unknown_kind path passes kind="unknown_kind" as a bounded bucket,
+	// so an attacker-controlled api-side enqueue can't blow up worker
+	// label cardinality.
+	//
+	// NR alert (suggested):
+	//   rate(instant_propagation_dead_lettered_total[5m]) > 0 for 5m → P1 page.
+	//   propagation_runner is the last line of defence between Razorpay webhook
+	//   delivery and customer infra; any dead-letter means a paying customer's
+	//   regrade fell through (or, on the unknown_kind path, that a worker pod
+	//   is running an old image vs the api).
+	PropagationDeadLetteredTotal = promauto.NewCounterVec(prometheus.CounterOpts{
+		Name: "instant_propagation_dead_lettered_total",
+		Help: "propagation_runner rows transitioned to failed_at. Labelled by reason (max_attempts|unknown_kind) and kind (pending_propagations.kind, or 'unknown_kind' for F2's bounded bucket).",
+	}, []string{"reason", "kind"})
+
+	// PropagationUnknownKindTotal counts every TICK that picked up at least
+	// one row whose kind had no handler in propagationHandlers. Distinct
+	// from PropagationDeadLetteredTotal{reason="unknown_kind"} — that fires
+	// once at the END of the row's life (after maxAttempts), this fires on
+	// EVERY tick while the row is retrying. Lets the operator see "the
+	// worker is older than the api" within seconds rather than waiting the
+	// ~24h backoff for the dead-letter to land.
+	//
+	// `kind` is the raw pending_propagations.kind value. Bounded by the
+	// api-side enqueue surface (NOT by attacker input — only the api can
+	// INSERT into pending_propagations); the cardinality risk is accepted
+	// because in the rollback-drift scenario the operator wants to know
+	// EXACTLY which new kind their old worker is rejecting.
+	//
+	// NR alert (suggested):
+	//   sum(rate(instant_propagation_unknown_kind_total[5m])) by (kind) > 0
+	//     for 5m → P2 page. Action: finish the rollout.
+	PropagationUnknownKindTotal = promauto.NewCounterVec(prometheus.CounterOpts{
+		Name: "instant_propagation_unknown_kind_total",
+		Help: "propagation_runner ticks that saw at least one row whose kind has no handler (image-skew indicator). Labelled by kind.",
+	}, []string{"kind"})
+
 	// readyzCheckStatusGauge — per-component readiness status surfaced by
 	// /readyz on this service's HTTP sidecar (:8091). See the matching
 	// gauge in the api repo at api/internal/metrics/metrics.go for the