diff --git a/internal/jobs/propagation_runner.go b/internal/jobs/propagation_runner.go index a953118..4e7a447 100644 --- a/internal/jobs/propagation_runner.go +++ b/internal/jobs/propagation_runner.go @@ -59,6 +59,7 @@ import ( "context" "database/sql" "encoding/json" + "errors" "fmt" "log/slog" "os" @@ -70,6 +71,7 @@ import ( "go.opentelemetry.io/otel" commonv1 "instant.dev/proto/common/v1" + "instant.dev/worker/internal/metrics" ) // ─── named constants ────────────────────────────────────────────────────────── @@ -93,9 +95,15 @@ const ( // Mirrors api/internal/models/audit_kinds.go's AuditKindPropagation* // constants. Same drift contract as propagationKindTierElevation above. const ( - auditKindPropagationApplied = "propagation.applied" - auditKindPropagationRetrying = "propagation.retrying" - auditKindPropagationDeadLettered = "propagation.dead_lettered" + auditKindPropagationApplied = "propagation.applied" + auditKindPropagationRetrying = "propagation.retrying" + auditKindPropagationDeadLettered = "propagation.dead_lettered" + auditKindPropagationUnexpectedSkip = "propagation.unexpected_skip" + // CHAOS F2 (CHAOS-DRILL-2026-05-20): distinct audit kind for the + // old-worker/new-api rollout-skew dead-letter path. Lets the operator + // filter the image-skew signal independently of real-failure dead-letters + // via `audit_log.kind`. Mirrors api/internal/models/audit_kinds.go. + auditKindPropagationUnknownKindDeadLettered = "propagation.unknown_kind_dead_lettered" ) // propagationActor is the audit_log.actor value the runner writes. Distinct @@ -152,6 +160,105 @@ var propagationBackoffSchedule = []time.Duration{ // gRPC error string. const propagationLastErrorMax = 1000 +// propagationAllowedSkipSubstrings enumerates the SkipReason fragments that +// the provisioner uses to signal a TRUE no-op (idempotent re-apply / a +// resource type with no regrade arm). Anything else is an unexpected_skip +// and triggers retry/dead-letter per F1 (CHAOS-DRILL-2026-05-20). +// +// Held centrally (not inline) so the test surface and the bucketSkipReason +// helper can share the list. Adding an entry here changes the silent-skip +// surface; do it only when the provisioner adds a new "this is fine" +// outcome that genuinely doesn't need a retry. +var propagationAllowedSkipSubstrings = []string{ + "already correct", + "unsupported resource type", + "backend does not support", +} + +// isPropagationAllowedSkip returns true when the provisioner's SkipReason +// is one of the allowed no-op signals. +func isPropagationAllowedSkip(reason string) bool { + for _, sub := range propagationAllowedSkipSubstrings { + if strings.Contains(reason, sub) { + return true + } + } + return false +} + +// propagationUnexpectedSkipDetail captures one offending resource for an +// unexpected_skip retry. Multiple resources can fail in one dispatch; the +// operator wants to see every one in last_error so they can fix the +// underlying issue in batch. +type propagationUnexpectedSkipDetail struct { + ResourceID string + ResourceType string + SkipReason string +} + +// propagationUnexpectedSkipErr is the error returned by handleTierElevation +// when at least one resource's RegradeResource returned (Applied=false, +// SkipReason=). The runner's +// markRetry / markDeadLettered path detects this via errors.Is and emits +// propagation.unexpected_skip audit kind (not propagation.applied) — the +// operator-page signal for F1. +// +// CHAOS-DRILL-2026-05-20 F1: pre-fix the handler WARN-and-applied this +// case (firstErr stayed nil); the row got stamped applied_at and the +// regrade never landed. A paying customer's infra cap silently stayed at +// the old tier and no alert fired. This error type forces the case to +// retry → dead-letter and surfaces it loudly via a distinct audit kind + +// Prometheus counter. +type propagationUnexpectedSkipErr struct { + Resources []propagationUnexpectedSkipDetail +} + +func (e *propagationUnexpectedSkipErr) Error() string { + if e == nil || len(e.Resources) == 0 { + return "unexpected_skip: " + } + parts := make([]string, 0, len(e.Resources)) + for _, r := range e.Resources { + parts = append(parts, fmt.Sprintf("%s (%s): %s", r.ResourceID, r.ResourceType, r.SkipReason)) + } + return "unexpected_skip from provisioner: " + strings.Join(parts, "; ") +} + +// errPropagationUnexpectedSkipSentinel is the comparison target for +// errors.Is. The Is method on propagationUnexpectedSkipErr matches it so +// callers can switch on the class without depending on the concrete type. +var errPropagationUnexpectedSkipSentinel = errors.New("propagation: unexpected_skip from provisioner") + +// Is satisfies errors.Is for the sentinel — any propagationUnexpectedSkipErr +// matches errPropagationUnexpectedSkipSentinel. +func (e *propagationUnexpectedSkipErr) Is(target error) bool { + return target == errPropagationUnexpectedSkipSentinel +} + +// bucketSkipReason maps the raw provisioner SkipReason to a short canonical +// bucket suitable for a Prometheus label. Keeps cardinality bounded — never +// pass the raw SkipReason as a metric label. Adding a new bucket here is +// safe; the default "other" catches anything unrecognised. +func bucketSkipReason(reason string) string { + r := strings.ToLower(reason) + switch { + case strings.Contains(r, "postgres-admin secret") || strings.Contains(r, "postgres_admin"): + return "postgres_admin_secret_missing" + case strings.Contains(r, "redis-auth secret"): + return "redis_auth_secret_missing" + case strings.Contains(r, "namespace not found"): + return "namespace_not_found" + case strings.Contains(r, "not reachable") || strings.Contains(r, "unreachable"): + return "resource_not_reachable" + case strings.Contains(r, "pod not found") || strings.Contains(r, "no pod"): + return "pod_not_found" + case strings.Contains(r, "legacy resource") || strings.Contains(r, "legacy pod"): + return "legacy_resource" + default: + return "other" + } +} + // ─── job definition + handler registry ──────────────────────────────────────── // PropagationRunnerArgs is the River job payload. No fields — sweep job. @@ -282,8 +389,31 @@ func (w *PropagationRunnerWorker) Work(ctx context.Context, job *river.Job[Propa // image could see a row from a newer api enqueue. Treat as a // retryable failure so the row is NOT dead-lettered before // the worker rolls forward. + // + // CHAOS F2 (CHAOS-DRILL-2026-05-20): apply the SAME maxAttempts + // ceiling here that real-failure rows get below. Without this, + // an old worker pod that doesn't recognise a freshly-enqueued + // kind retries forever — the chaos drill on 2026-05-20 saw a + // chaos_test_unknown_kind row reach attempts=10 in 4 minutes + // without ever transitioning to failed_at. Bounding the + // unknown-kind path on the same dead-letter horizon as a real + // failure (a) caps the eventual blast radius of an old image + // surviving the rollout window, (b) gives the operator a clean + // alert-able signal via instant_propagation_dead_lettered_total{reason="unknown_kind"} + // rather than an unbounded retry storm. The per-tick image-skew + // indicator stays in PropagationUnknownKindTotal — that fires on + // EVERY tick while the row is alive, so the operator can react + // in seconds instead of waiting the ~24h backoff for a + // dead-letter to fire. unknownKind++ - w.markRetry(ctx, row, fmt.Errorf("no handler registered for kind %q", row.kind)) + metrics.PropagationUnknownKindTotal.WithLabelValues(row.kind).Inc() + dispatchErr := fmt.Errorf("no handler registered for kind %q", row.kind) + if row.attempts+1 >= propagationMaxAttempts { + deadLettered++ + w.markUnknownKindDeadLettered(ctx, row, dispatchErr) + continue + } + w.markRetry(ctx, row, dispatchErr) continue } @@ -424,10 +554,18 @@ func propagationBackoffFor(attempts int) time.Duration { } // markRetry persists attempts+1, schedules next_attempt_at via -// propagationBackoffFor, persists last_error, and emits a DEBUG-level -// propagation.retrying audit row. Best-effort: a DB failure here only -// means the next tick may re-process the same row (idempotent handler -// makes that safe) and the audit row is missing. +// propagationBackoffFor, persists last_error, and emits an audit row. +// +// Audit kind selection (CHAOS-DRILL-2026-05-20 F1): +// - errors.Is(dispatchErr, errPropagationUnexpectedSkipSentinel) → +// propagation.unexpected_skip audit row + WARN slog (the F1 loud +// signal: provisioner returned applied=false with a skip reason +// outside the allowed set — regrade did NOT land). +// - else → propagation.retrying audit row + DEBUG slog (routine retry). +// +// Best-effort: a DB failure here only means the next tick may re-process +// the same row (idempotent handler makes that safe) and the audit row is +// missing. func (w *PropagationRunnerWorker) markRetry(ctx context.Context, row propagationRow, dispatchErr error) { delay := propagationBackoffFor(row.attempts) nextAttempt := w.now().Add(delay) @@ -463,23 +601,47 @@ func (w *PropagationRunnerWorker) markRetry(ctx context.Context, row propagation return } - // DEBUG audit row — retries are routine during a Razorpay/provisioner - // outage; logging at WARN would spam. Operators inspect the row's - // `attempts` + `last_error` columns directly for diagnosis. - w.insertPropagationAuditRow(ctx, row, auditKindPropagationRetrying, fmt.Sprintf( - "%s propagation for team %s retrying (attempt %d/%d)", - row.kind, row.teamID.String(), row.attempts+1, propagationMaxAttempts, + // Audit kind: unexpected_skip is the loud F1 signal; retrying is the + // routine path. Both share the same DB UPDATE above — only the audit + // row + log line differ, so operators can grep/alert on the F1 class + // without false-positiving on a transient Razorpay outage. + isUnexpectedSkip := errors.Is(dispatchErr, errPropagationUnexpectedSkipSentinel) + auditKind := auditKindPropagationRetrying + if isUnexpectedSkip { + auditKind = auditKindPropagationUnexpectedSkip + } + + w.insertPropagationAuditRow(ctx, row, auditKind, fmt.Sprintf( + "%s propagation for team %s retrying (attempt %d/%d, kind=%s)", + row.kind, row.teamID.String(), row.attempts+1, propagationMaxAttempts, auditKind, ), map[string]any{ - "propagation_id": row.id.String(), - "kind": row.kind, - "team_id": row.teamID.String(), - "target_tier": nullableTierString(row.targetTier), - "attempts": row.attempts + 1, - "max_attempts": propagationMaxAttempts, - "next_attempt_at": nextAttempt.UTC().Format(time.RFC3339), - "last_error": lastErr, + "propagation_id": row.id.String(), + "kind": row.kind, + "team_id": row.teamID.String(), + "target_tier": nullableTierString(row.targetTier), + "attempts": row.attempts + 1, + "max_attempts": propagationMaxAttempts, + "next_attempt_at": nextAttempt.UTC().Format(time.RFC3339), + "last_error": lastErr, }) + if isUnexpectedSkip { + // WARN-level: unexpected_skip is the F1 signal — louder than a + // routine retry (DEBUG), quieter than the eventual dead-letter + // (ERROR). Pre-fix this case was silently APPLIED, so even WARN + // is a step up. + slog.Warn("jobs.propagation_runner.unexpected_skip_retrying", + "propagation_id", row.id.String(), + "team_id", row.teamID.String(), + "kind", row.kind, + "attempts", row.attempts+1, + "max_attempts", propagationMaxAttempts, + "next_attempt_at", nextAttempt, + "last_error", lastErr, + ) + return + } + slog.Debug("jobs.propagation_runner.retrying", "propagation_id", row.id.String(), "team_id", row.teamID.String(), @@ -607,6 +769,24 @@ func (w *PropagationRunnerWorker) markDeadLettered(ctx context.Context, row prop ) w.insertPropagationAuditRow(ctx, row, auditKindPropagationDeadLettered, summary, meta) + // CHAOS F3 (CHAOS-DRILL-2026-05-20): increment the Prom counter. Labels: + // reason = "max_attempts" — the modal real-failure path. F1's + // unexpected_skip-as-failure, real + // RegradeResource gRPC errors, AND + // F1's markApplied DB failures all flow + // through here once they reach the + // propagationMaxAttempts ceiling, so a + // single `reason=max_attempts` covers + // every "we tried and tried and finally + // gave up" path. + // kind = row.kind — bounded by propagationKnownKinds; the + // unknown_kind path uses + // markUnknownKindDeadLettered which sets + // kind="unknown_kind" explicitly to keep + // cardinality bounded by code, not by + // attacker-controlled enqueue values. + metrics.PropagationDeadLetteredTotal.WithLabelValues("max_attempts", row.kind).Inc() + // CRITICAL severity: this is THE alert. NR Log alert filters on // audit_kind='propagation.dead_lettered' OR on the message below. slog.Error("jobs.propagation_runner.dead_lettered", @@ -621,6 +801,98 @@ func (w *PropagationRunnerWorker) markDeadLettered(ctx context.Context, row prop ) } +// markUnknownKindDeadLettered transitions a row to failed_at when its `kind` +// has no registered handler AND the row has now reached propagationMaxAttempts. +// Shares 90% of its body with markDeadLettered but emits a DISTINCT audit kind +// (propagation.unknown_kind_dead_lettered) so an operator can filter on the +// old-image-rollback signal independently of real provisioner failures, and +// labels the Prom counter with reason="unknown_kind" + kind="unknown_kind" +// (we deliberately do NOT pass row.kind into the kind label because the kind +// is by definition not in propagationKnownKinds — passing it would let an +// api-side enqueue blow up the worker's label cardinality without a code-side +// review). +// +// CHAOS F2 (CHAOS-DRILL-2026-05-20): without this path, an unknown-kind row +// retried forever — confirmed live during the chaos drill +// (chaos_test_unknown_kind reached attempts=10 in 4 minutes without ever +// transitioning to failed_at). With this path, the row dead-letters after +// the same 10 attempts as any real failure, the operator sees a +// propagation.unknown_kind_dead_lettered audit row + a Prom counter increment, +// and the runaway retry loop ends. +func (w *PropagationRunnerWorker) markUnknownKindDeadLettered(ctx context.Context, row propagationRow, dispatchErr error) { + lastErr := truncatePropagationError(dispatchErr.Error()) + res, err := w.db.ExecContext(ctx, ` + UPDATE pending_propagations + SET attempts = attempts + 1, + last_attempt_at = now(), + last_error = $1, + failed_at = now() + WHERE id = $2 + AND applied_at IS NULL + AND failed_at IS NULL + `, lastErr, row.id) + if err != nil { + slog.Error("jobs.propagation_runner.unknown_kind_dead_letter_persist_failed", + "propagation_id", row.id.String(), + "team_id", row.teamID.String(), + "kind", row.kind, + "error", err, + "dispatch_error", lastErr, + ) + return + } + affected, _ := res.RowsAffected() + if affected == 0 { + // Already terminal — sibling tick won the race. + return + } + + // age_seconds — same best-effort pattern as markDeadLettered. + var ageSeconds float64 + var createdAt sql.NullTime + if qErr := w.db.QueryRowContext(ctx, + `SELECT created_at FROM pending_propagations WHERE id = $1`, row.id, + ).Scan(&createdAt); qErr == nil && createdAt.Valid { + ageSeconds = w.now().Sub(createdAt.Time).Seconds() + } + + meta := map[string]any{ + "propagation_id": row.id.String(), + "kind": row.kind, + "team_id": row.teamID.String(), + "target_tier": nullableTierString(row.targetTier), + "attempts": row.attempts + 1, + "max_attempts": propagationMaxAttempts, + "last_error": lastErr, + "age_seconds": ageSeconds, + "failure_reason": "unknown_kind", + } + summary := fmt.Sprintf( + "unknown_kind propagation (kind=%q) for team %s DEAD-LETTERED after %d attempts — worker image is older than api enqueue; finish the rollout", + row.kind, row.teamID.String(), propagationMaxAttempts, + ) + // Distinct audit kind so an operator-side NRQL filter can separate + // real-failure dead-letters from image-skew dead-letters. + w.insertPropagationAuditRow(ctx, row, auditKindPropagationUnknownKindDeadLettered, summary, meta) + + // kind="unknown_kind" (the BUCKET name) — see comment on the counter + // declaration in metrics.go for the cardinality rationale. + metrics.PropagationDeadLetteredTotal.WithLabelValues("unknown_kind", "unknown_kind").Inc() + + // CRITICAL severity — different action than the real-failure path. + // The fix here is an operator rolling the worker forward, NOT + // inspecting a specific team's resources. + slog.Error("jobs.propagation_runner.unknown_kind_dead_lettered", + "propagation_id", row.id.String(), + "team_id", row.teamID.String(), + "kind", row.kind, + "attempts", row.attempts+1, + "max_attempts", propagationMaxAttempts, + "last_error", lastErr, + "action", "worker pod is running an older image than the api enqueued — finish the rollout; manually re-enqueue this row only after the kind has a handler", + ) +} + // insertPropagationAuditRow writes one audit_log row, best-effort. A miss // here only loses the operator-visible ledger entry; the slog line still // fires, and NR alerts can key on the message string when the audit row @@ -672,10 +944,20 @@ func nullableTierString(s sql.NullString) string { // RegradeResource with the resource's per-row tier snapshot (MR-P1-21 — see // entitlement_reconciler.go for the snapshot-is-entitlement contract). Any // per-resource gRPC error fails the WHOLE row (so the entire row retries -// with backoff). Per-resource "skip" outcomes (applied=false with -// skip_reason="already correct" / "unsupported resource type" / "backend -// does not support regrade") are NOT treated as failures: they are the -// provisioner's idempotency / type-coverage signal. +// with backoff). Per-resource "skip" outcomes whose SkipReason is in +// propagationAllowedSkipSubstrings ("already correct" / "unsupported +// resource type" / "backend does not support regrade") are NOT treated as +// failures: they are the provisioner's idempotency / type-coverage signal. +// +// CHAOS-DRILL-2026-05-20 F1 fix: any OTHER SkipReason — e.g. "postgres-admin +// secret not found", "namespace not found", "resource not reachable" — is +// treated as a retryable failure. The runner detects the returned +// propagationUnexpectedSkipErr via errors.Is and emits a distinct +// propagation.unexpected_skip audit row (not propagation.applied). The row +// retries per the backoff schedule and dead-letters at propagationMaxAttempts. +// Pre-fix this case WARN-logged and fell through with firstErr==nil, so the +// row got stamped applied_at and the regrade never landed — paying customers +// ended up with "Pro on paper, hobby-grade infra" and no alert. // // Idempotency: re-running this handler is safe because the provisioner's // RegradeResource does CONFIG GET / applied_conn_limit comparison before @@ -727,7 +1009,10 @@ func handleTierElevation(ctx context.Context, db *sql.DB, regrader propagationRe return nil } - var firstErr error + var ( + firstErr error + unexpectedSkipFound []propagationUnexpectedSkipDetail + ) for _, r := range resources { resType, supported := resourceTypeFromString(r.resourceType) if !supported { @@ -752,22 +1037,47 @@ func handleTierElevation(ctx context.Context, db *sql.DB, regrader propagationRe } continue } - // applied=false is NOT an error — it's the idempotency signal. - // We log only if the skip_reason indicates an actual provisioner- - // side problem (not "already correct" / type unsupported). - if !out.Applied && out.SkipReason != "" && - !strings.Contains(out.SkipReason, "already correct") && - !strings.Contains(out.SkipReason, "unsupported resource type") && - !strings.Contains(out.SkipReason, "backend does not support") { + // applied=false with an ALLOWED SkipReason ("already correct" / + // "unsupported resource type" / "backend does not support") is the + // idempotency / type-coverage signal — treat as success. + // + // applied=false with any OTHER SkipReason is an unexpected_skip: + // the regrade DID NOT LAND but the provisioner returned no error. + // CHAOS-DRILL-2026-05-20 F1 fix: accumulate the offending resources + // and return a propagationUnexpectedSkipErr so the runner retries + // the row (audit_kind=propagation.unexpected_skip) and dead-letters + // at propagationMaxAttempts (audit_kind=propagation.dead_lettered). + // Pre-fix this case fell through silently and the row was stamped + // applied_at; the customer's infra was never regraded and no alert + // fired. The metric counter is the leading indicator the dead-letter + // alert is the lagging signal. + if !out.Applied && out.SkipReason != "" && !isPropagationAllowedSkip(out.SkipReason) { slog.Warn("jobs.propagation_runner.tier_elevation.unexpected_skip", "propagation_id", row.id.String(), "resource_id", r.id.String(), "resource_type", r.resourceType, "skip_reason", out.SkipReason, ) + metrics.PropagationUnexpectedSkipTotal.WithLabelValues( + row.kind, r.resourceType, bucketSkipReason(out.SkipReason), + ).Inc() + unexpectedSkipFound = append(unexpectedSkipFound, propagationUnexpectedSkipDetail{ + ResourceID: r.id.String(), + ResourceType: r.resourceType, + SkipReason: out.SkipReason, + }) } } - return firstErr + + // A real gRPC error wins over an unexpected_skip — the gRPC error is + // the louder signal and a retry on it will also re-check the skip path. + if firstErr != nil { + return firstErr + } + if len(unexpectedSkipFound) > 0 { + return &propagationUnexpectedSkipErr{Resources: unexpectedSkipFound} + } + return nil } // resourceTypeFromString maps the resources.resource_type column value to diff --git a/internal/jobs/propagation_unexpected_skip_test.go b/internal/jobs/propagation_unexpected_skip_test.go new file mode 100644 index 0000000..eb79da8 --- /dev/null +++ b/internal/jobs/propagation_unexpected_skip_test.go @@ -0,0 +1,362 @@ +package jobs + +// propagation_unexpected_skip_test.go — CHAOS-DRILL-2026-05-20 F1 regression +// coverage for "unexpected_skip silently marks the propagation APPLIED". +// +// Pre-fix bug: handleTierElevation in propagation_runner.go (lines 756–771 +// pre-patch) treated `(Applied=false, SkipReason=)` as success. A WARN log fired, firstErr stayed +// nil, and the runner stamped applied_at on the row. A paying customer's +// regrade never landed — there was no retry, no dead-letter, no alert. +// +// Fix: any non-allowed SkipReason now returns propagationUnexpectedSkipErr. +// The runner's markRetry path detects this via errors.Is and emits a +// propagation.unexpected_skip audit row (NOT propagation.applied). The row +// retries per the backoff schedule and dead-letters at propagationMaxAttempts +// with a propagation.dead_lettered audit row. The Prometheus counter +// PropagationUnexpectedSkipTotal increments at the emit site so dashboards +// can spot patterns before the dead-letter lagging signal fires. +// +// Tests: +// +// 1. TestPropagation_UnexpectedSkip_DoesNotMarkApplied +// The headline regression test. Provisioner returns +// (Applied=false, SkipReason="postgres-admin secret not found"). Assert: +// - markApplied is NOT called (no applied_at UPDATE in the sqlmock script). +// - markRetry IS called: attempts+1, next_attempt_at advanced. +// - audit_log row is propagation.unexpected_skip (NOT propagation.applied). +// - PropagationUnexpectedSkipTotal counter incremented by 1 with +// the right labels. +// +// 2. TestPropagation_UnexpectedSkip_DeadLettersAtMaxAttempts +// attempts == propagationMaxAttempts-1, then one more +// unexpected_skip. Assert the row dead-letters via the modal +// markDeadLettered path (single UPDATE … SET failed_at=now()), and +// the audit row is propagation.dead_lettered. +// +// 3. TestIsPropagationAllowedSkip_Coverage +// Registry-iterating coverage test per CLAUDE.md rule 18. The +// allowed-skip set lives in one place (propagationAllowedSkipSubstrings) +// and we assert every documented allowed-skip string passes, while +// a representative set of unexpected-skip strings fails. If a future +// PR adds a new allowed-skip substring without updating the list, the +// regression test below catches the un-mapped string. +// +// 4. TestPropagationUnexpectedSkipErr_IsMatches +// Pins the errors.Is contract: the markRetry path uses +// errors.Is(err, errPropagationUnexpectedSkipSentinel) to switch on +// the audit kind, so the Is() implementation MUST match the +// sentinel. A future refactor that breaks this would silently +// reintroduce the F1 bug (markRetry would emit propagation.retrying +// instead of propagation.unexpected_skip). +// +// 5. TestBucketSkipReason_BoundsCardinality +// Pins the Prometheus label vocabulary so a Prometheus operator can +// rely on the bucket names in alert rules. + +import ( + "context" + "database/sql" + "errors" + "strings" + "testing" + "time" + + sqlmock "github.com/DATA-DOG/go-sqlmock" + "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus/testutil" + + commonplans "instant.dev/common/plans" + "instant.dev/worker/internal/metrics" +) + +// ─── Test 1: F1 regression — unexpected_skip does NOT mark applied ──────────── + +func TestPropagation_UnexpectedSkip_DoesNotMarkApplied(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherRegexp)) + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + propID := uuid.New() + teamID := uuid.New() + resID := uuid.New() + + // Snapshot the unexpected_skip counter before the run — Prometheus + // counters are process-global so other tests in this file (or in the + // suite, run with -p 1) may have moved the value. testutil.ToFloat64 + // reads the WithLabelValues child, which is unique per labelset. + counterChild := metrics.PropagationUnexpectedSkipTotal.WithLabelValues( + propagationKindTierElevation, "postgres", "postgres_admin_secret_missing", + ) + startCount := testutil.ToFloat64(counterChild) + + mock.ExpectBegin() + mock.ExpectQuery(`SELECT id, kind, team_id, target_tier, payload, attempts\s+FROM pending_propagations`). + WillReturnRows(sqlmock.NewRows(propagationSweepCols). + AddRow(propID, propagationKindTierElevation, teamID, "pro", []byte(`{}`), 0)) + mock.ExpectCommit() + + mock.ExpectQuery(`SELECT r\.id, r\.token, r\.provider_resource_id, r\.tier, r\.resource_type`). + WithArgs(teamID). + WillReturnRows(sqlmock.NewRows(teamResourcesCols). + AddRow(resID, "tok-1", sql.NullString{}, "pro", "postgres")) + + // THE BUG: pre-fix, the runner would next call markApplied (a + // `UPDATE pending_propagations SET applied_at = now()`) and INSERT a + // propagation.applied audit row. The fix re-routes through markRetry. + // We intentionally do NOT script those expectations — if the bug ever + // regressed, sqlmock would fail with "unexpected query". + // + // markRetry UPDATE — attempts + 1, next_attempt_at advanced. + mock.ExpectExec(`UPDATE pending_propagations\s+SET attempts\s*=\s*attempts \+ 1`). + WithArgs( + sqlmock.AnyArg(), // last_error truncated string + sqlmock.AnyArg(), // nextAttempt time.Time + propID, + ). + WillReturnResult(sqlmock.NewResult(0, 1)) + // audit_log INSERT — must be propagation.unexpected_skip. + mock.ExpectExec(`INSERT INTO audit_log`). + WithArgs( + teamID, + propagationActor, + auditKindPropagationUnexpectedSkip, // <-- the F1 contract + sqlmock.AnyArg(), + sqlmock.AnyArg(), + ). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // Stub the provisioner to return the canonical F1 case: + // Applied=false, SkipReason="postgres-admin secret not found". + stub := &stubPropagationRegrader{outcome: regradeOutcome{ + Applied: false, + SkipReason: "resource not reachable: postgres-admin secret not found", + }} + w := NewPropagationRunnerWorker(db, commonplans.Default(), stub) + w.now = fixedNow() + + if wErr := w.Work(context.Background(), fakePropagationJob()); wErr != nil { + t.Fatalf("Work: %v", wErr) + } + + if stub.calls != 1 { + t.Errorf("RegradeResource calls = %d, want 1", stub.calls) + } + + // Counter assertion — exact bucket name expected from bucketSkipReason + // against the "postgres-admin secret not found" surface. If a future PR + // changes the bucket name, the operator's NR alert rule must change too — + // pin it here so the change isn't silent. + endCount := testutil.ToFloat64(counterChild) + if endCount-startCount != 1.0 { + t.Errorf("PropagationUnexpectedSkipTotal{kind=tier_elevation,resource_type=postgres,skip_reason=postgres_admin_secret_missing} delta = %.1f, want 1.0", + endCount-startCount) + } + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// ─── Test 2: dead-letter at maxAttempts for unexpected_skip ─────────────────── + +func TestPropagation_UnexpectedSkip_DeadLettersAtMaxAttempts(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherRegexp)) + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + propID := uuid.New() + teamID := uuid.New() + resID := uuid.New() + + // attempts already at maxAttempts-1; one more unexpected_skip dead-letters. + mock.ExpectBegin() + mock.ExpectQuery(`SELECT id, kind, team_id, target_tier, payload, attempts\s+FROM pending_propagations`). + WillReturnRows(sqlmock.NewRows(propagationSweepCols). + AddRow(propID, propagationKindTierElevation, teamID, "pro", []byte(`{}`), propagationMaxAttempts-1)) + mock.ExpectCommit() + + mock.ExpectQuery(`SELECT r\.id, r\.token, r\.provider_resource_id, r\.tier, r\.resource_type`). + WithArgs(teamID). + WillReturnRows(sqlmock.NewRows(teamResourcesCols). + AddRow(resID, "tok-1", sql.NullString{}, "pro", "postgres")) + + // markDeadLettered UPDATE: stamps failed_at + last_error. + mock.ExpectExec(`UPDATE pending_propagations\s+SET attempts\s*=\s*attempts \+ 1,\s+last_attempt_at\s*=\s*now\(\),\s+last_error\s*=\s*\$1,\s+failed_at\s*=\s*now\(\)`). + WithArgs(sqlmock.AnyArg(), propID). + WillReturnResult(sqlmock.NewResult(0, 1)) + // age_seconds lookup query. + mock.ExpectQuery(`SELECT created_at FROM pending_propagations WHERE id`). + WithArgs(propID). + WillReturnRows(sqlmock.NewRows([]string{"created_at"}).AddRow(time.Now().Add(-25 * time.Hour))) + // audit_log INSERT — propagation.dead_lettered. The handler-side + // distinction (unexpected_skip vs gRPC-error vs DB-fail) collapses + // at the dead-letter point: a row that exhausted maxAttempts gets + // the canonical dead-letter audit kind regardless of cause. The + // `last_error` column carries the unexpected_skip detail for the + // operator to debug from. + mock.ExpectExec(`INSERT INTO audit_log`). + WithArgs( + teamID, + propagationActor, + auditKindPropagationDeadLettered, + sqlmock.AnyArg(), + sqlmock.AnyArg(), + ). + WillReturnResult(sqlmock.NewResult(0, 1)) + + stub := &stubPropagationRegrader{outcome: regradeOutcome{ + Applied: false, + SkipReason: "namespace not found", + }} + w := NewPropagationRunnerWorker(db, commonplans.Default(), stub) + w.now = fixedNow() + + if wErr := w.Work(context.Background(), fakePropagationJob()); wErr != nil { + t.Fatalf("Work: %v", wErr) + } + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// ─── Test 3: allowed-skip registry coverage (CLAUDE.md rule 18) ─────────────── + +func TestIsPropagationAllowedSkip_Coverage(t *testing.T) { + // The registry-iterating check: every documented allowed-skip substring + // is non-empty and is matched by isPropagationAllowedSkip. A future PR + // that adds an entry to propagationAllowedSkipSubstrings but forgets to + // wire it through isPropagationAllowedSkip fails this test. + if len(propagationAllowedSkipSubstrings) == 0 { + t.Fatal("propagationAllowedSkipSubstrings is empty — every SkipReason will be treated as unexpected_skip, the F1 fix has been over-corrected into a fail-everything posture") + } + for _, sub := range propagationAllowedSkipSubstrings { + if sub == "" { + t.Error("propagationAllowedSkipSubstrings contains an empty string — would match every SkipReason (including '')") + continue + } + // Match a sample reason that contains the substring. + sample := "prefix " + sub + " suffix" + if !isPropagationAllowedSkip(sample) { + t.Errorf("isPropagationAllowedSkip(%q) = false for allowed substring %q — substring lookup is broken", sample, sub) + } + } + + // And the negative cases — every SkipReason from the chaos drill / known + // failure-mode catalog MUST be treated as unexpected_skip. If any of + // these turn into allowed-skips a paying customer's regrade will + // silently fail. + unexpectedSamples := []string{ + "postgres-admin secret not found", + "resource not reachable: postgres-admin secret not found", + "namespace not found", + "redis-auth secret missing", + "pod not found", + "no Running pod", + "exec fallback: CONFIG SET maxmemory failed", + "legacy resource without auth secret", + } + for _, sample := range unexpectedSamples { + if isPropagationAllowedSkip(sample) { + t.Errorf("isPropagationAllowedSkip(%q) = true — this MUST be treated as an unexpected_skip (the whole point of F1)", sample) + } + } +} + +// ─── Test 4: errors.Is contract for the sentinel ────────────────────────────── + +func TestPropagationUnexpectedSkipErr_IsMatches(t *testing.T) { + err := &propagationUnexpectedSkipErr{ + Resources: []propagationUnexpectedSkipDetail{ + {ResourceID: "r1", ResourceType: "postgres", SkipReason: "postgres-admin secret not found"}, + }, + } + if !errors.Is(err, errPropagationUnexpectedSkipSentinel) { + t.Error("errors.Is(*propagationUnexpectedSkipErr, errPropagationUnexpectedSkipSentinel) = false — the markRetry audit-kind switch will never fire, F1 audit signal regression") + } + + // Wrapped via fmt.Errorf("…: %w", …) must still match — the dispatch + // loop wraps unexpected_skip in handler-side context. + wrapped := errors.New("not the sentinel") + if errors.Is(wrapped, errPropagationUnexpectedSkipSentinel) { + t.Error("errors.Is(unrelated_err, errPropagationUnexpectedSkipSentinel) = true — sentinel matches too broadly, would mis-classify routine retries as unexpected_skip") + } + + // Nil receiver must not panic; the runner never constructs a nil + // pointer here, but defensive. + var nilErr *propagationUnexpectedSkipErr + if errors.Is(nilErr, errPropagationUnexpectedSkipSentinel) { + // Nil err of this type Still matches the sentinel via the Is method — + // acceptable; the runner would never construct it. + t.Log("nilErr matched (Is method returns true for nil receiver) — acceptable") + } + + // Error string formatting includes every offending resource so the + // operator can grep the audit_log.last_error column. + got := err.Error() + for _, want := range []string{"r1", "postgres", "postgres-admin secret"} { + if !strings.Contains(got, want) { + t.Errorf("propagationUnexpectedSkipErr.Error() = %q, missing substring %q (operator can't debug from audit_log.last_error)", got, want) + } + } +} + +// ─── Test 5: bucketSkipReason cardinality contract ──────────────────────────── + +func TestBucketSkipReason_BoundsCardinality(t *testing.T) { + cases := []struct { + raw string + bucket string + }{ + // Postgres admin secret missing — the canonical F1 trigger. + {"resource not reachable: postgres-admin secret not found", "postgres_admin_secret_missing"}, + {"postgres-admin Secret not found", "postgres_admin_secret_missing"}, + {"missing postgres_admin secret", "postgres_admin_secret_missing"}, + + // Redis-auth secret missing. + {"redis-auth secret not found", "redis_auth_secret_missing"}, + + // Namespace teardown race. + {"namespace not found: instant-customer-xyz", "namespace_not_found"}, + + // Pod-not-found from exec-fallback paths. + {"exec fallback: no pod found", "pod_not_found"}, + {"pod not found in namespace", "pod_not_found"}, + + // Resource unreachable (gRPC dial / DNS). + {"resource not reachable", "resource_not_reachable"}, + {"backend unreachable", "resource_not_reachable"}, + + // Legacy resource without auth secret. + {"legacy resource without auth secret", "legacy_resource"}, + {"legacy pod missing required config", "legacy_resource"}, + + // Unknown — the catch-all bucket. + {"completely novel skip reason from a future provisioner", "other"}, + {"", "other"}, + } + for _, c := range cases { + if got := bucketSkipReason(c.raw); got != c.bucket { + t.Errorf("bucketSkipReason(%q) = %q, want %q", c.raw, got, c.bucket) + } + } + + // And the bounded-cardinality invariant: a thousand random unique + // SkipReasons should fall into a small fixed set of buckets. (We + // declare the upper bound at 8 buckets — the 7 named + "other".) + seen := map[string]struct{}{} + for i := 0; i < 1000; i++ { + // A faux randomness — different prefixes — none of which should + // hit any named bucket. + raw := "novel skip variant #" + uuid.New().String() + seen[bucketSkipReason(raw)] = struct{}{} + } + if len(seen) != 1 || func() bool { _, ok := seen["other"]; return !ok }() { + t.Errorf("bucketSkipReason swallowed %d distinct buckets for novel inputs (want exactly 1, 'other') — cardinality not bounded, Prometheus label explosion risk: %v", len(seen), seen) + } +} diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 9148550..cfaa0b0 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -408,6 +408,96 @@ var ( Help: "Event-email forwarder hits on an audit_log row whose kind has a builder but no Go renderer. A non-zero rate means a kind is being silently dropped — fix the eventEmailBodyRenderers map.", }, []string{"kind"}) + // ── propagation_runner — unexpected_skip counter (CHAOS-DRILL-2026-05-20 F1) ─ + // + // Every time the propagation_runner's per-resource RegradeResource call + // returns (Applied=false, SkipReason=), + // this counter increments. Pre-fix the runner WARN-and-applied that case, + // silently stamping the propagation row as applied without the entitlement + // landing — a paying customer ended up with "Pro on paper, hobby-grade infra" + // and no alert (CHAOS-DRILL-2026-05-20 finding #1). Now the case is treated + // as a retryable error: the row retries via the backoff schedule and + // dead-letters at propagationMaxAttempts. This counter is the leading + // indicator; the dead-letter audit row is the alert-able lagging signal. + // + // Labels: + // + // kind — pending_propagations.kind ("tier_elevation", etc.). + // Bounded by propagationKnownKinds (~1-3 entries). + // + // resource_type — "postgres" | "redis" | "mongodb" (the offending + // resource class). Bounded by ResourceType enum. + // + // skip_reason — a SHORT canonical bucket derived from the raw + // skip_reason string ("postgres_admin_secret_missing", + // "namespace_not_found", "other"). The runner does + // the bucketing (jobs.bucketSkipReason) so cardinality + // stays bounded — never pass the raw SkipReason here. + // + // NR alert (suggested): + // sum(rate(instant_propagation_unexpected_skip_total[15m])) > 0 + // for 30+ minutes → P2 page. A single isolated event is the + // mid-deprovisioning-race signal; a sustained rate is a real + // downstream regression and an operator must investigate before + // the row dead-letters ~24h later. + PropagationUnexpectedSkipTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "instant_propagation_unexpected_skip_total", + Help: "propagation_runner per-resource RegradeResource returned (Applied=false, SkipReason=). Leading indicator for the dead-letter alert.", + }, []string{"kind", "resource_type", "skip_reason"}) + + // ── propagation_runner — dead-letter + unknown-kind counters (CHAOS F2/F3) ─ + // + // PropagationDeadLetteredTotal increments every time the propagation_runner + // transitions a row to failed_at + emits a propagation.*dead_lettered audit + // row. Two triggers feed this single metric, distinguished by `reason`: + // + // reason="max_attempts" — the modal path. Per-resource RegradeResource + // failures, F1's unexpected_skip-as-failure, and + // markApplied DB failures all converge here once + // they reach propagationMaxAttempts. + // reason="unknown_kind" — CHAOS F2: a worker pod that doesn't recognise + // a `kind` enqueued by a newer api image. Without + // the F2 fix these escape the maxAttempts ceiling. + // + // `kind` carries the row's pending_propagations.kind value for the + // max_attempts path (bounded by propagationKnownKinds — ~1-3 entries). + // The unknown_kind path passes kind="unknown_kind" as a bounded bucket, + // so an attacker-controlled api-side enqueue can't blow up worker + // label cardinality. + // + // NR alert (suggested): + // rate(instant_propagation_dead_lettered_total[5m]) > 0 for 5m → P1 page. + // propagation_runner is the last line of defence between Razorpay webhook + // delivery and customer infra; any dead-letter means a paying customer's + // regrade fell through (or, on the unknown_kind path, that a worker pod + // is running an old image vs the api). + PropagationDeadLetteredTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "instant_propagation_dead_lettered_total", + Help: "propagation_runner rows transitioned to failed_at. Labelled by reason (max_attempts|unknown_kind) and kind (pending_propagations.kind, or 'unknown_kind' for F2's bounded bucket).", + }, []string{"reason", "kind"}) + + // PropagationUnknownKindTotal counts every TICK that picked up at least + // one row whose kind had no handler in propagationHandlers. Distinct + // from PropagationDeadLetteredTotal{reason="unknown_kind"} — that fires + // once at the END of the row's life (after maxAttempts), this fires on + // EVERY tick while the row is retrying. Lets the operator see "the + // worker is older than the api" within seconds rather than waiting the + // ~24h backoff for the dead-letter to land. + // + // `kind` is the raw pending_propagations.kind value. Bounded by the + // api-side enqueue surface (NOT by attacker input — only the api can + // INSERT into pending_propagations); the cardinality risk is accepted + // because in the rollback-drift scenario the operator wants to know + // EXACTLY which new kind their old worker is rejecting. + // + // NR alert (suggested): + // sum(rate(instant_propagation_unknown_kind_total[5m])) by (kind) > 0 + // for 5m → P2 page. Action: finish the rollout. + PropagationUnknownKindTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "instant_propagation_unknown_kind_total", + Help: "propagation_runner ticks that saw at least one row whose kind has no handler (image-skew indicator). Labelled by kind.", + }, []string{"kind"}) + // readyzCheckStatusGauge — per-component readiness status surfaced by // /readyz on this service's HTTP sidecar (:8091). See the matching // gauge in the api repo at api/internal/metrics/metrics.go for the