Skip to content

Commit d77bf23

Browse files
fix(db): forward-consistent resources_status_check in migrations 024/049 (#284)
The migration runner re-applies every migration on each boot. Migrations 024 (pre-'suspended') and 049 (pre-'pending') re-added a NARROW resources_status_check that crashes the api boot the moment a valid row holds a later-added status — a single 'pending' resource (normal mid-provisioning state) took prod down to a single not-yet-restarted pod on 2026-06-10. Widen 024/049 to the full canonical set so 024/049/057 all define the same constraint and any re-apply is safe. Adds TestMigrations_ResourcesStatusCheck_ForwardConsistent (no-DB) to guard the recurrence: every migration re-adding the constraint must allow the full set. Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 25233bf commit d77bf23

3 files changed

Lines changed: 50 additions & 5 deletions

File tree

internal/db/migrations/024_resources_paused_status.sql

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,13 @@
2727
ALTER TABLE resources DROP CONSTRAINT IF EXISTS resources_status_check;
2828
ALTER TABLE resources
2929
ADD CONSTRAINT resources_status_check
30-
CHECK (status IN ('active', 'paused', 'expired', 'deleted', 'reaped'));
30+
-- Forward-consistent full status set (incident 2026-06-10). The migration
31+
-- runner RE-APPLIES every migration on each boot; a NARROW constraint here
32+
-- (missing 'suspended' [added in 049] / 'pending' [added in 057]) crashes
33+
-- the boot the moment a row already holds one of those later-added — but
34+
-- valid — statuses. Re-adding the canonical set makes 024 safe to re-run
35+
-- regardless of data. (024/049/057 now all define the same set.)
36+
CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'expired', 'deleted', 'reaped'));
3137

3238
ALTER TABLE resources ADD COLUMN IF NOT EXISTS paused_at TIMESTAMPTZ;
3339

internal/db/migrations/049_resources_suspended_status.sql

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,10 @@
3030
ALTER TABLE resources DROP CONSTRAINT IF EXISTS resources_status_check;
3131
ALTER TABLE resources
3232
ADD CONSTRAINT resources_status_check
33-
CHECK (status IN ('active', 'paused', 'suspended', 'expired', 'deleted', 'reaped'));
33+
-- Forward-consistent full status set (incident 2026-06-10): include 'pending'
34+
-- (added in 057) so re-applying 049 on boot can't crash on a valid pending
35+
-- row before 057 runs. 024/049/057 now all define the same canonical set.
36+
CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'expired', 'deleted', 'reaped'));
3437

3538
-- Partial index for the auto-unsuspend scan.
3639
-- EnforceStorageQuotaWorker scans WHERE status = 'suspended' on every run to

internal/db/postgres_migrations_test.go

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@
33
// Rule-22 coverage block — symptom: api/internal/db migration runner +
44
// connect entry points are the platform-DB boot gate. Enumeration: every
55
// exported symbol in postgres.go + redis.go.
6-
// RunMigrations, embeddedMigrationFilenames, MigrationFiles,
7-
// ErrDBConnect.{Error,Unwrap}, ConnectPostgres,
8-
// ErrRedisConnect.{Error,Unwrap}, ConnectRedis.
6+
//
7+
// RunMigrations, embeddedMigrationFilenames, MigrationFiles,
8+
// ErrDBConnect.{Error,Unwrap}, ConnectPostgres,
9+
// ErrRedisConnect.{Error,Unwrap}, ConnectRedis.
10+
//
911
// Sites touched: all of the above. Coverage test: this file.
1012
package db
1113

@@ -497,3 +499,37 @@ func TestConnectRedis_PanicsOnUnreachable(t *testing.T) {
497499
// then returns the ping error → ConnectRedis panics with *ErrRedisConnect.
498500
ConnectRedis("redis://127.0.0.1:1/0")
499501
}
502+
503+
// TestMigrations_ResourcesStatusCheck_ForwardConsistent guards the
504+
// 2026-06-10 incident: the migration runner RE-APPLIES every migration on
505+
// each boot. Migration 024 originally re-added a NARROW
506+
// resources_status_check (missing 'suspended' [added 049] and 'pending'
507+
// [added 057]); once a valid row held one of those later-added statuses, the
508+
// 024 re-apply failed with a constraint violation and crashed the api boot
509+
// (prod survived only on a not-yet-restarted pod). Any migration that re-adds
510+
// resources_status_check MUST allow the full canonical status set, so
511+
// re-applying it mid-sequence can never reject a valid row. No DB needed —
512+
// reads the embedded SQL via the same seam the runner uses.
513+
func TestMigrations_ResourcesStatusCheck_ForwardConsistent(t *testing.T) {
514+
canonical := []string{"pending", "active", "paused", "suspended", "expired", "deleted", "reaped"}
515+
checked := 0
516+
for _, name := range MigrationFiles() {
517+
b, err := readMigrationFile(name)
518+
if err != nil {
519+
t.Fatalf("readMigrationFile(%s): %v", name, err)
520+
}
521+
sql := string(b)
522+
if !strings.Contains(sql, "ADD CONSTRAINT resources_status_check") {
523+
continue
524+
}
525+
checked++
526+
for _, st := range canonical {
527+
if !strings.Contains(sql, "'"+st+"'") {
528+
t.Errorf("%s re-adds resources_status_check but omits status %q — a valid row in that status will crash the api boot when this migration re-applies before a later widening migration. Use the full canonical set: %v", name, st, canonical)
529+
}
530+
}
531+
}
532+
if checked == 0 {
533+
t.Fatal("no migration adds resources_status_check — test wiring broken (did the constraint move?)")
534+
}
535+
}

0 commit comments

Comments
 (0)