Skip to content

Commit e7d23c6

Browse files
fix(customer-backup): stop recovery UPDATE binding NULL to NOT-NULL started_at + classify mongo/redis auth failures (#106)
BUG 1 (P1 — recovery wedged + log flood): recoverStuckRows ran `UPDATE resource_backups SET status='pending', started_at=NULL ...` but resource_backups.started_at is `TIMESTAMPTZ NOT NULL` (api mig 031_backups.sql). The explicit NULL bind violated the constraint on EVERY 30-60s tick, so stuck-row recovery NEVER worked and the worker log flooded with `stuck_row_recovery_failed: pq: null value in column "started_at" ... violates not-null constraint`. Drop the started_at clause: leaving the stale value is correct because the re-claim in processBackup runs `SET started_at=now()`, and the WHERE floor only re-matches rows that have been 'running' again past the timeout. The old regex-matcher test stayed green because sqlmock doesn't enforce NOT NULL — the new test inspects the literal SQL and reds on NULL. BUG 2 (misclassification surfaced by the 2026-06-11 mongo backup P1): backupFailReason only knew Postgres auth strings, so mongodump ("auth error: ... SCRAM-SHA-256") and redis-cli ("WRONGPASS", "NOAUTH") credential failures were bucketed as transient "dump" — telling the customer "briefly unreachable, we'll retry" for a non-self-healing credential failure and NOT paging ops. Extend the classifier to match all three backends' auth dialects. Note: the customer's actual mongo backup failure is an underlying DATA/INFRA incident (the shared mongo pod OOMKilled and lost db_poold8ad5fc1 + its user) — operator action required; this commit fixes the two CODE defects (broken recovery + mislabel) the incident exposed. Co-authored-by: Claude Fable 5 <noreply@anthropic.com>
1 parent 6692610 commit e7d23c6

3 files changed

Lines changed: 125 additions & 8 deletions

File tree

internal/jobs/customer_backup_reason_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,19 @@ func TestBackupFailReason(t *testing.T) {
2727
{"no password supplied", errors.New("pg_dump: error: no password supplied"), "auth"},
2828
{"role does not exist", errors.New(`FATAL: role "usr_abc" does not exist`), "auth"},
2929
{"permission denied", errors.New("permission denied for table users"), "auth"},
30+
// R2 (2026-06-11) — the exact mongo + redis prod stderr from the P1
31+
// incident. Before the fix these classified "dump" (transient) and
32+
// told the customer "briefly unreachable, we'll retry" for a
33+
// non-self-healing credential failure, and never paged ops.
34+
{"prod mongo SCRAM auth", errors.New(`mongodump: exit status 1 (stderr: Failed: can't create session: failed to connect to mongodb://usr_x:y@mongo.instanode.dev:27017/db_x?authSource=db_x: connection() error occurred during connection handshake: auth error: sasl conversation error: unable to authenticate using mechanism "SCRAM-SHA-256": connection(mongo.instanode.dev:27017[-3]) socket was unexpectedly closed: EOF)`), "auth"},
35+
{"mongo plain authentication failed", errors.New("mongodump: Failed: Authentication failed."), "auth"},
36+
{"mongo unable to authenticate", errors.New("unable to authenticate using mechanism"), "auth"},
37+
{"prod redis WRONGPASS", errors.New(`redis-cli --rdb: exit status 1 (stderr: AUTH failed: WRONGPASS invalid username-password pair or user is disabled.)`), "auth"},
38+
{"redis NOAUTH", errors.New("redis-cli --rdb: NOAUTH Authentication required."), "auth"},
3039
{"server unavailable is transient", errors.New("pg_dump: server unavailable"), "dump"},
3140
{"connection refused is transient", errors.New("connection refused"), "dump"},
41+
{"mongo connection refused is transient", errors.New("mongodump: Failed: can't create session: connection refused"), "dump"},
42+
{"redis timeout is transient", errors.New("redis-cli --rdb: Could not connect to Redis: Connection timed out"), "dump"},
3243
{"nil err defaults to dump", nil, "dump"},
3344
}
3445
for _, c := range cases {

internal/jobs/customer_backup_runner.go

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -348,10 +348,19 @@ func (w *CustomerBackupRunnerWorker) Work(ctx context.Context, job *river.Job[Cu
348348
// failure here is logged and the sweep proceeds (the pending-row scan
349349
// still drains the normal queue).
350350
func (w *CustomerBackupRunnerWorker) recoverStuckRows(ctx context.Context) {
351+
// We DELIBERATELY do not touch started_at here. The column is
352+
// `TIMESTAMPTZ NOT NULL` (api migration 031_backups.sql), so an explicit
353+
// `started_at = NULL` violates the constraint and makes the UPDATE fail
354+
// on EVERY tick (the row stays orphaned at 'running', recovery never
355+
// works, and the log floods with stuck_row_recovery_failed). Leaving the
356+
// stale started_at in place is correct: the re-claim in processBackup
357+
// runs `SET started_at = now()`, so the value is overwritten the instant
358+
// the row is re-picked, and the WHERE-clause floor below (started_at <
359+
// now() - timeout) only ever re-matches a row that has been 'running'
360+
// again past the timeout — never a freshly-reset 'pending' row.
351361
res, err := w.db.ExecContext(ctx, `
352362
UPDATE resource_backups
353363
SET status = 'pending',
354-
started_at = NULL,
355364
error_summary = 'recovered: runner pod lost before finalize — re-queued'
356365
WHERE status = 'running'
357366
AND started_at IS NOT NULL
@@ -592,23 +601,43 @@ func (w *CustomerBackupRunnerWorker) processBackup(parentCtx context.Context, p
592601
// the api's internal refund endpoint so the team's daily counter is
593602
// credited. Scheduled backups don't burn the manual-counter so no
594603
// refund is needed.
595-
// backupFailReason classifies a pg_dump failure into "auth" (the credential
596-
// was rejected — password auth failed, missing role, no password supplied:
597-
// credential drift that will NOT self-heal and is SLA-relevant) vs "dump" (any
598-
// other pg_dump failure — DB briefly unreachable, timeout: transient, retried
599-
// next run). The match is on Postgres' own error text, lower-cased so it's
600-
// resilient to surrounding formatting.
604+
// backupFailReason classifies a dump failure into "auth" (the credential was
605+
// rejected — credential drift that will NOT self-heal and is SLA-relevant,
606+
// PAGES ops) vs "dump" (any other failure — DB briefly unreachable, timeout:
607+
// transient, retried next run). The match is on the dump tool's own error
608+
// text, lower-cased so it's resilient to surrounding formatting.
609+
//
610+
// R2 (2026-06-11) — the matcher now spans all THREE dump tools, not just
611+
// pg_dump. The 2026-06-11 P1 incident showed mongodump auth errors
612+
// ("auth error: ... SCRAM-SHA-256") and redis-cli auth errors ("WRONGPASS",
613+
// "NOAUTH") were silently bucketed as transient "dump" — telling the customer
614+
// "briefly unreachable, we'll retry" for a credential failure that retrying
615+
// can NEVER fix, and NOT paging ops on a non-self-healing condition. Each
616+
// backend speaks its own auth dialect, so we match all of them here.
601617
func backupFailReason(err error) string {
602618
if err == nil {
603619
return "dump"
604620
}
605621
s := strings.ToLower(err.Error())
606622
switch {
623+
// Postgres (pg_dump) auth dialect.
607624
case strings.Contains(s, "password authentication failed"),
608625
strings.Contains(s, "authentication failed"),
609626
strings.Contains(s, "no password supplied"),
610627
strings.Contains(s, "role") && strings.Contains(s, "does not exist"),
611-
strings.Contains(s, "permission denied for"):
628+
strings.Contains(s, "permission denied for"),
629+
// MongoDB (mongodump) auth dialect: the driver reports an
630+
// "auth error" wrapping a SASL/SCRAM failure, and "authentication
631+
// failed" on an outright user/db mismatch.
632+
strings.Contains(s, "auth error"),
633+
strings.Contains(s, "unable to authenticate"),
634+
strings.Contains(s, "sasl"),
635+
// Redis (redis-cli) auth dialect: WRONGPASS (bad user/pass),
636+
// NOAUTH (server wants a password we didn't send), and the
637+
// human-readable "invalid username-password pair".
638+
strings.Contains(s, "wrongpass"),
639+
strings.Contains(s, "noauth"),
640+
strings.Contains(s, "invalid username-password"):
612641
return "auth"
613642
default:
614643
return "dump"

internal/jobs/customer_backup_runner_test.go

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"os"
99
"path/filepath"
1010
"runtime"
11+
"strings"
1112
"sync"
1213
"testing"
1314
"time"
@@ -425,6 +426,82 @@ func TestRunner_RecoversStuckRunningRows(t *testing.T) {
425426
}
426427
}
427428

429+
// TestRecoverStuckRows_NeverBindsNullStartedAt is the regression test for the
430+
// 2026-06-11 P1: recoverStuckRows shipped with `SET ... started_at = NULL`,
431+
// but resource_backups.started_at is `TIMESTAMPTZ NOT NULL` (api migration
432+
// 031_backups.sql). In prod that UPDATE failed on EVERY 30-60s tick with
433+
//
434+
// pq: null value in column "started_at" of relation "resource_backups"
435+
// violates not-null constraint
436+
//
437+
// so stuck-row recovery NEVER worked and the log flooded. sqlmock does NOT
438+
// enforce NOT-NULL, which is exactly why the prior regex-matcher test stayed
439+
// green through the bug. This test inspects the literal SQL the runner emits
440+
// for the recovery UPDATE and FAILS if it ever sets started_at to NULL again.
441+
// It reds on the old code (which contained `started_at = NULL`) and greens on
442+
// the fix (which drops the clause, leaving started_at untouched on re-queue).
443+
func TestRecoverStuckRows_NeverBindsNullStartedAt(t *testing.T) {
444+
var capturedRecoverySQL string
445+
matcher := sqlmock.QueryMatcherFunc(func(expectedSQL, actualSQL string) error {
446+
// The recovery UPDATE is the only statement that resets a row to
447+
// 'pending'. Capture its exact text so we can assert on it below.
448+
if strings.Contains(actualSQL, "SET status = 'pending'") &&
449+
strings.Contains(actualSQL, "WHERE status = 'running'") {
450+
capturedRecoverySQL = actualSQL
451+
}
452+
// Defer to the default regex semantics so the rest of the tick's
453+
// expectations match exactly as in the other tests.
454+
return sqlmock.QueryMatcherRegexp.Match(expectedSQL, actualSQL)
455+
})
456+
457+
db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(matcher))
458+
if err != nil {
459+
t.Fatalf("sqlmock.New: %v", err)
460+
}
461+
defer db.Close()
462+
463+
mock.ExpectExec(`UPDATE resource_backups\s+SET status = 'pending'`).
464+
WillReturnResult(sqlmock.NewResult(0, 1))
465+
// Pending sweep finds nothing else this tick.
466+
mock.ExpectQuery(`SELECT b.id::text`).
467+
WithArgs(backupBatchSize).
468+
WillReturnRows(sqlmock.NewRows([]string{
469+
"id", "resource_id", "tier_at_backup", "backup_kind",
470+
"token", "connection_url", "resource_type", "team_id",
471+
}))
472+
// Retention sweep loops the five fallback tier names.
473+
for i := 0; i < 5; i++ {
474+
mock.ExpectQuery(`SELECT id::text, s3_key\s+FROM resource_backups`).
475+
WillReturnRows(sqlmock.NewRows([]string{"id", "s3_key"}))
476+
}
477+
478+
w := &CustomerBackupRunnerWorker{
479+
db: db,
480+
store: newFakeBackupStore(),
481+
pgDump: &fakePgDump{payload: []byte("x")},
482+
bucket: "instant-shared",
483+
prefix: "backups",
484+
aesKey: testAESKeyHex,
485+
now: time.Now,
486+
timeout: time.Minute,
487+
batchN: backupBatchSize,
488+
}
489+
if err := w.Work(context.Background(), fakeRunnerJob()); err != nil {
490+
t.Fatalf("Work: %v", err)
491+
}
492+
493+
if capturedRecoverySQL == "" {
494+
t.Fatal("recovery UPDATE was never emitted — test did not exercise recoverStuckRows")
495+
}
496+
// The crux: the recovery UPDATE must NOT set started_at to NULL. The
497+
// column is NOT NULL, so any `started_at = NULL` makes the statement
498+
// fail in prod on every tick. Case-insensitive, whitespace-tolerant.
499+
normalized := strings.ToLower(strings.Join(strings.Fields(capturedRecoverySQL), " "))
500+
if strings.Contains(normalized, "started_at = null") {
501+
t.Errorf("recovery UPDATE binds NULL to NOT-NULL started_at — P1 regression:\n%s", capturedRecoverySQL)
502+
}
503+
}
504+
428505
// TestBackupObjectKey — pure function contract.
429506
func TestBackupObjectKey(t *testing.T) {
430507
got := backupObjectKey("backups", "tok-abc", "bk-uuid")

0 commit comments

Comments
 (0)