Skip to content

Commit 741b561

Browse files
committed
fix(admin/sqs): map leader-churn errors to 503; add tests; SPA polish
Three findings from Claude review on PR #670: 1. (P1) translateAdminQueuesError did not catch leader-churn errors. AdminDeleteQueue passes the upfront isVerifiedSQSLeader check, then dispatches deleteQueueWithRetry which can hit a kv coordinator that just lost leadership. The resulting kv.ErrLeaderNotFound / adapter.ErrNotLeader / wrapped "not leader" suffixes were not in the translator's switch — they fell to default and the admin handler rendered a generic 500 instead of the spec'd 503 + Retry-After: 1. Added the `case isLeaderChurnError(err)` arm mirroring translateAdminTablesError's identical fix from PR #634. 2. (P2) No tests for translateAdminQueuesError. Mirrored the three Dynamo equivalents in main_admin_test.go: - TestTranslateAdminQueuesError_LeaderChurn covers every kv sentinel + canonical wrapped-suffix variant. - TestTranslateAdminQueuesError_LeaderPhraseInMiddleOfMessage pins the HasSuffix matcher behaviour against false positives on user-supplied error messages mid-string. - TestTranslateAdminQueuesError_UnrelatedErrorPassesThrough confirms the detector does not swallow innocent "leader" mentions outside the canonical phrase set. 3. (Low / polish) SqsList.tsx subtitle leaked the Go file path "adapter/sqs_admin.go" and the internal milestone name "Phase 3.A" to end users — DynamoList / S3List don't do this. Replaced with operator-facing prose describing what the page does. Verified: - go test -run TestTranslateAdminQueuesError . — passes - go build ./... clean - go test -race ./internal/admin/... + go test -race -run TestSQS ./adapter/ — pass - golangci-lint run ./adapter/... ./internal/admin/... ./... — 0 issues - cd web/admin && npm run lint (tsc --strict) clean
1 parent 3a8a1d7 commit 741b561

3 files changed

Lines changed: 76 additions & 4 deletions

File tree

main_admin.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,8 +211,21 @@ func translateAdminQueuesError(err error) error {
211211
return admin.ErrQueuesNotFound
212212
case errors.Is(err, adapter.ErrAdminSQSValidation):
213213
return admin.ErrQueuesValidation
214+
case isLeaderChurnError(err):
215+
// Leadership can be lost between AdminDeleteQueue's upfront
216+
// isVerifiedSQSLeader check and the coordinator dispatch
217+
// inside deleteQueueWithRetry. The kv coordinator surfaces
218+
// that as ErrLeaderNotFound / ErrNotLeader (or a wrapped
219+
// "not leader" / "leader not found" suffix), and the
220+
// retry loop's isRetryableTransactWriteError does not catch
221+
// them. Without this arm the error falls to default and the
222+
// admin handler renders a generic 500 — Codex P2 + Claude
223+
// P1 on PR #670 confirmed the gap. Mirrors the same arm in
224+
// translateAdminTablesError that fixed this for Dynamo on
225+
// PR #634.
226+
return admin.ErrQueuesNotLeader
214227
default:
215-
return err
228+
return err //nolint:wrapcheck // forwarded so the handler logs but does not surface it.
216229
}
217230
}
218231

main_admin_test.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,3 +426,63 @@ func TestTranslateAdminTablesError_UnrelatedErrorPassesThrough(t *testing.T) {
426426
require.NotErrorIs(t, out, admin.ErrTablesNotLeader)
427427
require.Equal(t, in, out)
428428
}
429+
430+
// TestTranslateAdminQueuesError_LeaderChurn is the SQS counterpart of
431+
// TestTranslateAdminTablesError_LeaderChurn. AdminDeleteQueue clears
432+
// the upfront isVerifiedSQSLeader check but the kv coordinator can
433+
// still drop leadership inside deleteQueueWithRetry's Dispatch; the
434+
// resulting ErrLeaderNotFound / ErrNotLeader / wrapped suffixes must
435+
// classify as 503 leader_unavailable, not the generic 500 fallthrough.
436+
// Codex P2 + Claude P1 on PR #670 confirmed the original gap.
437+
func TestTranslateAdminQueuesError_LeaderChurn(t *testing.T) {
438+
cases := []struct {
439+
name string
440+
in error
441+
}{
442+
{"kv.ErrLeaderNotFound", kv.ErrLeaderNotFound},
443+
{"adapter.ErrNotLeader", adapter.ErrNotLeader},
444+
{"adapter.ErrLeaderNotFound", adapter.ErrLeaderNotFound},
445+
{"wrapped not leader", errors.New("dispatch failed: not leader")},
446+
{"wrapped leader not found", errors.New("dispatch: leader not found")},
447+
{"wrapped leadership lost", errors.New("commit aborted: leadership lost")},
448+
{"wrapped leadership transfer", errors.New("retry exhausted: leadership transfer in progress")},
449+
}
450+
for _, tc := range cases {
451+
t.Run(tc.name, func(t *testing.T) {
452+
out := translateAdminQueuesError(tc.in)
453+
require.ErrorIs(t, out, admin.ErrQueuesNotLeader,
454+
"input %q must map to ErrQueuesNotLeader", tc.in)
455+
})
456+
}
457+
}
458+
459+
// TestTranslateAdminQueuesError_LeaderPhraseInMiddleOfMessage is the
460+
// SQS counterpart of the same Tables test — pins that the HasSuffix
461+
// matcher in isLeaderChurnError does not false-positive on
462+
// user-supplied error messages that happen to mention a leader
463+
// phrase mid-string (e.g. a queue name or attribute value that
464+
// happens to contain "not leader").
465+
func TestTranslateAdminQueuesError_LeaderPhraseInMiddleOfMessage(t *testing.T) {
466+
cases := []string{
467+
"not leader: actually a downstream error",
468+
"leader not found, but recovered automatically",
469+
"leadership lost mid-snapshot, retried successfully",
470+
}
471+
for _, msg := range cases {
472+
t.Run(msg, func(t *testing.T) {
473+
out := translateAdminQueuesError(errors.New(msg))
474+
require.NotErrorIs(t, out, admin.ErrQueuesNotLeader,
475+
"mid-message leader phrase %q must not classify as leader-churn", msg)
476+
})
477+
}
478+
}
479+
480+
// TestTranslateAdminQueuesError_UnrelatedErrorPassesThrough confirms
481+
// the leader-churn detector does not swallow unrelated errors that
482+
// happen to mention the word "leader" outside the canonical phrases.
483+
func TestTranslateAdminQueuesError_UnrelatedErrorPassesThrough(t *testing.T) {
484+
in := errors.New("team leader misconfigured")
485+
out := translateAdminQueuesError(in)
486+
require.NotErrorIs(t, out, admin.ErrQueuesNotLeader)
487+
require.Equal(t, in, out)
488+
}

web/admin/src/pages/SqsList.tsx

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,8 @@ export function SqsListPage() {
1111
<div>
1212
<h1 className="text-xl font-semibold">SQS queues</h1>
1313
<p className="text-xs text-muted">
14-
Backed by the SigV4-bypass admin entrypoints in
15-
<code className="font-mono ml-1">adapter/sqs_admin.go</code>.
16-
Detail pages show the new approximate counters from Phase 3.A.
14+
List, describe, and delete SQS queues. Detail pages also surface
15+
the approximate visible / in-flight / delayed message counts.
1716
</p>
1817
</div>
1918
<button type="button" className="btn-secondary" onClick={queues.reload}>

0 commit comments

Comments
 (0)