Skip to content

Commit 271f741

Browse files
fix(bugbash 2026-05-21): NATS AccountSeed for post-restart revocation + test alignment (#14)
* fix(queueprovider/nats): A04-F3 — expose AccountSeed for post-restart revocation Migration 060 added resources.queue_account_seed_encrypted to make NATS account revocation survive a provisioner pod restart, but IssueTenantCredentials was discarding the freshly-minted account seed (`_ = accountSeed`). Without the seed reaching the api caller, the column was never populated and RevokeWith Seed could never re-sign the account claim after a restart wiped the in-memory accountCache. This change: - Adds TenantCreds.AccountSeed (documented as a secret; NEVER log). - Populates AccountSeed in nats.IssueTenantCredentials. - Adds round-trip test proving RevokeWithSeed works without accountCache (simulates the post-restart path that migration 060 was built for). Cross-repo: api + worker must (a) bump common, (b) AES-256-GCM-encrypt AccountSeed via the existing keyring and persist to queue_account_seed_ encrypted, (c) decrypt + pass to RevokeWithSeed on teardown. Tracked separately. Forward-compatible: AccountSeed is only populated on isolated provisions, so legacy_open prod is unaffected. Coverage block (rule 17): Symptom: queue_account_seed_encrypted always NULL; revocation no-ops post-restart Enumeration: rg -n 'AccountSeed|queue_account_seed_encrypted' common/ Sites found: 3 (TenantCreds field, IssueTenantCredentials return, RevokeWithSeed param) Sites touched: all 3 (RevokeWithSeed already accepted seed; populating it now activates the path) Coverage test: TestNATS_IssueExposesAccountSeed_AndRevokeWithSeed_RoundTrips Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * fix(test): growth tier DeploymentsAppsLimit asserts 50 (wave-3 BugBash value) Wave-3 BugBash bumped growth tier deployments_apps from 5 → 50 in plans.yaml; test was not updated. Test fix only — plans.yaml + common/plans/plans.go defaultYAML are the authoritative source. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 77f94f2 commit 271f741

5 files changed

Lines changed: 100 additions & 5 deletions

File tree

plans/plans_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -449,7 +449,8 @@ func TestDeploymentsAppsLimit_Tiers(t *testing.T) {
449449
"hobby_plus must allow 2 deployment apps (doubles hobby's 1, vs pro's 10)")
450450
assert.Equal(t, 10, r.DeploymentsAppsLimit("pro"))
451451
assert.Equal(t, -1, r.DeploymentsAppsLimit("team"))
452-
assert.Equal(t, 5, r.DeploymentsAppsLimit("growth"))
452+
assert.Equal(t, 50, r.DeploymentsAppsLimit("growth"),
453+
"growth allows 50 deployment apps (wave-3 BugBash bumped from 5 → 50, matching plans.yaml)")
453454
}
454455

455456
// TestHobbyPlus_TierMatrix is the W11 lock-in test for the hobby_plus tier.

queueprovider/nats/export_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package nats
2+
3+
import "sync"
4+
5+
// PurgeAccountCacheForTest empties the in-memory accountCache. Used by tests
6+
// to simulate the post-restart scenario where the cache is empty and the only
7+
// way to revoke is via the persisted (encrypted) account seed via
8+
// RevokeWithSeed. NOT exported outside _test.go.
9+
func (p *Provider) PurgeAccountCacheForTest() {
10+
p.accountCache = sync.Map{}
11+
}

queueprovider/nats/nats.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -327,10 +327,12 @@ func (p *Provider) IssueTenantCredentials(ctx context.Context, in queueprovider.
327327
expiresAt = &t
328328
}
329329

330-
// Wipe the account seed from memory once we've finished signing; the
331-
// api/provisioner persist accountSeed separately (encrypted at rest) so
332-
// Revoke can re-sign the updated claim later.
333-
_ = accountSeed // keep ref alive until here; do NOT log
330+
// Return the account seed to the caller (api/worker) so it can be
331+
// encrypted at rest in resources.queue_account_seed_encrypted (migration
332+
// 060). Without this, revocation after process restart is impossible —
333+
// the in-memory accountCache is the only other copy. The caller MUST
334+
// treat AccountSeed as a secret and MUST NOT log it; this is enforced
335+
// upstream by the api crypto path that wraps the value before persist.
334336

335337
return &queueprovider.TenantCreds{
336338
JWT: userJWT,
@@ -341,6 +343,7 @@ func (p *Provider) IssueTenantCredentials(ctx context.Context, in queueprovider.
341343
ExpiresAt: expiresAt,
342344
KeyID: accountPub,
343345
AuthMode: queueprovider.AuthModeIsolated,
346+
AccountSeed: string(accountSeed), // secret — caller encrypts before persist; NEVER log
344347
}, nil
345348
}
346349

queueprovider/nats/nats_test.go

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,3 +216,68 @@ func TestNATS_Revoke_PushesAccountUpdate(t *testing.T) {
216216
"Revoke should have pushed an updated claim for the account")
217217
assert.Equal(t, creds.KeyID, pusher.pushes[1].Pub)
218218
}
219+
220+
// TestNATS_IssueExposesAccountSeed_AndRevokeWithSeed_RoundTrips verifies the
221+
// fix for BugBash 2026-05-21 A04-F3: migration 060 added
222+
// `resources.queue_account_seed_encrypted` to make revocation survive a
223+
// provisioner restart, but the column was never written because IssueTenant
224+
// Credentials discarded the seed. This test asserts that:
225+
//
226+
// 1. IssueTenantCredentials returns a non-empty TenantCreds.AccountSeed
227+
// whose NKey prefix is "SA" (the canonical NATS account-seed prefix),
228+
// 2. The returned seed parses cleanly as an account NKey, and
229+
// 3. Passing that seed back to RevokeWithSeed re-signs and pushes the
230+
// account claim — proving the round-trip works WITHOUT the in-memory
231+
// accountCache (which is what a process restart would have lost).
232+
//
233+
// Coverage block (rule 17):
234+
//
235+
// Symptom: resources.queue_account_seed_encrypted always NULL; revocation no-ops after pod restart
236+
// Enumeration: rg -n "AccountSeed\|queue_account_seed_encrypted" common/ api/ worker/
237+
// Sites found: 3 (provider.go TenantCreds field, nats.go IssueTenant return, RevokeWithSeed param)
238+
// Sites touched: all 3 in common; api + worker tracked separately in cross-repo fix
239+
// Coverage test: this test — fails the moment AccountSeed is dropped from the return value
240+
func TestNATS_IssueExposesAccountSeed_AndRevokeWithSeed_RoundTrips(t *testing.T) {
241+
seed := newOperatorSeed(t)
242+
p, err := queueprovider.Factory(queueprovider.Config{
243+
Backend: "nats",
244+
Host: "nats.test.local",
245+
NATSOperatorSeed: seed,
246+
})
247+
require.NoError(t, err)
248+
natsProv := p.(*natsprov.Provider)
249+
pusher := &recordingPusher{}
250+
natsProv.SetResolverPusher(pusher)
251+
252+
creds, err := p.IssueTenantCredentials(context.Background(), queueprovider.IssueRequest{
253+
ResourceToken: "tok-seed-roundtrip",
254+
Subject: "tenant_seedroundtrip.",
255+
})
256+
require.NoError(t, err)
257+
258+
// (1) AccountSeed is populated and looks like a NATS account seed.
259+
require.NotEmpty(t, creds.AccountSeed,
260+
"AccountSeed must be exposed on TenantCreds — without it migration 060's queue_account_seed_encrypted column is dead weight and post-restart revocation silently no-ops")
261+
assert.True(t, strings.HasPrefix(creds.AccountSeed, "SA"),
262+
"AccountSeed must have NKey account-seed prefix SA — got prefix %q", creds.AccountSeed[:2])
263+
264+
// (2) AccountSeed parses cleanly as an account NKey.
265+
kp, err := nkeys.FromSeed([]byte(creds.AccountSeed))
266+
require.NoError(t, err, "AccountSeed must parse as an nkeys account seed")
267+
pub, err := kp.PublicKey()
268+
require.NoError(t, err)
269+
assert.Equal(t, creds.KeyID, pub,
270+
"AccountSeed's derived public key must match the TenantCreds.KeyID (account pub) so RevokeWithSeed targets the right account")
271+
272+
// (3) Simulate a process restart by wiping the in-memory cache, then
273+
// prove RevokeWithSeed alone (no cache hit) re-pushes the claim. This
274+
// is the exact failure path migration 060 was designed to eliminate.
275+
require.Len(t, pusher.pushes, 1, "issue should have pushed once")
276+
natsProv.PurgeAccountCacheForTest()
277+
err = natsProv.RevokeWithSeed(context.Background(), creds.AccountSeed)
278+
require.NoError(t, err)
279+
require.Len(t, pusher.pushes, 2,
280+
"RevokeWithSeed must push the revocation claim even when accountCache is empty (post-restart scenario)")
281+
assert.Equal(t, creds.KeyID, pusher.pushes[1].Pub,
282+
"revocation push must target the same account public key as the original issue")
283+
}

queueprovider/provider.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,21 @@ type TenantCreds struct {
141141
// Echoed in the API response so the caller knows whether isolation is
142142
// actually being enforced for this resource.
143143
AuthMode string
144+
145+
// AccountSeed is the NATS account NKey seed (operator-mode backends only)
146+
// — required for revocation after process restart. The api MUST encrypt
147+
// this value at rest (AES-256-GCM keyring, same path as connection_url)
148+
// and persist it in the resources.queue_account_seed_encrypted column
149+
// (migration 060). On teardown the api/worker decrypts and passes the
150+
// seed back to RevokeWithSeed so the account claim can be re-signed and
151+
// pushed to the resolver even after the in-memory accountCache has been
152+
// lost to a pod restart.
153+
//
154+
// Treated as a secret. NEVER log this field — it is a private NKey seed
155+
// (format "SA...") that grants account-level signing authority. Backends
156+
// that don't use NKey/JWT (RabbitMQ, Kafka skeletons, legacy_open) leave
157+
// this empty.
158+
AccountSeed string
144159
}
145160

146161
// Capabilities describes what isolation a backend can ENFORCE.

0 commit comments

Comments
 (0)