Skip to content

Commit 05fae56

Browse files
fix(provisioner): bound ProvisionCache gRPC with a 45s deadline
Authed /cache/new (PRO Redis provision) was observed hanging >60s while anonymous /cache/new returned in ~6s. The provisioner-side root cause is fixed separately; this is the defensive api-side guard. provClient ALREADY applies a per-call deadline to every provision RPC via provisionTimeout(tier) — the issue is the value: 4m (anon/free/hobby) and 5m (pro/team/growth). Those budgets exist for Postgres/Mongo, which spin up a per-tenant pod (PVC bind + image pull + DB init, 30-90s on a cold node). A Redis namespace carve does none of that (~1-6s for every tier), so a hung provisioner could hang the whole /cache/new request for up to 5 minutes. Introduce cacheProvisionTimeout = 45s (named, no inline magic number; a package var so the timeout->503 unit test can shrink it) and use it in ProvisionCache only. 45s is ~7x the slowest observed healthy carve, so a legitimately slow-but-OK provision still succeeds while a genuine hang fails fast. On timeout the existing handler path runs unchanged: soft-delete the pending resource + 503 provision_failed — no orphan, no hang. db/vector/nosql/queue intentionally keep provisionTimeout(tier): they are pod-backed and genuinely need the cold-pod budget; tightening them to 45s would regress legitimate slow provisions. Noted as a deliberate non-change. Tests (internal/provisioner, no DB needed): - TestProvisionCache_HangBecomesDeadlineError: blocking mock provisioner -> ProvisionCache returns gRPC DeadlineExceeded in ~the bounded window, not an indefinite hang (the error the handler turns into a 503). - TestCacheProvisionTimeout_Value: pins 45s and asserts it stays tighter than provisionTimeout(pro) so a future edit can't silently reintroduce the multi-minute hang. ProvisionCache coverage 100%. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 9239227 commit 05fae56

2 files changed

Lines changed: 111 additions & 1 deletion

File tree

internal/provisioner/client.go

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,28 @@ const (
3333
provisionerCircuitCooldown = 30 * time.Second
3434
)
3535

36+
// cacheProvisionTimeout bounds the api → provisioner ProvisionCache RPC.
37+
//
38+
// Unlike Postgres/Mongo, a Redis namespace provision does NOT spin up a
39+
// per-tenant pod with a PVC bind + image pull + DB init — it carves a
40+
// namespace/ACL out of an already-running Redis (observed ~1–6s end to
41+
// end, anonymous AND authenticated). It therefore does not need the
42+
// 4–5m cold-pod budget that provisionTimeout() grants the pod-backed
43+
// resource types.
44+
//
45+
// Granting Redis that same multi-minute budget means a *hung* provisioner
46+
// hangs the whole /cache/new request for up to 5 minutes (the symptom that
47+
// motivated this fix: authed /cache/new observed hanging >60s while anon
48+
// returned in ~6s). A generous-but-bounded 45s ceiling is ~7× the slowest
49+
// observed healthy provision, so a legitimately slow-but-OK Redis carve
50+
// still succeeds, while a genuine hang fails fast — the existing handler
51+
// error path then soft-deletes the pending resource and returns a clean
52+
// 503 provision_failed instead of an indefinite hang.
53+
//
54+
// A package var (not const) so the timeout→503 unit test can shrink it to
55+
// a few ms instead of blocking the suite for 45s; production never mutates it.
56+
var cacheProvisionTimeout = 45 * time.Second
57+
3658
// Credentials matches the shape returned by local providers.
3759
type Credentials struct {
3860
URL string
@@ -313,7 +335,10 @@ func (c *Client) ProvisionPostgres(ctx context.Context, token, tier, teamID stri
313335
func (c *Client) ProvisionCache(ctx context.Context, token, tier, teamID string) (*Credentials, error) {
314336
return callWithBreaker(c.breaker, func() (*Credentials, error) {
315337
start := time.Now()
316-
ctx, cancel := context.WithTimeout(c.ctxWithTeamID(c.ctxWithAuth(ctx), teamID), provisionTimeout(tier))
338+
// Redis carve is fast for every tier — bound it tighter than the
339+
// pod-backed resource types so a hung provisioner returns a clean
340+
// 503 in ≤45s instead of hanging /cache/new for minutes.
341+
ctx, cancel := context.WithTimeout(c.ctxWithTeamID(c.ctxWithAuth(ctx), teamID), cacheProvisionTimeout)
317342
defer cancel()
318343
resp, err := c.grpc.ProvisionResource(ctx, &provisionerv1.ProvisionRequest{
319344
Token: token,

internal/provisioner/client_cov_test.go

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@ type covServer struct {
4343
// Override ProvisionResource success payload.
4444
provisionResp *provisionerv1.ProvisionResponse
4545

46+
// If non-nil, ProvisionResource blocks until this channel is closed OR
47+
// the RPC context is cancelled (whichever comes first) — simulating a
48+
// hung provisioner so the client-side deadline can be exercised.
49+
provisionBlock chan struct{}
50+
4651
// Sniffed inbound metadata from the most recent ProvisionResource call.
4752
lastAuthToken []string
4853
lastRequestID []string
@@ -63,6 +68,15 @@ func (s *covServer) ProvisionResource(ctx context.Context, req *provisionerv1.Pr
6368
s.lastRequestID = md.Get("x-request-id")
6469
s.lastTeamID = md.Get("x-instant-team-id")
6570
}
71+
if s.provisionBlock != nil {
72+
// Simulate a hung provisioner: block until released or the RPC
73+
// context (carrying the client's deadline) is cancelled.
74+
select {
75+
case <-s.provisionBlock:
76+
case <-ctx.Done():
77+
return nil, status.FromContextError(ctx.Err()).Err()
78+
}
79+
}
6680
if s.provisionErr != nil {
6781
return nil, s.provisionErr
6882
}
@@ -411,6 +425,77 @@ func TestProvisionCache_SuccessAndError(t *testing.T) {
411425
}
412426
}
413427

428+
// TestCacheProvisionTimeout_Value pins the production deadline so a future
429+
// edit that re-grants Redis the multi-minute cold-pod budget (re-introducing
430+
// the /cache/new hang) reds the suite.
431+
func TestCacheProvisionTimeout_Value(t *testing.T) {
432+
if cacheProvisionTimeout != 45*time.Second {
433+
t.Errorf("cacheProvisionTimeout = %s; want 45s (generous-but-bounded Redis carve ceiling)", cacheProvisionTimeout)
434+
}
435+
// Must be well under provisionTimeout's pod-backed budget — that gap is
436+
// the whole point: a hung provisioner fails /cache/new fast instead of
437+
// hanging it for minutes.
438+
if cacheProvisionTimeout >= provisionTimeout("pro") {
439+
t.Errorf("cacheProvisionTimeout (%s) must be tighter than provisionTimeout(pro) (%s)",
440+
cacheProvisionTimeout, provisionTimeout("pro"))
441+
}
442+
}
443+
444+
// TestProvisionCache_HangBecomesDeadlineError proves the defensive fix: when
445+
// the provisioner hangs, ProvisionCache's own client-side deadline fires and
446+
// returns a DeadlineExceeded-class error in ~the bounded window — NOT an
447+
// indefinite block. This is the error the cache handler converts into a 503
448+
// provision_failed after soft-deleting the pending resource.
449+
//
450+
// We shrink cacheProvisionTimeout to a few ms (restored via t.Cleanup) so the
451+
// test is fast, and give the *caller* context a generous timeout so the error
452+
// is provably caused by our internal deadline, not the caller's.
453+
func TestProvisionCache_HangBecomesDeadlineError(t *testing.T) {
454+
orig := cacheProvisionTimeout
455+
cacheProvisionTimeout = 50 * time.Millisecond
456+
t.Cleanup(func() { cacheProvisionTimeout = orig })
457+
458+
h := newCovHarness(t, "hang-secret")
459+
defer h.close()
460+
461+
// Server blocks forever (channel never closed) → only the client-side
462+
// deadline can end the RPC.
463+
h.mock.provisionBlock = make(chan struct{})
464+
465+
// Caller context is far more generous than the 50ms internal deadline,
466+
// so a failure here is unambiguously our deadline firing.
467+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
468+
defer cancel()
469+
470+
done := make(chan error, 1)
471+
start := time.Now()
472+
go func() {
473+
_, err := h.client.ProvisionCache(ctx, "tok", "pro", "team-hang")
474+
done <- err
475+
}()
476+
477+
select {
478+
case err := <-done:
479+
elapsed := time.Since(start)
480+
if err == nil {
481+
t.Fatal("ProvisionCache returned nil; want a DeadlineExceeded-class error from the bounded deadline")
482+
}
483+
if !strings.Contains(err.Error(), "provisioner.ProvisionCache:") {
484+
t.Errorf("err = %v; want wrapped ProvisionCache error", err)
485+
}
486+
if st, ok := status.FromError(err); !ok || st.Code() != codes.DeadlineExceeded {
487+
t.Errorf("err = %v; want gRPC DeadlineExceeded", err)
488+
}
489+
// Bounded: must finish well within the caller's 10s, near the 50ms
490+
// internal deadline — proves the api no longer hangs on a hung provisioner.
491+
if elapsed > 5*time.Second {
492+
t.Errorf("ProvisionCache took %s; expected ~the bounded deadline (50ms), not an indefinite hang", elapsed)
493+
}
494+
case <-time.After(8 * time.Second):
495+
t.Fatal("ProvisionCache hung past the bounded deadline — the defensive timeout did not fire")
496+
}
497+
}
498+
414499
func TestProvisionNoSQL_SuccessAndError(t *testing.T) {
415500
h := newCovHarness(t, "mongo-secret")
416501
defer h.close()

0 commit comments

Comments
 (0)