Skip to content

Commit 18e1d64

Browse files
fix(provision): honour caller deadline in postgres/mongo/queue k8s Provision (#53)
The redis k8s backend was fixed for the pro-provision-hang bug (#52), but the postgres, mongo, AND queue backends still derived their provisioning context from context.Background(), discarding the caller's gRPC deadline. When the api caller's deadline fired (or it cancelled the RPC), the provisioner kept blocking for up to 5m on a wedged PVC/CSI attach and the api handler hung — the same bug class that drove the e2e-prod flakiness, but for db/nosql/queue instead of cache. Each backend now derives provCtx from the incoming ctx with a 5m ceiling as a backstop (whichever of the two deadlines is earlier wins), mirroring redis/k8s.go provisionContext. The api grants a generous provision deadline (provisionTimeout: 4m anon / 5m pro), so legitimate 30-90s pod startup is unaffected; only pathological hangs and early cancellations now fast-fail. waitPodReady already honours ctx.Err() on each poll; the shared server.mapError already maps context.DeadlineExceeded/Canceled to retryable gRPC codes, so no server change is needed. Also bounded the rollback namespace-delete with a fresh 30s background ctx (redis parity): cleanup still runs when the incoming ctx is already cancelled, and a wedged apiserver cannot pin the goroutine. Tests: TestProvision_HonoursCallerDeadline per backend — a 300ms caller ctx fast-fails (<30s) with an error wrapping context.DeadlineExceeded, instead of blocking for the pod-ready ceiling. make gate green. Co-authored-by: Manas Srivastava <[email protected]> Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent d30d3bb commit 18e1d64

6 files changed

Lines changed: 194 additions & 13 deletions

File tree

internal/backend/mongo/k8s.go

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -307,15 +307,30 @@ func (b *K8sBackend) Provision(ctx context.Context, token, tier string) (*Creden
307307

308308
rollback := func(step string, cause error) error {
309309
slog.Error("k8s.mongo.provision.rollback", "step", step, "namespace", ns, "error", cause)
310-
_ = b.cs.CoreV1().Namespaces().Delete(context.Background(), ns, metav1.DeleteOptions{})
310+
// Cleanup uses a FRESH background ctx with its own bound: the incoming ctx
311+
// may already be cancelled (often WHY we are rolling back), but the
312+
// namespace teardown must still run so a failed provision does not leak a
313+
// half-built namespace. Bounded so a wedged apiserver can't pin the goroutine.
314+
delCtx, delCancel := context.WithTimeout(context.Background(), 30*time.Second)
315+
defer delCancel()
316+
_ = b.cs.CoreV1().Namespaces().Delete(delCtx, ns, metav1.DeleteOptions{})
311317
return fmt.Errorf("k8s mongo: %s: %w", step, cause)
312318
}
313319

314-
// Use a fresh background context — pod startup can take minutes, far exceeding
315-
// any gRPC request deadline on the incoming ctx.
320+
// Bound the provisioning sequence by BOTH the caller's deadline/cancellation
321+
// and a hard 5m server-side ceiling (min of the two). Deriving from the
322+
// incoming gRPC ctx — NOT context.Background() — is the pro-provision-hang fix
323+
// (mirrors redis/k8s.go provisionContext, #52): when the api caller's deadline
324+
// fires or it cancels the RPC, every k8s call / waitPodReady poll returns
325+
// promptly instead of blocking up to 5m on a wedged PVC/CSI attach. The api
326+
// grants a generous provision deadline (provisionTimeout: 4m anon / 5m pro),
327+
// so legitimate 30-90s pod startup is unaffected — only pathological hangs and
328+
// early cancellations now fast-fail (mapError classifies the ctx error as a
329+
// retryable gRPC status → api soft-deletes + 503s). The ceiling still backstops
330+
// a caller that passes no deadline at all.
316331
// Carry the teamID value forward so applyNamespace can label the namespace
317332
// with instant.dev/owner-team (pentest 2026-05-16 fix).
318-
provCtx, provCancel := context.WithTimeout(context.Background(), 5*time.Minute)
333+
provCtx, provCancel := context.WithTimeout(ctx, 5*time.Minute)
319334
defer provCancel()
320335
if teamID, ok := ctx.Value(ctxkeys.TeamIDKey).(string); ok && teamID != "" {
321336
provCtx = context.WithValue(provCtx, ctxkeys.TeamIDKey, teamID)
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package mongo
2+
3+
// k8s_caller_deadline_test.go — regression guard for the pro-provision-hang bug
4+
// class (mirrors redis #52, applied to postgres/mongo/queue on 2026-06-08).
5+
// See the postgres sibling file for the full rationale: provCtx now derives from
6+
// the incoming ctx (with a 5m ceiling), so a stalled provision fast-fails on the
7+
// caller's deadline instead of blocking up to 5m on a background clock.
8+
9+
import (
10+
"context"
11+
"errors"
12+
"testing"
13+
"time"
14+
15+
"k8s.io/client-go/kubernetes/fake"
16+
)
17+
18+
// TestProvision_HonoursCallerDeadline: a Provision whose pod never becomes Ready
19+
// must return promptly, bounded by the caller's deadline — NOT block for
20+
// mongoK8sReadyTO on a background clock.
21+
func TestProvision_HonoursCallerDeadline(t *testing.T) {
22+
cs := fake.NewClientset() // empty cluster: the mongo pod never becomes Ready
23+
b := &K8sBackend{cs: cs, storageClass: "standard", image: "mongo:7"}
24+
25+
ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
26+
defer cancel()
27+
28+
start := time.Now()
29+
_, err := b.Provision(ctx, "pro-token", "pro")
30+
elapsed := time.Since(start)
31+
32+
if err == nil {
33+
t.Fatal("Provision should fail when the mongo pod never becomes Ready; got nil error")
34+
}
35+
if elapsed > 30*time.Second {
36+
t.Fatalf("PROVISION-HANG REGRESSION: Provision took %s; it must honour the caller's "+
37+
"~300ms deadline and fast-fail, not block for mongoK8sReadyTO. This means provCtx "+
38+
"no longer derives from the caller's ctx.", elapsed)
39+
}
40+
if !errors.Is(err, context.DeadlineExceeded) {
41+
t.Errorf("Provision error should wrap context.DeadlineExceeded (got %v) so the shared "+
42+
"server.mapError surfaces a retryable gRPC status (api soft-deletes + 503s)", err)
43+
}
44+
}

internal/backend/postgres/k8s.go

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -244,16 +244,30 @@ func (b *K8sBackend) Provision(ctx context.Context, token, tier string, connLimi
244244

245245
rollback := func(step string, cause error) error {
246246
slog.Error("k8s.postgres.provision.rollback", "step", step, "namespace", ns, "error", cause)
247-
_ = b.cs.CoreV1().Namespaces().Delete(context.Background(), ns, metav1.DeleteOptions{})
247+
// Cleanup uses a FRESH background ctx with its own bound: the incoming ctx
248+
// may already be cancelled (often WHY we are rolling back), but the
249+
// namespace teardown must still run so a failed provision does not leak a
250+
// half-built namespace. Bounded so a wedged apiserver can't pin the goroutine.
251+
delCtx, delCancel := context.WithTimeout(context.Background(), 30*time.Second)
252+
defer delCancel()
253+
_ = b.cs.CoreV1().Namespaces().Delete(delCtx, ns, metav1.DeleteOptions{})
248254
return fmt.Errorf("k8s postgres: %s: %w", step, cause)
249255
}
250256

251-
// Use a fresh background context for the provisioning sequence.
252-
// The gRPC request context (ctx) has a short deadline that would cancel
253-
// waitPodReady, which can legitimately take 1–3 minutes for pod startup.
257+
// Bound the provisioning sequence by BOTH the caller's deadline/cancellation
258+
// and a hard 5m server-side ceiling (min of the two). Deriving from the
259+
// incoming gRPC ctx — NOT context.Background() — is the pro-provision-hang fix
260+
// (mirrors redis/k8s.go provisionContext, #52): when the api caller's deadline
261+
// fires or it cancels the RPC, every k8s call / waitPodReady poll returns
262+
// promptly instead of blocking up to 5m on a wedged PVC/CSI attach. The api
263+
// grants a generous provision deadline (provisionTimeout: 4m anon / 5m pro),
264+
// so legitimate 30-90s pod startup is unaffected — only pathological hangs and
265+
// early cancellations now fast-fail (mapError classifies the ctx error as a
266+
// retryable gRPC status → api soft-deletes + 503s). The ceiling still backstops
267+
// a caller that passes no deadline at all.
254268
// Carry the teamID value forward so applyNamespace can label the namespace
255269
// with instant.dev/owner-team (pentest 2026-05-16 fix).
256-
provCtx, provCancel := context.WithTimeout(context.Background(), 5*time.Minute)
270+
provCtx, provCancel := context.WithTimeout(ctx, 5*time.Minute)
257271
defer provCancel()
258272
if teamID, ok := ctx.Value(ctxkeys.TeamIDKey).(string); ok && teamID != "" {
259273
provCtx = context.WithValue(provCtx, ctxkeys.TeamIDKey, teamID)
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
package postgres
2+
3+
// k8s_caller_deadline_test.go — regression guard for the pro-provision-hang bug
4+
// class (mirrors redis #52, applied to postgres/mongo/queue on 2026-06-08).
5+
//
6+
// Before the fix the provisioning context derived from context.Background(), so
7+
// when the api caller's gRPC deadline fired (or it cancelled the RPC) the
8+
// provisioner kept blocking up to 5m on a wedged PVC/CSI attach and the api
9+
// handler hung. The fix derives provCtx from the incoming ctx (with a 5m
10+
// ceiling backstop), so a stalled provision fast-fails bounded by the caller's
11+
// deadline and the shared mapError maps the ctx error to a retryable gRPC status.
12+
13+
import (
14+
"context"
15+
"errors"
16+
"testing"
17+
"time"
18+
19+
"k8s.io/client-go/kubernetes/fake"
20+
)
21+
22+
// TestProvision_HonoursCallerDeadline: a Provision whose pod never becomes Ready
23+
// must return promptly, bounded by the caller's deadline — NOT block for
24+
// k8sReadyTimeout on a background clock.
25+
func TestProvision_HonoursCallerDeadline(t *testing.T) {
26+
cs := fake.NewClientset() // empty cluster: the postgres pod never becomes Ready
27+
b := &K8sBackend{cs: cs, storageClass: "standard", image: "postgres:16"}
28+
29+
// Caller deadline far shorter than k8sReadyTimeout and the 5m ceiling.
30+
ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
31+
defer cancel()
32+
33+
start := time.Now()
34+
_, err := b.Provision(ctx, "pro-token", "pro", 8)
35+
elapsed := time.Since(start)
36+
37+
if err == nil {
38+
t.Fatal("Provision should fail when the postgres pod never becomes Ready; got nil error")
39+
}
40+
if elapsed > 30*time.Second {
41+
t.Fatalf("PROVISION-HANG REGRESSION: Provision took %s; it must honour the caller's "+
42+
"~300ms deadline and fast-fail, not block for k8sReadyTimeout. This means provCtx "+
43+
"no longer derives from the caller's ctx.", elapsed)
44+
}
45+
if !errors.Is(err, context.DeadlineExceeded) {
46+
t.Errorf("Provision error should wrap context.DeadlineExceeded (got %v) so the shared "+
47+
"server.mapError surfaces a retryable gRPC status (api soft-deletes + 503s)", err)
48+
}
49+
}

internal/backend/queue/k8s.go

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -208,15 +208,30 @@ func (b *K8sBackend) Provision(ctx context.Context, token, tier string) (*Creden
208208

209209
rollback := func(step string, cause error) error {
210210
slog.Error("k8s.nats.provision.rollback", "step", step, "namespace", ns, "error", cause)
211-
_ = b.cs.CoreV1().Namespaces().Delete(context.Background(), ns, metav1.DeleteOptions{})
211+
// Cleanup uses a FRESH background ctx with its own bound: the incoming ctx
212+
// may already be cancelled (often WHY we are rolling back), but the
213+
// namespace teardown must still run so a failed provision does not leak a
214+
// half-built namespace. Bounded so a wedged apiserver can't pin the goroutine.
215+
delCtx, delCancel := context.WithTimeout(context.Background(), 30*time.Second)
216+
defer delCancel()
217+
_ = b.cs.CoreV1().Namespaces().Delete(delCtx, ns, metav1.DeleteOptions{})
212218
return fmt.Errorf("k8s nats: %s: %w", step, cause)
213219
}
214220

215-
// Use a fresh background context — pod startup can take minutes, far exceeding
216-
// any gRPC request deadline on the incoming ctx.
221+
// Bound the provisioning sequence by BOTH the caller's deadline/cancellation
222+
// and a hard 5m server-side ceiling (min of the two). Deriving from the
223+
// incoming gRPC ctx — NOT context.Background() — is the pro-provision-hang fix
224+
// (mirrors redis/k8s.go provisionContext, #52): when the api caller's deadline
225+
// fires or it cancels the RPC, every k8s call / waitPodReady poll returns
226+
// promptly instead of blocking up to 5m on a wedged PVC/CSI attach. The api
227+
// grants a generous provision deadline (provisionTimeout: 4m anon / 5m pro),
228+
// so legitimate 30-90s pod startup is unaffected — only pathological hangs and
229+
// early cancellations now fast-fail (mapError classifies the ctx error as a
230+
// retryable gRPC status → api soft-deletes + 503s). The ceiling still backstops
231+
// a caller that passes no deadline at all.
217232
// Carry the teamID value forward so applyNamespace can label the namespace
218233
// with instant.dev/owner-team (pentest 2026-05-16 fix).
219-
provCtx, provCancel := context.WithTimeout(context.Background(), 5*time.Minute)
234+
provCtx, provCancel := context.WithTimeout(ctx, 5*time.Minute)
220235
defer provCancel()
221236
if teamID, ok := ctx.Value(ctxkeys.TeamIDKey).(string); ok && teamID != "" {
222237
provCtx = context.WithValue(provCtx, ctxkeys.TeamIDKey, teamID)
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package queue
2+
3+
// k8s_caller_deadline_test.go — regression guard for the pro-provision-hang bug
4+
// class (mirrors redis #52, applied to postgres/mongo/queue on 2026-06-08).
5+
// See the postgres sibling file for the full rationale: provCtx now derives from
6+
// the incoming ctx (with a 5m ceiling), so a stalled provision fast-fails on the
7+
// caller's deadline instead of blocking up to 5m on a background clock.
8+
9+
import (
10+
"context"
11+
"errors"
12+
"testing"
13+
"time"
14+
15+
"k8s.io/client-go/kubernetes/fake"
16+
)
17+
18+
// TestProvision_HonoursCallerDeadline: a Provision whose pod never becomes Ready
19+
// must return promptly, bounded by the caller's deadline — NOT block for
20+
// natsK8sReadyTO on a background clock.
21+
func TestProvision_HonoursCallerDeadline(t *testing.T) {
22+
cs := fake.NewClientset() // empty cluster: the nats pod never becomes Ready
23+
b := &K8sBackend{cs: cs, storageClass: "standard", image: "nats:2.10-alpine"}
24+
25+
ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
26+
defer cancel()
27+
28+
start := time.Now()
29+
_, err := b.Provision(ctx, "pro-token", "pro")
30+
elapsed := time.Since(start)
31+
32+
if err == nil {
33+
t.Fatal("Provision should fail when the nats pod never becomes Ready; got nil error")
34+
}
35+
if elapsed > 30*time.Second {
36+
t.Fatalf("PROVISION-HANG REGRESSION: Provision took %s; it must honour the caller's "+
37+
"~300ms deadline and fast-fail, not block for natsK8sReadyTO. This means provCtx "+
38+
"no longer derives from the caller's ctx.", elapsed)
39+
}
40+
if !errors.Is(err, context.DeadlineExceeded) {
41+
t.Errorf("Provision error should wrap context.DeadlineExceeded (got %v) so the shared "+
42+
"server.mapError surfaces a retryable gRPC status (api soft-deletes + 503s)", err)
43+
}
44+
}

0 commit comments

Comments
 (0)