fix(provision): honour caller deadline in postgres/mongo/queue k8s Provision (#53)

mastermanas805 · claude · web-flow · commit 18e1d6480660 · 2026-06-08T22:55:53.000+05:30
The redis k8s backend was fixed for the pro-provision-hang bug (#52) but the postgres, mongo, AND queue backends still derived their provisioning context from context.Background(), discarding the caller's gRPC deadline. When the api caller's deadline fired (or it cancelled the RPC), the provisioner kept blocking up to 5m on a wedged PVC/CSI attach and the api handler hung — the same class that drove the e2e-prod flakiness, for db/nosql/queue instead of cache. Each backend now derives provCtx from the incoming ctx with a 5m ceiling backstop (min of the two), mirroring redis/k8s.go provisionContext. The api grants a generous provision deadline (provisionTimeout: 4m anon / 5m pro), so legitimate 30-90s pod startup is unaffected; only pathological hangs + early cancellations now fast-fail. waitPodReady already honours ctx.Err() each poll; the shared server.mapError already maps context.DeadlineExceeded/Canceled to retryable gRPC codes, so no server change is needed. Also bounded the rollback namespace-delete to a fresh 30s background ctx (redis parity) so cleanup runs even when the incoming ctx is cancelled without a wedged apiserver pinning the goroutine. Tests: TestProvision_HonoursCallerDeadline per backend — a 300ms caller ctx fast-fails (<30s) wrapping context.DeadlineExceeded, instead of blocking for the pod-ready ceiling. make gate green. Co-authored-by: Manas Srivastava <[email protected]> Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/internal/backend/mongo/k8s.go b/internal/backend/mongo/k8s.go
@@ -307,15 +307,30 @@ func (b *K8sBackend) Provision(ctx context.Context, token, tier string) (*Creden
 
 	rollback := func(step string, cause error) error {
 		slog.Error("k8s.mongo.provision.rollback", "step", step, "namespace", ns, "error", cause)
-		_ = b.cs.CoreV1().Namespaces().Delete(context.Background(), ns, metav1.DeleteOptions{})
+		// Cleanup uses a FRESH background ctx with its own bound: the incoming ctx
+		// may already be cancelled (often WHY we are rolling back), but the
+		// namespace teardown must still run so a failed provision does not leak a
+		// half-built namespace. Bounded so a wedged apiserver can't pin the goroutine.
+		delCtx, delCancel := context.WithTimeout(context.Background(), 30*time.Second)
+		defer delCancel()
+		_ = b.cs.CoreV1().Namespaces().Delete(delCtx, ns, metav1.DeleteOptions{})
 		return fmt.Errorf("k8s mongo: %s: %w", step, cause)
 	}
 
-	// Use a fresh background context — pod startup can take minutes, far exceeding
-	// any gRPC request deadline on the incoming ctx.
+	// Bound the provisioning sequence by BOTH the caller's deadline/cancellation
+	// and a hard 5m server-side ceiling (min of the two). Deriving from the
+	// incoming gRPC ctx — NOT context.Background() — is the pro-provision-hang fix
+	// (mirrors redis/k8s.go provisionContext, #52): when the api caller's deadline
+	// fires or it cancels the RPC, every k8s call / waitPodReady poll returns
+	// promptly instead of blocking up to 5m on a wedged PVC/CSI attach. The api
+	// grants a generous provision deadline (provisionTimeout: 4m anon / 5m pro),
+	// so legitimate 30-90s pod startup is unaffected — only pathological hangs and
+	// early cancellations now fast-fail (mapError classifies the ctx error as a
+	// retryable gRPC status → api soft-deletes + 503s). The ceiling still backstops
+	// a caller that passes no deadline at all.
 	// Carry the teamID value forward so applyNamespace can label the namespace
 	// with instant.dev/owner-team (pentest 2026-05-16 fix).
-	provCtx, provCancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	provCtx, provCancel := context.WithTimeout(ctx, 5*time.Minute)
 	defer provCancel()
 	if teamID, ok := ctx.Value(ctxkeys.TeamIDKey).(string); ok && teamID != "" {
 		provCtx = context.WithValue(provCtx, ctxkeys.TeamIDKey, teamID)
diff --git a/internal/backend/mongo/k8s_caller_deadline_test.go b/internal/backend/mongo/k8s_caller_deadline_test.go
@@ -0,0 +1,44 @@
+package mongo
+
+// k8s_caller_deadline_test.go — regression guard for the pro-provision-hang bug
+// class (mirrors redis #52, applied to postgres/mongo/queue on 2026-06-08).
+// See the postgres sibling file for the full rationale: provCtx now derives from
+// the incoming ctx (with a 5m ceiling), so a stalled provision fast-fails on the
+// caller's deadline instead of blocking up to 5m on a background clock.
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+
+	"k8s.io/client-go/kubernetes/fake"
+)
+
+// TestProvision_HonoursCallerDeadline: a Provision whose pod never becomes Ready
+// must return promptly, bounded by the caller's deadline — NOT block for
+// mongoK8sReadyTO on a background clock.
+func TestProvision_HonoursCallerDeadline(t *testing.T) {
+	cs := fake.NewClientset() // empty cluster: the mongo pod never becomes Ready
+	b := &K8sBackend{cs: cs, storageClass: "standard", image: "mongo:7"}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
+	defer cancel()
+
+	start := time.Now()
+	_, err := b.Provision(ctx, "pro-token", "pro")
+	elapsed := time.Since(start)
+
+	if err == nil {
+		t.Fatal("Provision should fail when the mongo pod never becomes Ready; got nil error")
+	}
+	if elapsed > 30*time.Second {
+		t.Fatalf("PROVISION-HANG REGRESSION: Provision took %s; it must honour the caller's "+
+			"~300ms deadline and fast-fail, not block for mongoK8sReadyTO. This means provCtx "+
+			"no longer derives from the caller's ctx.", elapsed)
+	}
+	if !errors.Is(err, context.DeadlineExceeded) {
+		t.Errorf("Provision error should wrap context.DeadlineExceeded (got %v) so the shared "+
+			"server.mapError surfaces a retryable gRPC status (api soft-deletes + 503s)", err)
+	}
+}
diff --git a/internal/backend/postgres/k8s.go b/internal/backend/postgres/k8s.go
@@ -244,16 +244,30 @@ func (b *K8sBackend) Provision(ctx context.Context, token, tier string, connLimi
 
 	rollback := func(step string, cause error) error {
 		slog.Error("k8s.postgres.provision.rollback", "step", step, "namespace", ns, "error", cause)
-		_ = b.cs.CoreV1().Namespaces().Delete(context.Background(), ns, metav1.DeleteOptions{})
+		// Cleanup uses a FRESH background ctx with its own bound: the incoming ctx
+		// may already be cancelled (often WHY we are rolling back), but the
+		// namespace teardown must still run so a failed provision does not leak a
+		// half-built namespace. Bounded so a wedged apiserver can't pin the goroutine.
+		delCtx, delCancel := context.WithTimeout(context.Background(), 30*time.Second)
+		defer delCancel()
+		_ = b.cs.CoreV1().Namespaces().Delete(delCtx, ns, metav1.DeleteOptions{})
 		return fmt.Errorf("k8s postgres: %s: %w", step, cause)
 	}
 
-	// Use a fresh background context for the provisioning sequence.
-	// The gRPC request context (ctx) has a short deadline that would cancel
-	// waitPodReady, which can legitimately take 1–3 minutes for pod startup.
+	// Bound the provisioning sequence by BOTH the caller's deadline/cancellation
+	// and a hard 5m server-side ceiling (min of the two). Deriving from the
+	// incoming gRPC ctx — NOT context.Background() — is the pro-provision-hang fix
+	// (mirrors redis/k8s.go provisionContext, #52): when the api caller's deadline
+	// fires or it cancels the RPC, every k8s call / waitPodReady poll returns
+	// promptly instead of blocking up to 5m on a wedged PVC/CSI attach. The api
+	// grants a generous provision deadline (provisionTimeout: 4m anon / 5m pro),
+	// so legitimate 30-90s pod startup is unaffected — only pathological hangs and
+	// early cancellations now fast-fail (mapError classifies the ctx error as a
+	// retryable gRPC status → api soft-deletes + 503s). The ceiling still backstops
+	// a caller that passes no deadline at all.
 	// Carry the teamID value forward so applyNamespace can label the namespace
 	// with instant.dev/owner-team (pentest 2026-05-16 fix).
-	provCtx, provCancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	provCtx, provCancel := context.WithTimeout(ctx, 5*time.Minute)
 	defer provCancel()
 	if teamID, ok := ctx.Value(ctxkeys.TeamIDKey).(string); ok && teamID != "" {
 		provCtx = context.WithValue(provCtx, ctxkeys.TeamIDKey, teamID)
diff --git a/internal/backend/postgres/k8s_caller_deadline_test.go b/internal/backend/postgres/k8s_caller_deadline_test.go
@@ -0,0 +1,49 @@
+package postgres
+
+// k8s_caller_deadline_test.go — regression guard for the pro-provision-hang bug
+// class (mirrors redis #52, applied to postgres/mongo/queue on 2026-06-08).
+//
+// Before the fix the provisioning context derived from context.Background(), so
+// when the api caller's gRPC deadline fired (or it cancelled the RPC) the
+// provisioner kept blocking up to 5m on a wedged PVC/CSI attach and the api
+// handler hung. The fix derives provCtx from the incoming ctx (with a 5m
+// ceiling backstop), so a stalled provision fast-fails bounded by the caller's
+// deadline and the shared mapError maps the ctx error to a retryable gRPC status.
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+
+	"k8s.io/client-go/kubernetes/fake"
+)
+
+// TestProvision_HonoursCallerDeadline: a Provision whose pod never becomes Ready
+// must return promptly, bounded by the caller's deadline — NOT block for
+// k8sReadyTimeout on a background clock.
+func TestProvision_HonoursCallerDeadline(t *testing.T) {
+	cs := fake.NewClientset() // empty cluster: the postgres pod never becomes Ready
+	b := &K8sBackend{cs: cs, storageClass: "standard", image: "postgres:16"}
+
+	// Caller deadline far shorter than k8sReadyTimeout and the 5m ceiling.
+	ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
+	defer cancel()
+
+	start := time.Now()
+	_, err := b.Provision(ctx, "pro-token", "pro", 8)
+	elapsed := time.Since(start)
+
+	if err == nil {
+		t.Fatal("Provision should fail when the postgres pod never becomes Ready; got nil error")
+	}
+	if elapsed > 30*time.Second {
+		t.Fatalf("PROVISION-HANG REGRESSION: Provision took %s; it must honour the caller's "+
+			"~300ms deadline and fast-fail, not block for k8sReadyTimeout. This means provCtx "+
+			"no longer derives from the caller's ctx.", elapsed)
+	}
+	if !errors.Is(err, context.DeadlineExceeded) {
+		t.Errorf("Provision error should wrap context.DeadlineExceeded (got %v) so the shared "+
+			"server.mapError surfaces a retryable gRPC status (api soft-deletes + 503s)", err)
+	}
+}
diff --git a/internal/backend/queue/k8s.go b/internal/backend/queue/k8s.go
@@ -208,15 +208,30 @@ func (b *K8sBackend) Provision(ctx context.Context, token, tier string) (*Creden
 
 	rollback := func(step string, cause error) error {
 		slog.Error("k8s.nats.provision.rollback", "step", step, "namespace", ns, "error", cause)
-		_ = b.cs.CoreV1().Namespaces().Delete(context.Background(), ns, metav1.DeleteOptions{})
+		// Cleanup uses a FRESH background ctx with its own bound: the incoming ctx
+		// may already be cancelled (often WHY we are rolling back), but the
+		// namespace teardown must still run so a failed provision does not leak a
+		// half-built namespace. Bounded so a wedged apiserver can't pin the goroutine.
+		delCtx, delCancel := context.WithTimeout(context.Background(), 30*time.Second)
+		defer delCancel()
+		_ = b.cs.CoreV1().Namespaces().Delete(delCtx, ns, metav1.DeleteOptions{})
 		return fmt.Errorf("k8s nats: %s: %w", step, cause)
 	}
 
-	// Use a fresh background context — pod startup can take minutes, far exceeding
-	// any gRPC request deadline on the incoming ctx.
+	// Bound the provisioning sequence by BOTH the caller's deadline/cancellation
+	// and a hard 5m server-side ceiling (min of the two). Deriving from the
+	// incoming gRPC ctx — NOT context.Background() — is the pro-provision-hang fix
+	// (mirrors redis/k8s.go provisionContext, #52): when the api caller's deadline
+	// fires or it cancels the RPC, every k8s call / waitPodReady poll returns
+	// promptly instead of blocking up to 5m on a wedged PVC/CSI attach. The api
+	// grants a generous provision deadline (provisionTimeout: 4m anon / 5m pro),
+	// so legitimate 30-90s pod startup is unaffected — only pathological hangs and
+	// early cancellations now fast-fail (mapError classifies the ctx error as a
+	// retryable gRPC status → api soft-deletes + 503s). The ceiling still backstops
+	// a caller that passes no deadline at all.
 	// Carry the teamID value forward so applyNamespace can label the namespace
 	// with instant.dev/owner-team (pentest 2026-05-16 fix).
-	provCtx, provCancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	provCtx, provCancel := context.WithTimeout(ctx, 5*time.Minute)
 	defer provCancel()
 	if teamID, ok := ctx.Value(ctxkeys.TeamIDKey).(string); ok && teamID != "" {
 		provCtx = context.WithValue(provCtx, ctxkeys.TeamIDKey, teamID)
diff --git a/internal/backend/queue/k8s_caller_deadline_test.go b/internal/backend/queue/k8s_caller_deadline_test.go
@@ -0,0 +1,44 @@
+package queue
+
+// k8s_caller_deadline_test.go — regression guard for the pro-provision-hang bug
+// class (mirrors redis #52, applied to postgres/mongo/queue on 2026-06-08).
+// See the postgres sibling file for the full rationale: provCtx now derives from
+// the incoming ctx (with a 5m ceiling), so a stalled provision fast-fails on the
+// caller's deadline instead of blocking up to 5m on a background clock.
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+
+	"k8s.io/client-go/kubernetes/fake"
+)
+
+// TestProvision_HonoursCallerDeadline: a Provision whose pod never becomes Ready
+// must return promptly, bounded by the caller's deadline — NOT block for
+// natsK8sReadyTO on a background clock.
+func TestProvision_HonoursCallerDeadline(t *testing.T) {
+	cs := fake.NewClientset() // empty cluster: the nats pod never becomes Ready
+	b := &K8sBackend{cs: cs, storageClass: "standard", image: "nats:2.10-alpine"}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
+	defer cancel()
+
+	start := time.Now()
+	_, err := b.Provision(ctx, "pro-token", "pro")
+	elapsed := time.Since(start)
+
+	if err == nil {
+		t.Fatal("Provision should fail when the nats pod never becomes Ready; got nil error")
+	}
+	if elapsed > 30*time.Second {
+		t.Fatalf("PROVISION-HANG REGRESSION: Provision took %s; it must honour the caller's "+
+			"~300ms deadline and fast-fail, not block for natsK8sReadyTO. This means provCtx "+
+			"no longer derives from the caller's ctx.", elapsed)
+	}
+	if !errors.Is(err, context.DeadlineExceeded) {
+		t.Errorf("Provision error should wrap context.DeadlineExceeded (got %v) so the shared "+
+			"server.mapError surfaces a retryable gRPC status (api soft-deletes + 503s)", err)
+	}
+}