@@ -34,6 +34,7 @@ package redis
3434import (
3535 "bytes"
3636 "context"
37+ "errors"
3738 "fmt"
3839 "strings"
3940 "testing"
@@ -46,6 +47,8 @@ import (
4647 "k8s.io/apimachinery/pkg/runtime/schema"
4748 "k8s.io/client-go/kubernetes/fake"
4849 ktesting "k8s.io/client-go/testing"
50+
51+ "instant.dev/provisioner/internal/ctxkeys"
4952)
5053
5154// ─── Tier sizing regression guards ──────────────────────────────────────────
@@ -59,17 +62,17 @@ func TestSizingForTier_MaxmemoryMB_MatchesPlansYAML(t *testing.T) {
5962 wantMB int // expected maxmemoryMB (mirrors plans.yaml redis_memory_mb)
6063 expectLimit bool // true if --maxmemory flag should be applied
6164 }{
62- {"anonymous" , 5 , true }, // plans.yaml: anonymous redis_memory_mb = 5
63- {"hobby" , 50 , true }, // plans.yaml: hobby redis_memory_mb = 50
64- {"hobby_yearly" , 50 , true }, // plans.yaml: hobby_yearly mirrors hobby
65- {"hobby_plus" , 50 , true }, // plans.yaml: hobby_plus redis_memory_mb = 50
65+ {"anonymous" , 5 , true }, // plans.yaml: anonymous redis_memory_mb = 5
66+ {"hobby" , 50 , true }, // plans.yaml: hobby redis_memory_mb = 50
67+ {"hobby_yearly" , 50 , true }, // plans.yaml: hobby_yearly mirrors hobby
68+ {"hobby_plus" , 50 , true }, // plans.yaml: hobby_plus redis_memory_mb = 50
6669 {"hobby_plus_yearly" , 50 , true }, // plans.yaml: hobby_plus_yearly mirrors hobby_plus
67- {"pro" , 512 , true }, // plans.yaml: pro redis_memory_mb = 512
68- {"pro_yearly" , 512 , true },// plans.yaml: pro_yearly mirrors pro
69- {"team" , - 1 , false }, // "no cap" pod-start default — flag omitted; Regrade reconciles to registry (team=1536, finite post strict-80)
70- {"team_yearly" , - 1 , false }, // team_yearly mirrors team's pod-start sizing default
71- {"growth" , 1024 , true }, // plans.yaml: growth redis_memory_mb = 1024
72- {"unknown" , 50 , true }, // unknown → hobby fallback
70+ {"pro" , 512 , true }, // plans.yaml: pro redis_memory_mb = 512
71+ {"pro_yearly" , 512 , true }, // plans.yaml: pro_yearly mirrors pro
72+ {"team" , - 1 , false }, // "no cap" pod-start default — flag omitted; Regrade reconciles to registry (team=1536, finite post strict-80)
73+ {"team_yearly" , - 1 , false }, // team_yearly mirrors team's pod-start sizing default
74+ {"growth" , 1024 , true }, // plans.yaml: growth redis_memory_mb = 1024
75+ {"unknown" , 50 , true }, // unknown → hobby fallback
7376 }
7477 for _ , tc := range cases {
7578 t .Run (tc .tier , func (t * testing.T ) {
@@ -188,7 +191,7 @@ func TestRouteKeyTTLForTier_PaidTiersNeverExpire(t *testing.T) {
188191 {"pro" , persistRouteKey },
189192 {"growth" , persistRouteKey },
190193 {"team" , persistRouteKey },
191- {"" , persistRouteKey }, // empty/unknown → fail safe to persistent
194+ {"" , persistRouteKey }, // empty/unknown → fail safe to persistent
192195 {"some_future_tier" , persistRouteKey },
193196 }
194197 for _ , tc := range cases {
@@ -355,17 +358,17 @@ func (f *fakeExecor) ExecInPod(_ context.Context, namespace, podName, containerN
355358// at the cap instead of silently evicting customer keys).
356359func TestExecCommandConstruction (t * testing.T ) {
357360 cases := []struct {
358- tier string
359- targetMB int
361+ tier string
362+ targetMB int
360363 wantTargetBytes int64
361- wantPolicy string
362- wantMaxmem string // expected substring in the CONFIG SET maxmemory command
364+ wantPolicy string
365+ wantMaxmem string // expected substring in the CONFIG SET maxmemory command
363366 }{
364367 {"anonymous" , 5 , 5 * 1024 * 1024 , "noeviction" , "5242880" },
365368 {"hobby" , 50 , 50 * 1024 * 1024 , "noeviction" , "52428800" },
366369 {"pro" , 512 , 512 * 1024 * 1024 , "noeviction" , "536870912" },
367370 {"growth" , 1024 , 1024 * 1024 * 1024 , "noeviction" , "1073741824" },
368- {"team" , - 1 , 0 , "noeviction" , "maxmemory 0" }, // unlimited — check full "maxmemory 0" substring
371+ {"team" , - 1 , 0 , "noeviction" , "maxmemory 0" }, // unlimited — check full "maxmemory 0" substring
369372 }
370373
371374 for _ , tc := range cases {
@@ -804,6 +807,167 @@ func TestRegrade_TeamTier_Unlimited(t *testing.T) {
804807 }
805808}
806809
810+ // ─── Provision caller-deadline / fast-fail guards (pro-provision-hang fix) ───
811+
812+ // TestProvision_ProTier_HonoursCallerDeadline is the core regression guard for
813+ // fix/redis-pro-provision-hang.
814+ //
815+ // THE BUG: provisioning a Redis cache for a Pro/Team team HUNG (>60s, never
816+ // returned) while the anonymous provision returned in ~6s. The Pro tier uses a
817+ // real 10Gi PVC (pvcMi=10240); when the block-storage attach stalls, the Redis
818+ // pod never reaches Ready and waitPodReady polled for the full redisK8sReadyTO
819+ // (3 minutes). The old code derived the provisioning context from
820+ // context.Background() with a hardcoded 5-minute timeout, so it IGNORED the
821+ // caller's deadline entirely: even after the api gRPC call timed out, the
822+ // provisioner kept blocking the handler. The anonymous path never hit this
823+ // because emptyDir pods (pvcMi=0) go Ready in seconds.
824+ //
825+ // THE FIX: provisionContext derives from the caller's ctx, so the moment the api
826+ // gives up, Provision returns promptly with a (wrapped) context error.
827+ //
828+ // This test uses a fake clientset whose Redis pod NEVER becomes Ready (no
829+ // scheduler in the fake → no PodReady condition is ever set), simulating the
830+ // stalled-PVC-attach condition. With a short caller deadline, Provision MUST
831+ // return well before redisK8sReadyTO — proving it honours the caller's deadline.
832+ // If a future change reverts to context.Background(), this test fails (times out
833+ // the t.Fatal guard / takes ~3 minutes).
834+ func TestProvision_ProTier_HonoursCallerDeadline (t * testing.T ) {
835+ cs := fake .NewClientset () // empty cluster: the redis pod never becomes Ready
836+ b := & K8sBackend {cs : cs , storageClass : "standard" , image : "redis:7-alpine" }
837+
838+ // Caller deadline far shorter than redisK8sReadyTO (3m) and the ceiling (5m).
839+ ctx , cancel := context .WithTimeout (context .Background (), 300 * time .Millisecond )
840+ defer cancel ()
841+
842+ start := time .Now ()
843+ _ , err := b .Provision (ctx , "pro-token" , "pro" )
844+ elapsed := time .Since (start )
845+
846+ if err == nil {
847+ t .Fatal ("Provision should fail when the Redis pod never becomes Ready; got nil error" )
848+ }
849+ // MUST return promptly — bounded by the caller deadline, NOT redisK8sReadyTO.
850+ // Allow generous slack for the fake-client round trips, but it must be far
851+ // below the 3-minute pod-ready ceiling that the old (background-context) code
852+ // would have blocked for.
853+ if elapsed > 30 * time .Second {
854+ t .Fatalf ("PRO-PROVISION-HANG REGRESSION: Provision took %s; it must honour the " +
855+ "caller's deadline (~300ms) and fast-fail, not block for redisK8sReadyTO (%s). " +
856+ "This means the provisioning context no longer derives from the caller's ctx." ,
857+ elapsed , redisK8sReadyTO )
858+ }
859+ // The error must be the caller-deadline error so mapError can classify it as
860+ // a retryable gRPC status (api soft-deletes + 503s rather than a hard 500).
861+ if ! errors .Is (err , context .DeadlineExceeded ) {
862+ t .Errorf ("Provision error should wrap context.DeadlineExceeded (got %v) so mapError " +
863+ "surfaces a retryable status; a non-context error would map to Internal/500" , err )
864+ }
865+ }
866+
867+ // TestProvision_AnonTier_HonoursCallerDeadline asserts the fast-fail is
868+ // tier-independent: even the anonymous path (which in prod is fast because it
869+ // uses emptyDir) returns promptly when the caller cancels. Under the fake
870+ // clientset the pod never goes Ready for any tier, so this verifies the
871+ // caller-deadline propagation rather than the emptyDir speedup.
872+ func TestProvision_AnonTier_HonoursCallerDeadline (t * testing.T ) {
873+ cs := fake .NewClientset ()
874+ b := & K8sBackend {cs : cs , storageClass : "standard" , image : "redis:7-alpine" }
875+
876+ ctx , cancel := context .WithTimeout (context .Background (), 300 * time .Millisecond )
877+ defer cancel ()
878+
879+ start := time .Now ()
880+ _ , err := b .Provision (ctx , "anon-token" , "anonymous" )
881+ elapsed := time .Since (start )
882+
883+ if err == nil {
884+ t .Fatal ("Provision should fail when the Redis pod never becomes Ready; got nil error" )
885+ }
886+ if elapsed > 30 * time .Second {
887+ t .Fatalf ("Provision took %s; must fast-fail on caller deadline, not block %s" , elapsed , redisK8sReadyTO )
888+ }
889+ if ! errors .Is (err , context .DeadlineExceeded ) {
890+ t .Errorf ("Provision error should wrap context.DeadlineExceeded; got %v" , err )
891+ }
892+ }
893+
894+ // TestProvision_CallerCancel_FastFails asserts that an explicit caller
895+ // cancellation (not just a deadline) also unblocks Provision promptly. This
896+ // covers the api-cancels-the-RPC case (e.g. the agent disconnected) — the
897+ // provisioner must stop blocking, not grind on for minutes.
898+ func TestProvision_CallerCancel_FastFails (t * testing.T ) {
899+ cs := fake .NewClientset ()
900+ b := & K8sBackend {cs : cs , storageClass : "standard" , image : "redis:7-alpine" }
901+
902+ ctx , cancel := context .WithCancel (context .Background ())
903+ // Cancel shortly after the call starts so waitPodReady is mid-poll.
904+ go func () {
905+ time .Sleep (200 * time .Millisecond )
906+ cancel ()
907+ }()
908+
909+ start := time .Now ()
910+ _ , err := b .Provision (ctx , "cancel-token" , "pro" )
911+ elapsed := time .Since (start )
912+
913+ if err == nil {
914+ t .Fatal ("Provision should fail when the caller cancels; got nil error" )
915+ }
916+ if elapsed > 30 * time .Second {
917+ t .Fatalf ("Provision took %s; caller cancellation must unblock it, not block %s" , elapsed , redisK8sReadyTO )
918+ }
919+ if ! errors .Is (err , context .Canceled ) {
920+ t .Errorf ("Provision error should wrap context.Canceled; got %v" , err )
921+ }
922+ }
923+
924+ // TestProvisionContext_HonoursCallerDeadline verifies provisionContext's two
925+ // bounding layers:
926+ // 1. When the caller's deadline is sooner than the ceiling, the derived
927+ // context's deadline equals the caller's (caller wins).
928+ // 2. The teamID context value is carried forward (so applyNamespace can label
929+ // the namespace with instant.dev/owner-team).
930+ func TestProvisionContext_HonoursCallerDeadline (t * testing.T ) {
931+ callerDeadline := time .Now ().Add (2 * time .Second ) // sooner than redisK8sProvisionCeiling (5m)
932+ parent , cancel := context .WithDeadline (context .Background (), callerDeadline )
933+ defer cancel ()
934+ parent = context .WithValue (parent , ctxkeys .TeamIDKey , "team-xyz" )
935+
936+ provCtx , provCancel := provisionContext (parent )
937+ defer provCancel ()
938+
939+ dl , ok := provCtx .Deadline ()
940+ if ! ok {
941+ t .Fatal ("provisionContext result must have a deadline" )
942+ }
943+ // The caller's 2s deadline is sooner than the 5m ceiling, so it must win.
944+ if dl .After (callerDeadline .Add (50 * time .Millisecond )) {
945+ t .Errorf ("provisionContext deadline = %v; caller's sooner deadline (%v) should win over the ceiling" , dl , callerDeadline )
946+ }
947+ if got , _ := provCtx .Value (ctxkeys .TeamIDKey ).(string ); got != "team-xyz" {
948+ t .Errorf ("provisionContext should carry teamID forward; got %q want %q" , got , "team-xyz" )
949+ }
950+ }
951+
952+ // TestProvisionContext_CeilingBoundsNoDeadlineCaller verifies that when the
953+ // caller passes a context with NO deadline (e.g. an internal background call),
954+ // the ceiling still bounds the provision so a wedged attach can't pin a
955+ // goroutine forever.
956+ func TestProvisionContext_CeilingBoundsNoDeadlineCaller (t * testing.T ) {
957+ provCtx , provCancel := provisionContext (context .Background ())
958+ defer provCancel ()
959+
960+ dl , ok := provCtx .Deadline ()
961+ if ! ok {
962+ t .Fatal ("provisionContext must impose the ceiling deadline even when the caller has none" )
963+ }
964+ // Deadline should be ~redisK8sProvisionCeiling from now.
965+ until := time .Until (dl )
966+ if until <= 0 || until > redisK8sProvisionCeiling + time .Second {
967+ t .Errorf ("provisionContext ceiling deadline = %s from now; want ~%s" , until , redisK8sProvisionCeiling )
968+ }
969+ }
970+
807971// ─── Helpers ─────────────────────────────────────────────────────────────────
808972
809973// runningPod creates a minimal Running pod for use in fake.Clientset.
0 commit comments