|
| 1 | +package server_test |
| 2 | + |
| 3 | +// server_live_roundtrip_test.go — REAL-BACKEND integration coverage for the |
| 4 | +// gRPC server layer's Provision → Regrade → Deprovision lifecycle. |
| 5 | +// |
| 6 | +// Why this file exists (the truehomie-db DROP incident class, 2026-06-03): |
| 7 | +// every existing server_test.go / server_coverage_test.go test injects a *fake* |
| 8 | +// backend, so the actual DROP DATABASE / DROP USER / ALTER ROLE DDL has never |
| 9 | +// run through the real gRPC handler path (breaker wrapping, tier→connLimit |
| 10 | +// routing, mapError, response shaping, idempotent re-deprovision). High |
| 11 | +// statement coverage from mocks does NOT prove the destroy/regrade DDL is |
| 12 | +// correct end-to-end. These tests drive the genuine RPC handlers |
| 13 | +// (server.ProvisionResource / RegradeResource / DeprovisionResource) against a |
| 14 | +// real Postgres and a real Redis, and assert the backing infra is actually |
| 15 | +// created, regraded, and torn down — and that a second Deprovision is a clean |
| 16 | +// idempotent no-op (the #9 DROP IF EXISTS fix). |
| 17 | +// |
| 18 | +// Env-gated: skips cleanly when the backend URL is unset, so `go test -short` |
| 19 | +// in CI without a backend stays green; runs for real when the backend is |
| 20 | +// present (local dev Postgres at localhost:5432, Redis at localhost:6379, and |
| 21 | +// CI's coverage.yml docker services). |
| 22 | + |
import (
	"context"
	"errors"
	"fmt"
	"os"
	"strings"
	"testing"
	"time"

	"github.com/jackc/pgx/v5"
	goredis "github.com/redis/go-redis/v9"

	commonv1 "instant.dev/proto/common/v1"
	provisionerv1 "instant.dev/proto/provisioner/v1"
	"instant.dev/provisioner/internal/backend/postgres"
	"instant.dev/provisioner/internal/backend/redis"
	"instant.dev/provisioner/internal/circuit"
	"instant.dev/provisioner/internal/config"
	"instant.dev/provisioner/internal/server"
)
| 42 | + |
// livePostgresAdminDSN resolves the admin DSN used by the live-Postgres tests.
// It checks TEST_POSTGRES_CUSTOMERS_URL, TEST_POSTGRES_ADMIN_DSN and
// CUSTOMER_POSTGRES_DSN in that order and returns the first non-empty value.
// When none is set it returns "", and the caller is expected to t.Skip.
//
// NOTE(review): the variable names are meant to match the ones read by the
// backend/postgres live tests (not visible from this file) — confirm they stay
// in sync.
func livePostgresAdminDSN() string {
	// Listed in precedence order: the earliest non-empty variable wins.
	candidates := [...]string{
		"TEST_POSTGRES_CUSTOMERS_URL",
		"TEST_POSTGRES_ADMIN_DSN",
		"CUSTOMER_POSTGRES_DSN",
	}
	for i := 0; i < len(candidates); i++ {
		dsn := os.Getenv(candidates[i])
		if dsn == "" {
			continue
		}
		return dsn
	}
	return ""
}
| 55 | + |
// liveRedisURL resolves the Redis URL used by the live-Redis test.
// TEST_REDIS_URL takes precedence over CUSTOMER_REDIS_URL; the result is ""
// when neither variable holds a non-empty value.
func liveRedisURL() string {
	if primary := os.Getenv("TEST_REDIS_URL"); primary != "" {
		return primary
	}
	// Getenv already yields "" for an unset variable, which is exactly the
	// "nothing configured" result callers check for.
	return os.Getenv("CUSTOMER_REDIS_URL")
}
| 65 | + |
| 66 | +// liveServerWithRealPostgres builds a Server wired to a REAL LocalBackend |
| 67 | +// Postgres (shared-cluster admin DSN) and fresh per-test breakers. No pool, no |
| 68 | +// dedicated backend, so every RPC takes the live shared-cluster path. |
| 69 | +func liveServerWithRealPostgres(adminDSN string) *server.Server { |
| 70 | + return server.NewWithBackends( |
| 71 | + &config.Config{}, |
| 72 | + postgres.NewBackend("", adminDSN, "", "", ""), // "" → LocalBackend(adminDSN) |
| 73 | + nil, nil, nil, nil, // redis/mongo/queue/storage unused on this path |
| 74 | + nil, nil, nil, nil, // no dedicated backends |
| 75 | + nil, // no pool → live provision path |
| 76 | + ).SetBreakers(circuit.NewBreakers()) |
| 77 | +} |
| 78 | + |
| 79 | +// liveServerWithRealRedis builds a Server wired to a REAL Redis LocalBackend. |
| 80 | +func liveServerWithRealRedis(redisAddr string) *server.Server { |
| 81 | + return server.NewWithBackends( |
| 82 | + &config.Config{}, |
| 83 | + nil, |
| 84 | + redis.NewBackend("", redisAddr), // "" → LocalBackend(redisAddr) |
| 85 | + nil, nil, nil, |
| 86 | + nil, nil, nil, nil, |
| 87 | + nil, |
| 88 | + ).SetBreakers(circuit.NewBreakers()) |
| 89 | +} |
| 90 | + |
// liveToken builds a per-test token of the form
// "tok<unix-nanoseconds><sanitised test name>".
//
// The test name has every "/" and " " replaced by "_" and is cut to at most
// 24 bytes, so the token stays short enough to be prefixed with db_/usr_ or
// used as a Redis key prefix. The nanosecond timestamp is what keeps tokens
// from repeated runs apart.
func liveToken(t *testing.T) string {
	t.Helper()

	name := strings.ReplaceAll(t.Name(), "/", "_")
	name = strings.ReplaceAll(name, " ", "_")

	const maxNameLen = 24
	if len(name) > maxNameLen {
		name = name[:maxNameLen]
	}

	stamp := time.Now().UnixNano()
	return fmt.Sprintf("tok%d%s", stamp, name)
}
| 101 | + |
| 102 | +// pgConnLimit queries the actual rolconnlimit for usr_<token> on the live |
| 103 | +// cluster, or returns (0, err) if the role does not exist. |
| 104 | +func pgConnLimit(t *testing.T, adminDSN, username string) (int, bool) { |
| 105 | + t.Helper() |
| 106 | + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) |
| 107 | + defer cancel() |
| 108 | + conn, err := pgx.Connect(ctx, adminDSN) |
| 109 | + if err != nil { |
| 110 | + t.Fatalf("pgConnLimit connect: %v", err) |
| 111 | + } |
| 112 | + defer conn.Close(ctx) //nolint:errcheck |
| 113 | + var lim int |
| 114 | + err = conn.QueryRow(ctx, "SELECT rolconnlimit FROM pg_roles WHERE rolname=$1", username).Scan(&lim) |
| 115 | + if err == pgx.ErrNoRows { |
| 116 | + return 0, false |
| 117 | + } |
| 118 | + if err != nil { |
| 119 | + t.Fatalf("pgConnLimit query: %v", err) |
| 120 | + } |
| 121 | + return lim, true |
| 122 | +} |
| 123 | + |
| 124 | +// pgDatabaseExists reports whether db_<token> exists on the live cluster. |
| 125 | +func pgDatabaseExists(t *testing.T, adminDSN, dbName string) bool { |
| 126 | + t.Helper() |
| 127 | + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) |
| 128 | + defer cancel() |
| 129 | + conn, err := pgx.Connect(ctx, adminDSN) |
| 130 | + if err != nil { |
| 131 | + t.Fatalf("pgDatabaseExists connect: %v", err) |
| 132 | + } |
| 133 | + defer conn.Close(ctx) //nolint:errcheck |
| 134 | + var n int |
| 135 | + if err := conn.QueryRow(ctx, "SELECT count(*) FROM pg_database WHERE datname=$1", dbName).Scan(&n); err != nil { |
| 136 | + t.Fatalf("pgDatabaseExists query: %v", err) |
| 137 | + } |
| 138 | + return n > 0 |
| 139 | +} |
| 140 | + |
| 141 | +// cleanupPG drops db_<token>/usr_<token> best-effort so repeated runs and |
| 142 | +// failed assertions never leak objects on the shared cluster. |
| 143 | +func cleanupPG(t *testing.T, adminDSN, dbName, username string) { |
| 144 | + t.Helper() |
| 145 | + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) |
| 146 | + defer cancel() |
| 147 | + conn, err := pgx.Connect(ctx, adminDSN) |
| 148 | + if err != nil { |
| 149 | + t.Logf("cleanupPG connect: %v", err) |
| 150 | + return |
| 151 | + } |
| 152 | + defer conn.Close(ctx) //nolint:errcheck |
| 153 | + _, _ = conn.Exec(ctx, fmt.Sprintf("DROP DATABASE IF EXISTS %q WITH (FORCE)", dbName)) |
| 154 | + _, _ = conn.Exec(ctx, fmt.Sprintf("DROP USER IF EXISTS %q", username)) |
| 155 | +} |
| 156 | + |
| 157 | +// TestServer_Postgres_Provision_Regrade_Deprovision_LiveRoundTrip is the |
| 158 | +// truehomie-DROP-class integration test for the gRPC server layer: it drives |
| 159 | +// the real RPC handlers against a real Postgres and asserts the backing |
| 160 | +// db_/usr_ are CREATED by ProvisionResource, the role CONNECTION LIMIT is |
| 161 | +// adjusted by RegradeResource, the db_/usr_ are DROPped by DeprovisionResource, |
| 162 | +// and a second DeprovisionResource is a clean idempotent no-op (DROP IF EXISTS). |
| 163 | +func TestServer_Postgres_Provision_Regrade_Deprovision_LiveRoundTrip(t *testing.T) { |
| 164 | + adminDSN := livePostgresAdminDSN() |
| 165 | + if adminDSN == "" { |
| 166 | + t.Skip("TEST_POSTGRES_CUSTOMERS_URL/TEST_POSTGRES_ADMIN_DSN unset — skipping live-Postgres gRPC round-trip") |
| 167 | + } |
| 168 | + srv := liveServerWithRealPostgres(adminDSN) |
| 169 | + ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second) |
| 170 | + defer cancel() |
| 171 | + |
| 172 | + token := liveToken(t) |
| 173 | + dbName := "db_" + token |
| 174 | + username := "usr_" + token |
| 175 | + t.Cleanup(func() { cleanupPG(t, adminDSN, dbName, username) }) |
| 176 | + |
| 177 | + // --- Provision (hobby tier → a positive CONNECTION LIMIT) --- |
| 178 | + provResp, err := srv.ProvisionResource(ctx, &provisionerv1.ProvisionRequest{ |
| 179 | + Token: token, |
| 180 | + ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES, |
| 181 | + Tier: "hobby", |
| 182 | + }) |
| 183 | + if err != nil { |
| 184 | + t.Fatalf("ProvisionResource(postgres, hobby): %v", err) |
| 185 | + } |
| 186 | + if provResp.DatabaseName != dbName || provResp.Username != username { |
| 187 | + t.Fatalf("ProvisionResource returned db=%q user=%q; want db=%q user=%q", |
| 188 | + provResp.DatabaseName, provResp.Username, dbName, username) |
| 189 | + } |
| 190 | + if !strings.HasPrefix(provResp.ConnectionUrl, "postgres://") { |
| 191 | + t.Errorf("ConnectionUrl = %q; want postgres:// prefix", provResp.ConnectionUrl) |
| 192 | + } |
| 193 | + if !pgDatabaseExists(t, adminDSN, dbName) { |
| 194 | + t.Fatalf("after ProvisionResource, %q does not exist on the live cluster", dbName) |
| 195 | + } |
| 196 | + hobbyLimit, ok := pgConnLimit(t, adminDSN, username) |
| 197 | + if !ok { |
| 198 | + t.Fatalf("after ProvisionResource, role %q does not exist", username) |
| 199 | + } |
| 200 | + if hobbyLimit <= 0 { |
| 201 | + t.Errorf("hobby role connection limit = %d; want a positive cap applied at CREATE USER", hobbyLimit) |
| 202 | + } |
| 203 | + |
| 204 | + // --- Regrade (pro tier → a different positive cap; assert the real ALTER ROLE took) --- |
| 205 | + regResp, err := srv.RegradeResource(ctx, &provisionerv1.RegradeRequest{ |
| 206 | + Token: token, |
| 207 | + ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES, |
| 208 | + Tier: "pro", |
| 209 | + }) |
| 210 | + if err != nil { |
| 211 | + t.Fatalf("RegradeResource(postgres, pro): %v", err) |
| 212 | + } |
| 213 | + if !regResp.Applied { |
| 214 | + t.Errorf("RegradeResource(pro).Applied = false; want true") |
| 215 | + } |
| 216 | + proLimit, ok := pgConnLimit(t, adminDSN, username) |
| 217 | + if !ok { |
| 218 | + t.Fatalf("role %q vanished after Regrade", username) |
| 219 | + } |
| 220 | + if int(regResp.AppliedConnLimit) != proLimit { |
| 221 | + t.Errorf("pg_roles.rolconnlimit = %d but RegradeResponse.AppliedConnLimit = %d; the ALTER ROLE did not match the reported cap", |
| 222 | + proLimit, regResp.AppliedConnLimit) |
| 223 | + } |
| 224 | + if proLimit == hobbyLimit { |
| 225 | + t.Errorf("pro connection limit (%d) equals hobby (%d); the Regrade did not change the live cap", proLimit, hobbyLimit) |
| 226 | + } |
| 227 | + |
| 228 | + // --- Deprovision (the DROP DATABASE / DROP USER path — truehomie class) --- |
| 229 | + depResp, err := srv.DeprovisionResource(ctx, &provisionerv1.DeprovisionRequest{ |
| 230 | + Token: token, |
| 231 | + ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES, |
| 232 | + }) |
| 233 | + if err != nil { |
| 234 | + t.Fatalf("DeprovisionResource(postgres): %v", err) |
| 235 | + } |
| 236 | + if !depResp.Deprovisioned { |
| 237 | + t.Errorf("DeprovisionResource.Deprovisioned = false; want true") |
| 238 | + } |
| 239 | + if pgDatabaseExists(t, adminDSN, dbName) { |
| 240 | + t.Errorf("after DeprovisionResource, %q still exists — DROP DATABASE did not run", dbName) |
| 241 | + } |
| 242 | + if _, ok := pgConnLimit(t, adminDSN, username); ok { |
| 243 | + t.Errorf("after DeprovisionResource, role %q still exists — DROP USER did not run", username) |
| 244 | + } |
| 245 | + |
| 246 | + // --- Idempotency: a second Deprovision must be a clean no-op (DROP IF EXISTS, #9) --- |
| 247 | + depResp2, err := srv.DeprovisionResource(ctx, &provisionerv1.DeprovisionRequest{ |
| 248 | + Token: token, |
| 249 | + ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES, |
| 250 | + }) |
| 251 | + if err != nil { |
| 252 | + t.Errorf("second DeprovisionResource (idempotent) returned %v; want nil — DROP IF EXISTS must no-op cleanly", err) |
| 253 | + } |
| 254 | + if depResp2 != nil && !depResp2.Deprovisioned { |
| 255 | + t.Errorf("second DeprovisionResource.Deprovisioned = false; want true (idempotent success)") |
| 256 | + } |
| 257 | +} |
| 258 | + |
| 259 | +// TestServer_Postgres_Reprovision_AfterDeprovision_LiveRoundTrip asserts the |
| 260 | +// (re)Provision leg of the round-trip: after a full teardown the SAME token can |
| 261 | +// be provisioned again with no "already exists" collision — i.e. Deprovision |
| 262 | +// truly removed every object Provision created. This is the regression guard |
| 263 | +// for a partial-DROP leak that would block re-provisioning. |
| 264 | +func TestServer_Postgres_Reprovision_AfterDeprovision_LiveRoundTrip(t *testing.T) { |
| 265 | + adminDSN := livePostgresAdminDSN() |
| 266 | + if adminDSN == "" { |
| 267 | + t.Skip("postgres admin DSN unset — skipping reprovision round-trip") |
| 268 | + } |
| 269 | + srv := liveServerWithRealPostgres(adminDSN) |
| 270 | + ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second) |
| 271 | + defer cancel() |
| 272 | + |
| 273 | + token := liveToken(t) |
| 274 | + dbName := "db_" + token |
| 275 | + username := "usr_" + token |
| 276 | + t.Cleanup(func() { cleanupPG(t, adminDSN, dbName, username) }) |
| 277 | + |
| 278 | + req := &provisionerv1.ProvisionRequest{ |
| 279 | + Token: token, |
| 280 | + ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES, |
| 281 | + Tier: "hobby", |
| 282 | + } |
| 283 | + depReq := &provisionerv1.DeprovisionRequest{ |
| 284 | + Token: token, |
| 285 | + ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES, |
| 286 | + } |
| 287 | + |
| 288 | + if _, err := srv.ProvisionResource(ctx, req); err != nil { |
| 289 | + t.Fatalf("first ProvisionResource: %v", err) |
| 290 | + } |
| 291 | + if _, err := srv.DeprovisionResource(ctx, depReq); err != nil { |
| 292 | + t.Fatalf("DeprovisionResource: %v", err) |
| 293 | + } |
| 294 | + // Re-provision the same token: must succeed (no orphaned db_/usr_ blocking it). |
| 295 | + if _, err := srv.ProvisionResource(ctx, req); err != nil { |
| 296 | + t.Fatalf("re-ProvisionResource after Deprovision: %v — teardown leaked an object that blocks reuse", err) |
| 297 | + } |
| 298 | + if !pgDatabaseExists(t, adminDSN, dbName) { |
| 299 | + t.Errorf("re-provisioned %q missing", dbName) |
| 300 | + } |
| 301 | + // Final teardown. |
| 302 | + if _, err := srv.DeprovisionResource(ctx, depReq); err != nil { |
| 303 | + t.Errorf("final DeprovisionResource: %v", err) |
| 304 | + } |
| 305 | +} |
| 306 | + |
| 307 | +// TestServer_Redis_Provision_Deprovision_LiveRoundTrip drives the real Redis |
| 308 | +// LocalBackend through the gRPC handlers: ProvisionResource creates an ACL |
| 309 | +// user, DeprovisionResource removes the ACL user and namespace keys, and a |
| 310 | +// second Deprovision is a clean idempotent no-op. (Redis LocalBackend has no |
| 311 | +// Regrade — only the k8s backend implements redis.Regrader.) |
| 312 | +func TestServer_Redis_Provision_Deprovision_LiveRoundTrip(t *testing.T) { |
| 313 | + redisURL := liveRedisURL() |
| 314 | + if redisURL == "" { |
| 315 | + t.Skip("TEST_REDIS_URL/CUSTOMER_REDIS_URL unset — skipping live-Redis gRPC round-trip") |
| 316 | + } |
| 317 | + opt, err := goredis.ParseURL(redisURL) |
| 318 | + if err != nil { |
| 319 | + t.Skipf("redis URL %q does not parse: %v", redisURL, err) |
| 320 | + } |
| 321 | + // Probe so we skip (not fail) when nothing is listening. |
| 322 | + probe := goredis.NewClient(opt) |
| 323 | + pctx, pcancel := context.WithTimeout(context.Background(), time.Second) |
| 324 | + defer pcancel() |
| 325 | + if perr := probe.Ping(pctx).Err(); perr != nil { |
| 326 | + _ = probe.Close() |
| 327 | + t.Skipf("redis not reachable at %s: %v", opt.Addr, perr) |
| 328 | + } |
| 329 | + |
| 330 | + srv := liveServerWithRealRedis(opt.Addr) |
| 331 | + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) |
| 332 | + defer cancel() |
| 333 | + |
| 334 | + token := liveToken(t) |
| 335 | + username := "usr_" + token |
| 336 | + t.Cleanup(func() { |
| 337 | + // Best-effort: drop the ACL user and any namespace keys directly. |
| 338 | + _ = probe.Do(context.Background(), "ACL", "DELUSER", username).Err() |
| 339 | + if keys, _, kerr := probe.Scan(context.Background(), 0, token+":*", 100).Result(); kerr == nil && len(keys) > 0 { |
| 340 | + _ = probe.Del(context.Background(), keys...).Err() |
| 341 | + } |
| 342 | + _ = probe.Close() |
| 343 | + }) |
| 344 | + |
| 345 | + // --- Provision: the gRPC handler must create the ACL user on the live pod --- |
| 346 | + provResp, err := srv.ProvisionResource(ctx, &provisionerv1.ProvisionRequest{ |
| 347 | + Token: token, |
| 348 | + ResourceType: commonv1.ResourceType_RESOURCE_TYPE_REDIS, |
| 349 | + Tier: "hobby", |
| 350 | + }) |
| 351 | + if err != nil { |
| 352 | + t.Fatalf("ProvisionResource(redis, hobby): %v", err) |
| 353 | + } |
| 354 | + if !strings.HasPrefix(provResp.ConnectionUrl, "redis://") { |
| 355 | + t.Errorf("ConnectionUrl = %q; want redis:// prefix", provResp.ConnectionUrl) |
| 356 | + } |
| 357 | + if provResp.KeyPrefix != token+":" { |
| 358 | + t.Errorf("KeyPrefix = %q; want %q", provResp.KeyPrefix, token+":") |
| 359 | + } |
| 360 | + // Assert the ACL user actually exists on the live Redis. |
| 361 | + if gerr := probe.Do(ctx, "ACL", "GETUSER", username).Err(); gerr != nil { |
| 362 | + t.Fatalf("ACL user %q not created on live Redis after ProvisionResource: %v", username, gerr) |
| 363 | + } |
| 364 | + // Write a namespace key so Deprovision has keys to reap. |
| 365 | + if serr := probe.Set(ctx, token+":k1", "v1", 0).Err(); serr != nil { |
| 366 | + t.Fatalf("seed key: %v", serr) |
| 367 | + } |
| 368 | + |
| 369 | + // --- Deprovision: removes the ACL user and the namespace keys --- |
| 370 | + depResp, err := srv.DeprovisionResource(ctx, &provisionerv1.DeprovisionRequest{ |
| 371 | + Token: token, |
| 372 | + ResourceType: commonv1.ResourceType_RESOURCE_TYPE_REDIS, |
| 373 | + }) |
| 374 | + if err != nil { |
| 375 | + t.Fatalf("DeprovisionResource(redis): %v", err) |
| 376 | + } |
| 377 | + if !depResp.Deprovisioned { |
| 378 | + t.Errorf("DeprovisionResource.Deprovisioned = false; want true") |
| 379 | + } |
| 380 | + if gerr := probe.Do(ctx, "ACL", "GETUSER", username).Err(); gerr == nil { |
| 381 | + t.Errorf("ACL user %q still exists after DeprovisionResource — DELUSER did not run", username) |
| 382 | + } |
| 383 | + if n, eerr := probe.Exists(ctx, token+":k1").Result(); eerr != nil { |
| 384 | + t.Fatalf("EXISTS after deprovision: %v", eerr) |
| 385 | + } else if n != 0 { |
| 386 | + t.Errorf("namespace key survived DeprovisionResource — SCAN+DEL did not reap it") |
| 387 | + } |
| 388 | + |
| 389 | + // --- Idempotency: a second Deprovision is a clean no-op --- |
| 390 | + if _, err := srv.DeprovisionResource(ctx, &provisionerv1.DeprovisionRequest{ |
| 391 | + Token: token, |
| 392 | + ResourceType: commonv1.ResourceType_RESOURCE_TYPE_REDIS, |
| 393 | + }); err != nil { |
| 394 | + t.Errorf("second DeprovisionResource (idempotent) returned %v; want nil", err) |
| 395 | + } |
| 396 | +} |
0 commit comments