Skip to content

Commit b2557e7

Browse files
test(provisioner): real-backend integration round-trips for Deprovision/Regrade DDL (toward 100% integration) (#45)
Adds env-gated real-backend integration tests that drive the gRPC server layer's Provision → Regrade → Deprovision lifecycle against a REAL Postgres and a REAL Redis, through the actual RPC handlers (not injected fakes).

Motivation: the truehomie-db DROP incident class (2026-06-03). Every existing server test injects a fake backend, so the real DROP DATABASE / DROP USER / ALTER ROLE DDL had never executed through the gRPC handler path (breaker wrapping, tier→connLimit routing, mapError, response shaping, idempotent re-deprovision). High statement coverage from mocks did not prove the destroy/regrade DDL was correct end-to-end.

Tests (all skip cleanly when the backend URL is unset):
- TestServer_Postgres_Provision_Regrade_Deprovision_LiveRoundTrip: asserts db_/usr_ created, role CONNECTION LIMIT actually ALTERed (pg_roles cross-check), db_/usr_ DROPped, and a second Deprovision is a clean idempotent no-op (#9 DROP IF EXISTS).
- TestServer_Postgres_Reprovision_AfterDeprovision_LiveRoundTrip: re-provisions the same token after teardown — guards a partial-DROP leak that would block reuse.
- TestServer_Redis_Provision_Deprovision_LiveRoundTrip: ACL user created, ACL user + namespace keys reaped, idempotent second Deprovision.

Coverage delta (integration-only, server test pkg vs ./... per the §1.4 mechanism): server package 18.9% → 24.8% (+5.9pp). The backend DDL methods, as exercised FROM the gRPC layer, go 0% → 60-83% (postgres Provision 0→68.3%, Deprovision 0→62.5%, Regrade 0→73.3%; redis Provision 0→70.0%, Deprovision 0→83.3%).

Gate: go build/vet/test ./... -short green; gofmt + golangci-lint clean.

Env-blocked: mongo/queue(NATS)/storage backend round-trips remain GAP (no local TEST_MONGO_URL/TEST_NATS_URL) — out of scope for this increment.

Co-authored-by: Manas Srivastava <[email protected]>
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent abfb80a commit b2557e7

1 file changed

Lines changed: 396 additions & 0 deletions

File tree

Lines changed: 396 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,396 @@
1+
package server_test
2+
3+
// server_live_roundtrip_test.go — REAL-BACKEND integration coverage for the
4+
// gRPC server layer's Provision → Regrade → Deprovision lifecycle.
5+
//
6+
// Why this file exists (the truehomie-db DROP incident class, 2026-06-03):
7+
// every existing server_test.go / server_coverage_test.go test injects a *fake*
8+
// backend, so the actual DROP DATABASE / DROP USER / ALTER ROLE DDL has never
9+
// run through the real gRPC handler path (breaker wrapping, tier→connLimit
10+
// routing, mapError, response shaping, idempotent re-deprovision). High
11+
// statement coverage from mocks does NOT prove the destroy/regrade DDL is
12+
// correct end-to-end. These tests drive the genuine RPC handlers
13+
// (server.ProvisionResource / RegradeResource / DeprovisionResource) against a
14+
// real Postgres and a real Redis, and assert the backing infra is actually
15+
// created, regraded, and torn down — and that a second Deprovision is a clean
16+
// idempotent no-op (the #9 DROP IF EXISTS fix).
17+
//
18+
// Env-gated: skips cleanly when the backend URL is unset, so `go test -short`
19+
// in CI without a backend stays green; runs for real when the backend is
20+
// present (local dev Postgres at localhost:5432, Redis at localhost:6379, and
21+
// CI's coverage.yml docker services).
22+
23+
import (
	"context"
	"errors"
	"fmt"
	"os"
	"strings"
	"testing"
	"time"

	"github.com/jackc/pgx/v5"
	goredis "github.com/redis/go-redis/v9"

	commonv1 "instant.dev/proto/common/v1"
	provisionerv1 "instant.dev/proto/provisioner/v1"
	"instant.dev/provisioner/internal/backend/postgres"
	"instant.dev/provisioner/internal/backend/redis"
	"instant.dev/provisioner/internal/circuit"
	"instant.dev/provisioner/internal/config"
	"instant.dev/provisioner/internal/server"
)
42+
43+
// livePostgresAdminDSN resolves the admin DSN used for CREATE/DROP DATABASE in
// the live tests. The environment variables are consulted in priority order
// and the first non-empty value wins; "" means nothing is configured and the
// caller MUST t.Skip. The lookup order mirrors the backend/postgres live tests
// so a single env var wires both layers.
func livePostgresAdminDSN() string {
	envKeys := [...]string{
		"TEST_POSTGRES_CUSTOMERS_URL",
		"TEST_POSTGRES_ADMIN_DSN",
		"CUSTOMER_POSTGRES_DSN",
	}
	for i := 0; i < len(envKeys); i++ {
		dsn := os.Getenv(envKeys[i])
		if dsn == "" {
			continue
		}
		return dsn
	}
	return ""
}
55+
56+
// liveRedisURL resolves the redis:// URL for the provision pool. TEST_REDIS_URL
// takes precedence over CUSTOMER_REDIS_URL; "" is returned when neither is set.
func liveRedisURL() string {
	envKeys := [...]string{"TEST_REDIS_URL", "CUSTOMER_REDIS_URL"}
	for i := 0; i < len(envKeys); i++ {
		url := os.Getenv(envKeys[i])
		if url == "" {
			continue
		}
		return url
	}
	return ""
}
65+
66+
// liveServerWithRealPostgres builds a Server wired to a REAL LocalBackend
67+
// Postgres (shared-cluster admin DSN) and fresh per-test breakers. No pool, no
68+
// dedicated backend, so every RPC takes the live shared-cluster path.
69+
func liveServerWithRealPostgres(adminDSN string) *server.Server {
70+
return server.NewWithBackends(
71+
&config.Config{},
72+
postgres.NewBackend("", adminDSN, "", "", ""), // "" → LocalBackend(adminDSN)
73+
nil, nil, nil, nil, // redis/mongo/queue/storage unused on this path
74+
nil, nil, nil, nil, // no dedicated backends
75+
nil, // no pool → live provision path
76+
).SetBreakers(circuit.NewBreakers())
77+
}
78+
79+
// liveServerWithRealRedis builds a Server wired to a REAL Redis LocalBackend.
80+
func liveServerWithRealRedis(redisAddr string) *server.Server {
81+
return server.NewWithBackends(
82+
&config.Config{},
83+
nil,
84+
redis.NewBackend("", redisAddr), // "" → LocalBackend(redisAddr)
85+
nil, nil, nil,
86+
nil, nil, nil, nil,
87+
nil,
88+
).SetBreakers(circuit.NewBreakers())
89+
}
90+
91+
// liveToken builds a unique, test-scoped token of the form
// "tok<unix-nanos><sanitised test name>". Slashes and spaces in t.Name() are
// turned into underscores and the name part is capped at 24 bytes, so the
// result can be used as the suffix of a Postgres db_/usr_ identifier and as a
// Redis key prefix.
func liveToken(t *testing.T) string {
	t.Helper()

	const maxNameLen = 24

	name := strings.ReplaceAll(t.Name(), "/", "_")
	name = strings.ReplaceAll(name, " ", "_")
	if len(name) > maxNameLen {
		name = name[:maxNameLen]
	}

	stamp := time.Now().UnixNano()
	return fmt.Sprintf("tok%d%s", stamp, name)
}
101+
102+
// pgConnLimit queries the actual rolconnlimit for usr_<token> on the live
103+
// cluster, or returns (0, err) if the role does not exist.
104+
func pgConnLimit(t *testing.T, adminDSN, username string) (int, bool) {
105+
t.Helper()
106+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
107+
defer cancel()
108+
conn, err := pgx.Connect(ctx, adminDSN)
109+
if err != nil {
110+
t.Fatalf("pgConnLimit connect: %v", err)
111+
}
112+
defer conn.Close(ctx) //nolint:errcheck
113+
var lim int
114+
err = conn.QueryRow(ctx, "SELECT rolconnlimit FROM pg_roles WHERE rolname=$1", username).Scan(&lim)
115+
if err == pgx.ErrNoRows {
116+
return 0, false
117+
}
118+
if err != nil {
119+
t.Fatalf("pgConnLimit query: %v", err)
120+
}
121+
return lim, true
122+
}
123+
124+
// pgDatabaseExists reports whether db_<token> exists on the live cluster.
125+
func pgDatabaseExists(t *testing.T, adminDSN, dbName string) bool {
126+
t.Helper()
127+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
128+
defer cancel()
129+
conn, err := pgx.Connect(ctx, adminDSN)
130+
if err != nil {
131+
t.Fatalf("pgDatabaseExists connect: %v", err)
132+
}
133+
defer conn.Close(ctx) //nolint:errcheck
134+
var n int
135+
if err := conn.QueryRow(ctx, "SELECT count(*) FROM pg_database WHERE datname=$1", dbName).Scan(&n); err != nil {
136+
t.Fatalf("pgDatabaseExists query: %v", err)
137+
}
138+
return n > 0
139+
}
140+
141+
// cleanupPG drops db_<token>/usr_<token> best-effort so repeated runs and
142+
// failed assertions never leak objects on the shared cluster.
143+
func cleanupPG(t *testing.T, adminDSN, dbName, username string) {
144+
t.Helper()
145+
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
146+
defer cancel()
147+
conn, err := pgx.Connect(ctx, adminDSN)
148+
if err != nil {
149+
t.Logf("cleanupPG connect: %v", err)
150+
return
151+
}
152+
defer conn.Close(ctx) //nolint:errcheck
153+
_, _ = conn.Exec(ctx, fmt.Sprintf("DROP DATABASE IF EXISTS %q WITH (FORCE)", dbName))
154+
_, _ = conn.Exec(ctx, fmt.Sprintf("DROP USER IF EXISTS %q", username))
155+
}
156+
157+
// TestServer_Postgres_Provision_Regrade_Deprovision_LiveRoundTrip is the
158+
// truehomie-DROP-class integration test for the gRPC server layer: it drives
159+
// the real RPC handlers against a real Postgres and asserts the backing
160+
// db_/usr_ are CREATED by ProvisionResource, the role CONNECTION LIMIT is
161+
// adjusted by RegradeResource, the db_/usr_ are DROPped by DeprovisionResource,
162+
// and a second DeprovisionResource is a clean idempotent no-op (DROP IF EXISTS).
163+
func TestServer_Postgres_Provision_Regrade_Deprovision_LiveRoundTrip(t *testing.T) {
164+
adminDSN := livePostgresAdminDSN()
165+
if adminDSN == "" {
166+
t.Skip("TEST_POSTGRES_CUSTOMERS_URL/TEST_POSTGRES_ADMIN_DSN unset — skipping live-Postgres gRPC round-trip")
167+
}
168+
srv := liveServerWithRealPostgres(adminDSN)
169+
ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second)
170+
defer cancel()
171+
172+
token := liveToken(t)
173+
dbName := "db_" + token
174+
username := "usr_" + token
175+
t.Cleanup(func() { cleanupPG(t, adminDSN, dbName, username) })
176+
177+
// --- Provision (hobby tier → a positive CONNECTION LIMIT) ---
178+
provResp, err := srv.ProvisionResource(ctx, &provisionerv1.ProvisionRequest{
179+
Token: token,
180+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES,
181+
Tier: "hobby",
182+
})
183+
if err != nil {
184+
t.Fatalf("ProvisionResource(postgres, hobby): %v", err)
185+
}
186+
if provResp.DatabaseName != dbName || provResp.Username != username {
187+
t.Fatalf("ProvisionResource returned db=%q user=%q; want db=%q user=%q",
188+
provResp.DatabaseName, provResp.Username, dbName, username)
189+
}
190+
if !strings.HasPrefix(provResp.ConnectionUrl, "postgres://") {
191+
t.Errorf("ConnectionUrl = %q; want postgres:// prefix", provResp.ConnectionUrl)
192+
}
193+
if !pgDatabaseExists(t, adminDSN, dbName) {
194+
t.Fatalf("after ProvisionResource, %q does not exist on the live cluster", dbName)
195+
}
196+
hobbyLimit, ok := pgConnLimit(t, adminDSN, username)
197+
if !ok {
198+
t.Fatalf("after ProvisionResource, role %q does not exist", username)
199+
}
200+
if hobbyLimit <= 0 {
201+
t.Errorf("hobby role connection limit = %d; want a positive cap applied at CREATE USER", hobbyLimit)
202+
}
203+
204+
// --- Regrade (pro tier → a different positive cap; assert the real ALTER ROLE took) ---
205+
regResp, err := srv.RegradeResource(ctx, &provisionerv1.RegradeRequest{
206+
Token: token,
207+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES,
208+
Tier: "pro",
209+
})
210+
if err != nil {
211+
t.Fatalf("RegradeResource(postgres, pro): %v", err)
212+
}
213+
if !regResp.Applied {
214+
t.Errorf("RegradeResource(pro).Applied = false; want true")
215+
}
216+
proLimit, ok := pgConnLimit(t, adminDSN, username)
217+
if !ok {
218+
t.Fatalf("role %q vanished after Regrade", username)
219+
}
220+
if int(regResp.AppliedConnLimit) != proLimit {
221+
t.Errorf("pg_roles.rolconnlimit = %d but RegradeResponse.AppliedConnLimit = %d; the ALTER ROLE did not match the reported cap",
222+
proLimit, regResp.AppliedConnLimit)
223+
}
224+
if proLimit == hobbyLimit {
225+
t.Errorf("pro connection limit (%d) equals hobby (%d); the Regrade did not change the live cap", proLimit, hobbyLimit)
226+
}
227+
228+
// --- Deprovision (the DROP DATABASE / DROP USER path — truehomie class) ---
229+
depResp, err := srv.DeprovisionResource(ctx, &provisionerv1.DeprovisionRequest{
230+
Token: token,
231+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES,
232+
})
233+
if err != nil {
234+
t.Fatalf("DeprovisionResource(postgres): %v", err)
235+
}
236+
if !depResp.Deprovisioned {
237+
t.Errorf("DeprovisionResource.Deprovisioned = false; want true")
238+
}
239+
if pgDatabaseExists(t, adminDSN, dbName) {
240+
t.Errorf("after DeprovisionResource, %q still exists — DROP DATABASE did not run", dbName)
241+
}
242+
if _, ok := pgConnLimit(t, adminDSN, username); ok {
243+
t.Errorf("after DeprovisionResource, role %q still exists — DROP USER did not run", username)
244+
}
245+
246+
// --- Idempotency: a second Deprovision must be a clean no-op (DROP IF EXISTS, #9) ---
247+
depResp2, err := srv.DeprovisionResource(ctx, &provisionerv1.DeprovisionRequest{
248+
Token: token,
249+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES,
250+
})
251+
if err != nil {
252+
t.Errorf("second DeprovisionResource (idempotent) returned %v; want nil — DROP IF EXISTS must no-op cleanly", err)
253+
}
254+
if depResp2 != nil && !depResp2.Deprovisioned {
255+
t.Errorf("second DeprovisionResource.Deprovisioned = false; want true (idempotent success)")
256+
}
257+
}
258+
259+
// TestServer_Postgres_Reprovision_AfterDeprovision_LiveRoundTrip asserts the
260+
// (re)Provision leg of the round-trip: after a full teardown the SAME token can
261+
// be provisioned again with no "already exists" collision — i.e. Deprovision
262+
// truly removed every object Provision created. This is the regression guard
263+
// for a partial-DROP leak that would block re-provisioning.
264+
func TestServer_Postgres_Reprovision_AfterDeprovision_LiveRoundTrip(t *testing.T) {
265+
adminDSN := livePostgresAdminDSN()
266+
if adminDSN == "" {
267+
t.Skip("postgres admin DSN unset — skipping reprovision round-trip")
268+
}
269+
srv := liveServerWithRealPostgres(adminDSN)
270+
ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second)
271+
defer cancel()
272+
273+
token := liveToken(t)
274+
dbName := "db_" + token
275+
username := "usr_" + token
276+
t.Cleanup(func() { cleanupPG(t, adminDSN, dbName, username) })
277+
278+
req := &provisionerv1.ProvisionRequest{
279+
Token: token,
280+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES,
281+
Tier: "hobby",
282+
}
283+
depReq := &provisionerv1.DeprovisionRequest{
284+
Token: token,
285+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES,
286+
}
287+
288+
if _, err := srv.ProvisionResource(ctx, req); err != nil {
289+
t.Fatalf("first ProvisionResource: %v", err)
290+
}
291+
if _, err := srv.DeprovisionResource(ctx, depReq); err != nil {
292+
t.Fatalf("DeprovisionResource: %v", err)
293+
}
294+
// Re-provision the same token: must succeed (no orphaned db_/usr_ blocking it).
295+
if _, err := srv.ProvisionResource(ctx, req); err != nil {
296+
t.Fatalf("re-ProvisionResource after Deprovision: %v — teardown leaked an object that blocks reuse", err)
297+
}
298+
if !pgDatabaseExists(t, adminDSN, dbName) {
299+
t.Errorf("re-provisioned %q missing", dbName)
300+
}
301+
// Final teardown.
302+
if _, err := srv.DeprovisionResource(ctx, depReq); err != nil {
303+
t.Errorf("final DeprovisionResource: %v", err)
304+
}
305+
}
306+
307+
// TestServer_Redis_Provision_Deprovision_LiveRoundTrip drives the real Redis
308+
// LocalBackend through the gRPC handlers: ProvisionResource creates an ACL
309+
// user, DeprovisionResource removes the ACL user and namespace keys, and a
310+
// second Deprovision is a clean idempotent no-op. (Redis LocalBackend has no
311+
// Regrade — only the k8s backend implements redis.Regrader.)
312+
func TestServer_Redis_Provision_Deprovision_LiveRoundTrip(t *testing.T) {
313+
redisURL := liveRedisURL()
314+
if redisURL == "" {
315+
t.Skip("TEST_REDIS_URL/CUSTOMER_REDIS_URL unset — skipping live-Redis gRPC round-trip")
316+
}
317+
opt, err := goredis.ParseURL(redisURL)
318+
if err != nil {
319+
t.Skipf("redis URL %q does not parse: %v", redisURL, err)
320+
}
321+
// Probe so we skip (not fail) when nothing is listening.
322+
probe := goredis.NewClient(opt)
323+
pctx, pcancel := context.WithTimeout(context.Background(), time.Second)
324+
defer pcancel()
325+
if perr := probe.Ping(pctx).Err(); perr != nil {
326+
_ = probe.Close()
327+
t.Skipf("redis not reachable at %s: %v", opt.Addr, perr)
328+
}
329+
330+
srv := liveServerWithRealRedis(opt.Addr)
331+
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
332+
defer cancel()
333+
334+
token := liveToken(t)
335+
username := "usr_" + token
336+
t.Cleanup(func() {
337+
// Best-effort: drop the ACL user and any namespace keys directly.
338+
_ = probe.Do(context.Background(), "ACL", "DELUSER", username).Err()
339+
if keys, _, kerr := probe.Scan(context.Background(), 0, token+":*", 100).Result(); kerr == nil && len(keys) > 0 {
340+
_ = probe.Del(context.Background(), keys...).Err()
341+
}
342+
_ = probe.Close()
343+
})
344+
345+
// --- Provision: the gRPC handler must create the ACL user on the live pod ---
346+
provResp, err := srv.ProvisionResource(ctx, &provisionerv1.ProvisionRequest{
347+
Token: token,
348+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_REDIS,
349+
Tier: "hobby",
350+
})
351+
if err != nil {
352+
t.Fatalf("ProvisionResource(redis, hobby): %v", err)
353+
}
354+
if !strings.HasPrefix(provResp.ConnectionUrl, "redis://") {
355+
t.Errorf("ConnectionUrl = %q; want redis:// prefix", provResp.ConnectionUrl)
356+
}
357+
if provResp.KeyPrefix != token+":" {
358+
t.Errorf("KeyPrefix = %q; want %q", provResp.KeyPrefix, token+":")
359+
}
360+
// Assert the ACL user actually exists on the live Redis.
361+
if gerr := probe.Do(ctx, "ACL", "GETUSER", username).Err(); gerr != nil {
362+
t.Fatalf("ACL user %q not created on live Redis after ProvisionResource: %v", username, gerr)
363+
}
364+
// Write a namespace key so Deprovision has keys to reap.
365+
if serr := probe.Set(ctx, token+":k1", "v1", 0).Err(); serr != nil {
366+
t.Fatalf("seed key: %v", serr)
367+
}
368+
369+
// --- Deprovision: removes the ACL user and the namespace keys ---
370+
depResp, err := srv.DeprovisionResource(ctx, &provisionerv1.DeprovisionRequest{
371+
Token: token,
372+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_REDIS,
373+
})
374+
if err != nil {
375+
t.Fatalf("DeprovisionResource(redis): %v", err)
376+
}
377+
if !depResp.Deprovisioned {
378+
t.Errorf("DeprovisionResource.Deprovisioned = false; want true")
379+
}
380+
if gerr := probe.Do(ctx, "ACL", "GETUSER", username).Err(); gerr == nil {
381+
t.Errorf("ACL user %q still exists after DeprovisionResource — DELUSER did not run", username)
382+
}
383+
if n, eerr := probe.Exists(ctx, token+":k1").Result(); eerr != nil {
384+
t.Fatalf("EXISTS after deprovision: %v", eerr)
385+
} else if n != 0 {
386+
t.Errorf("namespace key survived DeprovisionResource — SCAN+DEL did not reap it")
387+
}
388+
389+
// --- Idempotency: a second Deprovision is a clean no-op ---
390+
if _, err := srv.DeprovisionResource(ctx, &provisionerv1.DeprovisionRequest{
391+
Token: token,
392+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_REDIS,
393+
}); err != nil {
394+
t.Errorf("second DeprovisionResource (idempotent) returned %v; want nil", err)
395+
}
396+
}

0 commit comments

Comments
 (0)