Skip to content

Commit bb1935b

Browse files
test(provisioner): multi-tenant deprovision-scoping regression (truehomie DROP-incident class) (#46)
PR #45 proved single-tenant Provision/Regrade/Deprovision/idempotency round-trips for postgres + redis through the real gRPC handlers. It did NOT prove SCOPING: that a deprovision is confined to the target tenant. That is the gap the 2026-06-03 truehomie-db DROP incident exposed (an active Pro customer's db+role dropped while a co-resident tenant shared the cluster).

Adds server_multitenant_scoping_test.go: provisions TWO co-resident tenants A+B through the genuine gRPC ProvisionResource handler against a real Postgres / real Redis, seeds data into each, deprovisions ONLY A, and asserts B fully survives.

- Postgres: after Deprovision(A), A's db_/usr_ are gone (DROP ran) AND B's database + role still exist, B's seeded row is intact, and B can still CONNECT with its own ConnectionUrl credentials.
- Redis: after Deprovision(A), A's ACL user + namespace key are reaped AND B's ACL user + namespace key + value survive.

Env-gated identically to server_live_roundtrip_test.go (skips clean under -short / no backend; runs for real in coverage.yml's pg+redis services and local dev backends). Verified PASS locally against Postgres 16 + Redis 7.

Coverage block:
  Symptom: unscoped DROP DATABASE/DROP USER (or ACL DELUSER/SCAN+DEL) on deprovision takes out a co-resident tenant (truehomie 2026-06-03)
  Enumeration: gRPC DeprovisionResource handler path, postgres+redis LocalBackend
  Sites found: 2 (postgres deprovision, redis deprovision)
  Sites touched: 2 (both have a co-resident-survival regression test)
  Coverage test: TestServer_{Postgres,Redis}_Deprovision_IsScopedToTargetTenant
  Live verified: PASS vs local Postgres 16 + Redis 7 (real backends); B survives A

Co-authored-by: Manas Srivastava <[email protected]>
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent b2557e7 commit bb1935b

1 file changed

Lines changed: 281 additions & 0 deletions

File tree

Lines changed: 281 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,281 @@
1+
package server_test
2+
3+
// server_multitenant_scoping_test.go — REGRESSION TEST for the 2026-06-03
4+
// truehomie-db DROP incident class.
5+
//
6+
// On 2026-06-03 an active Pro customer's database AND role were dropped by a
7+
// non-audited path while a *co-resident* tenant shared the same cluster. The
8+
// failure mode this class represents is an UNSCOPED / OVER-BROAD deprovision:
9+
// tearing down tenant A reaches beyond A's own db_/usr_ and takes out a
10+
// neighbor's database, role, or data.
11+
//
12+
// The PR #45 round-trips already prove that deprovisioning a SINGLE tenant
13+
// drops that tenant's db_/usr_ and is idempotent. They do NOT prove SCOPING —
14+
// that the DROP is confined to the target tenant. That is the new value here.
15+
//
16+
// These tests provision TWO co-resident tenants (A and B) through the genuine
17+
// gRPC ProvisionResource handler against a real Postgres / real Redis, seed
18+
// data into each, then DeprovisionResource(A) and assert:
19+
// - A's database + role are gone (DROP ran), AND
20+
// - B's database + role still EXIST, B's seeded row is INTACT, and B can
21+
// still CONNECT with its own credentials (the neighbor survives).
22+
//
23+
// If a future change made the postgres DROP DATABASE / DROP USER (or the redis
24+
// ACL DELUSER / namespace SCAN+DEL) match more than the target token, exactly
25+
// this assertion fails — which is the assertion that would have caught the
26+
// truehomie incident before it reached prod.
27+
//
28+
// Env-gated identically to server_live_roundtrip_test.go: skips cleanly when
29+
// the backend URL is unset (so `go test -short` in CI without a backend stays
30+
// green) and runs for real against local dev Postgres (localhost:5432) / Redis
31+
// (localhost:6379) or CI's coverage.yml docker services.
32+
33+
import (
34+
"context"
35+
"testing"
36+
"time"
37+
38+
"github.com/jackc/pgx/v5"
39+
goredis "github.com/redis/go-redis/v9"
40+
41+
commonv1 "instant.dev/proto/common/v1"
42+
provisionerv1 "instant.dev/proto/provisioner/v1"
43+
)
44+
45+
// pgTenantCanConnectAndRead opens a connection with the tenant's OWN
46+
// credentials (the gRPC-returned ConnectionUrl) and reads back the single
47+
// seeded row, asserting both connectivity and data integrity survive a
48+
// neighbor's deprovision. Fails the test on any error.
49+
func pgTenantCanConnectAndRead(t *testing.T, tenantConnURL, wantVal string) {
50+
t.Helper()
51+
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
52+
defer cancel()
53+
conn, err := pgx.Connect(ctx, tenantConnURL)
54+
if err != nil {
55+
t.Fatalf("tenant B can no longer CONNECT with its own credentials after neighbor deprovision: %v", err)
56+
}
57+
defer conn.Close(ctx) //nolint:errcheck
58+
var got string
59+
if err := conn.QueryRow(ctx, "SELECT v FROM scoping_probe WHERE id = 1").Scan(&got); err != nil {
60+
t.Fatalf("tenant B seeded row unreadable after neighbor deprovision: %v", err)
61+
}
62+
if got != wantVal {
63+
t.Errorf("tenant B seeded data corrupted: got %q want %q", got, wantVal)
64+
}
65+
}
66+
67+
// pgSeedTenant connects with the tenant's own ConnectionUrl, creates a probe
68+
// table, and inserts a single sentinel row. Mirrors what a real customer app
69+
// would do immediately after provisioning.
70+
func pgSeedTenant(t *testing.T, tenantConnURL, val string) {
71+
t.Helper()
72+
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
73+
defer cancel()
74+
conn, err := pgx.Connect(ctx, tenantConnURL)
75+
if err != nil {
76+
t.Fatalf("seed connect (tenant own creds): %v", err)
77+
}
78+
defer conn.Close(ctx) //nolint:errcheck
79+
if _, err := conn.Exec(ctx, "CREATE TABLE IF NOT EXISTS scoping_probe (id int PRIMARY KEY, v text)"); err != nil {
80+
t.Fatalf("seed CREATE TABLE: %v", err)
81+
}
82+
if _, err := conn.Exec(ctx,
83+
"INSERT INTO scoping_probe (id, v) VALUES (1, $1) ON CONFLICT (id) DO UPDATE SET v = EXCLUDED.v", val,
84+
); err != nil {
85+
t.Fatalf("seed INSERT: %v", err)
86+
}
87+
}
88+
89+
// TestServer_Postgres_Deprovision_IsScopedToTargetTenant is the truehomie-DROP
90+
// regression: deprovisioning tenant A must drop ONLY A's db_/usr_ and leave a
91+
// CO-RESIDENT tenant B's database, role, AND seeded data fully intact and
92+
// connectable. This is the assertion the 2026-06-03 incident lacked.
93+
func TestServer_Postgres_Deprovision_IsScopedToTargetTenant(t *testing.T) {
94+
adminDSN := livePostgresAdminDSN()
95+
if adminDSN == "" {
96+
t.Skip("TEST_POSTGRES_CUSTOMERS_URL/TEST_POSTGRES_ADMIN_DSN unset — skipping multi-tenant Postgres scoping test")
97+
}
98+
srv := liveServerWithRealPostgres(adminDSN)
99+
ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
100+
defer cancel()
101+
102+
// Two distinct co-resident tenants on the SAME cluster.
103+
tokenA := liveToken(t) + "a"
104+
tokenB := liveToken(t) + "b"
105+
dbA, usrA := "db_"+tokenA, "usr_"+tokenA
106+
dbB, usrB := "db_"+tokenB, "usr_"+tokenB
107+
t.Cleanup(func() { cleanupPG(t, adminDSN, dbA, usrA) })
108+
t.Cleanup(func() { cleanupPG(t, adminDSN, dbB, usrB) })
109+
110+
// --- Provision A and B through the genuine gRPC handler ---
111+
provA, err := srv.ProvisionResource(ctx, &provisionerv1.ProvisionRequest{
112+
Token: tokenA,
113+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES,
114+
Tier: "hobby",
115+
})
116+
if err != nil {
117+
t.Fatalf("ProvisionResource(A): %v", err)
118+
}
119+
provB, err := srv.ProvisionResource(ctx, &provisionerv1.ProvisionRequest{
120+
Token: tokenB,
121+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES,
122+
Tier: "hobby",
123+
})
124+
if err != nil {
125+
t.Fatalf("ProvisionResource(B): %v", err)
126+
}
127+
128+
// Sanity: both databases + roles exist before any teardown.
129+
if !pgDatabaseExists(t, adminDSN, dbA) {
130+
t.Fatalf("precondition: A's database %q missing after provision", dbA)
131+
}
132+
if !pgDatabaseExists(t, adminDSN, dbB) {
133+
t.Fatalf("precondition: B's database %q missing after provision", dbB)
134+
}
135+
if _, ok := pgConnLimit(t, adminDSN, usrA); !ok {
136+
t.Fatalf("precondition: A's role %q missing after provision", usrA)
137+
}
138+
if _, ok := pgConnLimit(t, adminDSN, usrB); !ok {
139+
t.Fatalf("precondition: B's role %q missing after provision", usrB)
140+
}
141+
142+
// Seed real data into each tenant using ITS OWN credentials.
143+
pgSeedTenant(t, provA.ConnectionUrl, "tenant-A-data")
144+
pgSeedTenant(t, provB.ConnectionUrl, "tenant-B-data")
145+
146+
// --- Deprovision ONLY tenant A ---
147+
depA, err := srv.DeprovisionResource(ctx, &provisionerv1.DeprovisionRequest{
148+
Token: tokenA,
149+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES,
150+
})
151+
if err != nil {
152+
t.Fatalf("DeprovisionResource(A): %v", err)
153+
}
154+
if !depA.Deprovisioned {
155+
t.Errorf("DeprovisionResource(A).Deprovisioned = false; want true")
156+
}
157+
158+
// --- A is gone (DROP actually ran) ---
159+
if pgDatabaseExists(t, adminDSN, dbA) {
160+
t.Errorf("after Deprovision(A), A's database %q still exists — DROP DATABASE did not run", dbA)
161+
}
162+
if _, ok := pgConnLimit(t, adminDSN, usrA); ok {
163+
t.Errorf("after Deprovision(A), A's role %q still exists — DROP USER did not run", usrA)
164+
}
165+
166+
// --- B SURVIVES: the truehomie regression assertion ---
167+
if !pgDatabaseExists(t, adminDSN, dbB) {
168+
t.Fatalf("REGRESSION (truehomie class): deprovisioning A dropped co-resident B's database %q", dbB)
169+
}
170+
if _, ok := pgConnLimit(t, adminDSN, usrB); !ok {
171+
t.Fatalf("REGRESSION (truehomie class): deprovisioning A dropped co-resident B's role %q", usrB)
172+
}
173+
// B's data is intact AND B can still connect with its own credentials.
174+
pgTenantCanConnectAndRead(t, provB.ConnectionUrl, "tenant-B-data")
175+
}
176+
177+
// TestServer_Redis_Deprovision_IsScopedToTargetTenant is the redis analogue:
178+
// deprovisioning tenant A removes A's ACL user + A's namespace keys, and leaves
179+
// co-resident tenant B's ACL user and namespace keys fully intact.
180+
func TestServer_Redis_Deprovision_IsScopedToTargetTenant(t *testing.T) {
181+
redisURL := liveRedisURL()
182+
if redisURL == "" {
183+
t.Skip("TEST_REDIS_URL/CUSTOMER_REDIS_URL unset — skipping multi-tenant Redis scoping test")
184+
}
185+
opt, err := goredis.ParseURL(redisURL)
186+
if err != nil {
187+
t.Skipf("redis URL %q does not parse: %v", redisURL, err)
188+
}
189+
probe := goredis.NewClient(opt)
190+
pctx, pcancel := context.WithTimeout(context.Background(), time.Second)
191+
defer pcancel()
192+
if perr := probe.Ping(pctx).Err(); perr != nil {
193+
_ = probe.Close()
194+
t.Skipf("redis not reachable at %s: %v", opt.Addr, perr)
195+
}
196+
197+
srv := liveServerWithRealRedis(opt.Addr)
198+
ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second)
199+
defer cancel()
200+
201+
tokenA := liveToken(t) + "a"
202+
tokenB := liveToken(t) + "b"
203+
usrA := "usr_" + tokenA
204+
usrB := "usr_" + tokenB
205+
t.Cleanup(func() {
206+
for _, u := range []string{usrA, usrB} {
207+
_ = probe.Do(context.Background(), "ACL", "DELUSER", u).Err()
208+
}
209+
for _, tok := range []string{tokenA, tokenB} {
210+
if keys, _, kerr := probe.Scan(context.Background(), 0, tok+":*", 100).Result(); kerr == nil && len(keys) > 0 {
211+
_ = probe.Del(context.Background(), keys...).Err()
212+
}
213+
}
214+
_ = probe.Close()
215+
})
216+
217+
// --- Provision A and B through the genuine gRPC handler ---
218+
if _, err := srv.ProvisionResource(ctx, &provisionerv1.ProvisionRequest{
219+
Token: tokenA,
220+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_REDIS,
221+
Tier: "hobby",
222+
}); err != nil {
223+
t.Fatalf("ProvisionResource(redis A): %v", err)
224+
}
225+
if _, err := srv.ProvisionResource(ctx, &provisionerv1.ProvisionRequest{
226+
Token: tokenB,
227+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_REDIS,
228+
Tier: "hobby",
229+
}); err != nil {
230+
t.Fatalf("ProvisionResource(redis B): %v", err)
231+
}
232+
233+
// Both ACL users exist; seed a namespace key into each.
234+
if gerr := probe.Do(ctx, "ACL", "GETUSER", usrA).Err(); gerr != nil {
235+
t.Fatalf("precondition: A's ACL user %q missing: %v", usrA, gerr)
236+
}
237+
if gerr := probe.Do(ctx, "ACL", "GETUSER", usrB).Err(); gerr != nil {
238+
t.Fatalf("precondition: B's ACL user %q missing: %v", usrB, gerr)
239+
}
240+
if serr := probe.Set(ctx, tokenA+":k1", "vA", 0).Err(); serr != nil {
241+
t.Fatalf("seed A key: %v", serr)
242+
}
243+
if serr := probe.Set(ctx, tokenB+":k1", "vB", 0).Err(); serr != nil {
244+
t.Fatalf("seed B key: %v", serr)
245+
}
246+
247+
// --- Deprovision ONLY tenant A ---
248+
depA, err := srv.DeprovisionResource(ctx, &provisionerv1.DeprovisionRequest{
249+
Token: tokenA,
250+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_REDIS,
251+
})
252+
if err != nil {
253+
t.Fatalf("DeprovisionResource(redis A): %v", err)
254+
}
255+
if !depA.Deprovisioned {
256+
t.Errorf("DeprovisionResource(redis A).Deprovisioned = false; want true")
257+
}
258+
259+
// --- A is gone ---
260+
if gerr := probe.Do(ctx, "ACL", "GETUSER", usrA).Err(); gerr == nil {
261+
t.Errorf("after Deprovision(A), A's ACL user %q still exists — DELUSER did not run", usrA)
262+
}
263+
if n, eerr := probe.Exists(ctx, tokenA+":k1").Result(); eerr != nil {
264+
t.Fatalf("EXISTS A key after deprovision: %v", eerr)
265+
} else if n != 0 {
266+
t.Errorf("after Deprovision(A), A's namespace key survived — SCAN+DEL did not reap it")
267+
}
268+
269+
// --- B SURVIVES: the truehomie regression assertion (redis) ---
270+
if gerr := probe.Do(ctx, "ACL", "GETUSER", usrB).Err(); gerr != nil {
271+
t.Fatalf("REGRESSION (truehomie class): deprovisioning A removed co-resident B's ACL user %q: %v", usrB, gerr)
272+
}
273+
if n, eerr := probe.Exists(ctx, tokenB+":k1").Result(); eerr != nil {
274+
t.Fatalf("EXISTS B key after A deprovision: %v", eerr)
275+
} else if n != 1 {
276+
t.Fatalf("REGRESSION (truehomie class): deprovisioning A reaped co-resident B's namespace key %q", tokenB+":k1")
277+
}
278+
if v, gerr := probe.Get(ctx, tokenB+":k1").Result(); gerr != nil || v != "vB" {
279+
t.Errorf("tenant B key value after A deprovision = %q (err %v); want %q", v, gerr, "vB")
280+
}
281+
}

0 commit comments

Comments
 (0)