Skip to content

Commit 339a322

Browse files
feat: add RegradeResource RPC — re-apply tier connection caps to live Postgres roles (#8)
A plan upgrade flips resources.tier but never re-applies the HARD infrastructure limits baked at provision time — the Postgres role CONNECTION LIMIT in particular. RegradeResource closes that gap: * server.go — RegradeResource handler dispatches to the postgres backend; non-postgres types return applied=false + skip_reason. * backend/postgres — Regrade() ALTERs the role CONNECTION LIMIT to the tier-entitled cap from instant.dev/common/plans. K8sBackend is fail-soft when the customer pod is unreachable (returns applied=false so the caller retries on the next sweep). Idempotent: re-applying the same limit is a no-op. 3 new server tests. Phase 1 of the entitlement re-grade work. Requires the matching proto change (RegradeResource RPC) — CI builds check out proto@master, so the proto PR must merge before this one's CI goes green. Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 0ad971a commit 339a322

7 files changed

Lines changed: 246 additions & 0 deletions

File tree

internal/backend/postgres/backend.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,25 @@ type Backend interface {
3737
Provision(ctx context.Context, token, tier string) (*Credentials, error)
3838
StorageBytes(ctx context.Context, token, providerResourceID string) (int64, error)
3939
Deprovision(ctx context.Context, token, providerResourceID string) error
40+
// Regrade re-applies the tier's per-role CONNECTION LIMIT to an already
41+
// provisioned resource (e.g. after a plan upgrade). Idempotent.
42+
//
43+
// connLimit is the connection cap to apply (-1 = unlimited). Backends that
44+
// own a dedicated pod per resource (k8s) ALTER ROLE in place. The shared
45+
// local/dedicated/neon backends set no per-role cap at provision time, so
46+
// they return RegradeResult{Applied:false, SkipReason:"..."} without error.
47+
//
48+
// A non-error RegradeResult{Applied:false} means "nothing to do / not
49+
// reachable" — the caller can safely retry on the next sweep. An error is
50+
// reserved for unexpected failures.
51+
Regrade(ctx context.Context, token, providerResourceID string, connLimit int) (RegradeResult, error)
52+
}
53+
54+
// RegradeResult reports what a Backend.Regrade call did.
type RegradeResult struct {
	// Applied is true if the new connection cap was applied.
	Applied bool
	// AppliedConnLimit is the cap that is now in effect (-1 = unlimited).
	AppliedConnLimit int
	// SkipReason is populated when Applied is false.
	SkipReason string
}
4160

4261
// Credentials returned by Provision.

internal/backend/postgres/dedicated.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,14 @@ func (p *DedicatedProvider) Deprovision(ctx context.Context, token, providerReso
7777
return p.deprovisionLocal(ctx, token)
7878
}
7979

80+
// Regrade is a no-op for the dedicated provider. The Neon path manages
81+
// connection limits through the Neon project plan, not a per-role
82+
// CONNECTION LIMIT; the local-admin path sets no per-role cap at provision
83+
// time. Either way there is no cap to re-apply, so a skip result is returned.
84+
func (p *DedicatedProvider) Regrade(ctx context.Context, token, providerResourceID string, connLimit int) (RegradeResult, error) {
85+
return RegradeResult{Applied: false, SkipReason: "backend has no per-role connection cap"}, nil
86+
}
87+
8088
// --- Neon API path ---
8189

8290
func (p *DedicatedProvider) provisionNeon(ctx context.Context, token, tier string) (*Credentials, error) {

internal/backend/postgres/k8s.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,65 @@ func (b *K8sBackend) Deprovision(ctx context.Context, token, providerResourceID
373373
return nil
374374
}
375375

376+
// Regrade re-applies a connection cap to the customer's app Postgres role.
377+
// Used after a plan upgrade: the role's CONNECTION LIMIT was set at provision
378+
// time from the old (lower) tier and nothing re-applies the new cap.
379+
//
380+
// It resolves the resource → its namespace/Service/admin Secret the same way
381+
// StorageBytes does, then runs ALTER ROLE on the customer DB. Re-applying the
382+
// same value is a harmless no-op (idempotent).
383+
//
384+
// When the pod is unreachable (paused, terminating, legacy row without the
385+
// modern Secret/Service) this returns RegradeResult{Applied:false} with a
386+
// skip reason and no error — the caller retries on the next sweep.
387+
func (b *K8sBackend) Regrade(ctx context.Context, token, providerResourceID string, connLimit int) (RegradeResult, error) {
388+
ns := providerResourceID
389+
if ns == "" {
390+
ns = k8sNsPrefix + token
391+
}
392+
393+
// Resolve admin connection — identical pattern to StorageBytes. Legacy rows
394+
// whose pods are gone (missing Secret/Service) are non-actionable: skip,
395+
// don't error, so the caller doesn't retry forever.
396+
secret, err := b.cs.CoreV1().Secrets(ns).Get(ctx, "postgres-admin", metav1.GetOptions{})
397+
if err != nil {
398+
if k8serrors.IsNotFound(err) {
399+
return RegradeResult{Applied: false, SkipReason: "resource not reachable: postgres-admin secret not found"}, nil
400+
}
401+
return RegradeResult{Applied: false, SkipReason: fmt.Sprintf("resource not reachable: get secret: %v", err)}, nil
402+
}
403+
svc, err := b.cs.CoreV1().Services(ns).Get(ctx, "postgres", metav1.GetOptions{})
404+
if err != nil {
405+
if k8serrors.IsNotFound(err) {
406+
return RegradeResult{Applied: false, SkipReason: "resource not reachable: postgres service not found"}, nil
407+
}
408+
return RegradeResult{Applied: false, SkipReason: fmt.Sprintf("resource not reachable: get service: %v", err)}, nil
409+
}
410+
411+
adminUser := string(secret.Data["POSTGRES_USER"])
412+
adminPass := string(secret.Data["POSTGRES_PASSWORD"])
413+
// The app role is derived from the token exactly as in Provision.
414+
appUser := "usr_" + k8sShort(token)
415+
416+
dsn := fmt.Sprintf("postgres://%s:%s@%s:5432/postgres?sslmode=disable", adminUser, adminPass, svc.Spec.ClusterIP)
417+
conn, err := pgx.Connect(ctx, dsn)
418+
if err != nil {
419+
return RegradeResult{Applied: false, SkipReason: fmt.Sprintf("resource not reachable: connect: %v", err)}, nil
420+
}
421+
defer conn.Close(ctx)
422+
423+
// ALTER ROLE re-applies the tier's connection cap. -1 = unlimited (passed
424+
// through verbatim). Identifier quoted with %q, mirroring the CREATE USER
425+
// path in initDatabase.
426+
stmt := fmt.Sprintf(`ALTER ROLE %q CONNECTION LIMIT %d`, appUser, connLimit)
427+
if _, err := conn.Exec(ctx, stmt); err != nil {
428+
// Role missing on a live pod is non-actionable too — treat as skip.
429+
return RegradeResult{Applied: false, SkipReason: fmt.Sprintf("resource not reachable: alter role: %v", err)}, nil
430+
}
431+
432+
return RegradeResult{Applied: true, AppliedConnLimit: connLimit}, nil
433+
}
434+
376435
// --- private resource creators ---
377436

378437
func (b *K8sBackend) applyNamespace(ctx context.Context, ns string) error {

internal/backend/postgres/local.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,13 @@ func (b *LocalBackend) Deprovision(ctx context.Context, token, providerResourceI
217217
return nil
218218
}
219219

220+
// Regrade is a no-op for the shared local backend: Provision sets no per-role
221+
// CONNECTION LIMIT on the shared cluster, so there is no cap to re-apply on a
222+
// plan upgrade. Returns a skip result without error.
223+
func (b *LocalBackend) Regrade(ctx context.Context, token, providerResourceID string, connLimit int) (RegradeResult, error) {
224+
return RegradeResult{Applied: false, SkipReason: "backend has no per-role connection cap"}, nil
225+
}
226+
220227
// buildDBURL constructs the user-facing connection URL for the provisioned database.
221228
// sslmode=disable is explicit because the shared postgres-customers cluster does not
222229
// have SSL configured. Without it, lib/pq defaults to sslmode=prefer and fails with

internal/backend/postgres/neon.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,3 +178,10 @@ func (b *NeonBackend) Deprovision(ctx context.Context, token, providerResourceID
178178
slog.Info("db.neon.Deprovision: deprovisioned", "token", token, "project_id", providerResourceID)
179179
return nil
180180
}
181+
182+
// Regrade is a no-op for the Neon backend: connection limits are governed by
183+
// the Neon project plan, not a per-role CONNECTION LIMIT, so there is nothing
184+
// to re-apply on a plan upgrade.
185+
func (b *NeonBackend) Regrade(ctx context.Context, token, providerResourceID string, connLimit int) (RegradeResult, error) {
186+
return RegradeResult{Applied: false, SkipReason: "backend has no per-role connection cap"}, nil
187+
}

internal/server/server.go

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"google.golang.org/grpc/codes"
1414
"google.golang.org/grpc/status"
1515

16+
"instant.dev/common/plans"
1617
commonv1 "instant.dev/proto/common/v1"
1718
provisionerv1 "instant.dev/proto/provisioner/v1"
1819

@@ -546,6 +547,94 @@ func (s *Server) GetStorageBytes(ctx context.Context, req *provisionerv1.Storage
546547
}
547548
}
548549

550+
// regradeConnLimits is the source of truth for per-tier Postgres connection
551+
// caps used by RegradeResource. It is the shared plans registry — the same
552+
// source the agent API uses via plans.Registry.ConnectionsLimit(tier,
553+
// "postgres") — so the cap re-applied here stays platform-consistent.
554+
var regradeConnLimits = plans.Default()
555+
556+
// RegradeResource re-applies the tier's per-role connection cap to an
557+
// already-provisioned resource. It exists because a plan upgrade does not, on
558+
// its own, re-apply the higher CONNECTION LIMIT to the customer's Postgres
559+
// role — the role keeps the old (lower) cap until this RPC runs.
560+
//
561+
// Phase 1 is Postgres-only. Non-Postgres resource types and backends with no
562+
// per-role connection cap (the shared local/dedicated/neon backends) return
563+
// {applied:false} with a skip reason rather than an error.
564+
func (s *Server) RegradeResource(ctx context.Context, req *provisionerv1.RegradeRequest) (*provisionerv1.RegradeResponse, error) {
565+
if req.Token == "" {
566+
return nil, status.Error(codes.InvalidArgument, "token is required")
567+
}
568+
569+
ctx, span := otel.Tracer("instant.dev/provisioner").Start(ctx, "RegradeResource",
570+
trace.WithAttributes(
571+
attribute.String("resource_type", req.ResourceType.String()),
572+
attribute.String("tier", req.Tier),
573+
attribute.String("resource.token", req.Token),
574+
),
575+
)
576+
defer span.End()
577+
578+
// Phase 1: Postgres only.
579+
if req.ResourceType != commonv1.ResourceType_RESOURCE_TYPE_POSTGRES {
580+
slog.Info("server.RegradeResource",
581+
"token", req.Token, "tier", req.Tier,
582+
"applied", false, "skip_reason", "unsupported resource type for regrade",
583+
"request_id", req.RequestId)
584+
return &provisionerv1.RegradeResponse{
585+
Applied: false,
586+
SkipReason: "unsupported resource type for regrade",
587+
}, nil
588+
}
589+
590+
// Select the backend that actually owns this resource. k8s namespace IDs
591+
// (prefix "instant-customer-") go through the regular postgresBackend,
592+
// matching the routing DeprovisionResource uses.
593+
backend := s.postgresBackend
594+
if s.dedicatedPostgresBackend != nil && req.ProviderResourceId != "" &&
595+
!strings.HasPrefix(req.ProviderResourceId, "instant-customer-") {
596+
backend = s.dedicatedPostgresBackend
597+
}
598+
599+
// Only the k8s backend applies a per-role CONNECTION LIMIT at provision
600+
// time. Every other backend would return the same skip via Regrade, but
601+
// checking here keeps the contract explicit and avoids a needless k8s/DB
602+
// round-trip.
603+
if _, ok := backend.(*postgres.K8sBackend); !ok {
604+
slog.Info("server.RegradeResource",
605+
"token", req.Token, "tier", req.Tier,
606+
"applied", false, "skip_reason", "backend has no per-role connection cap",
607+
"request_id", req.RequestId)
608+
return &provisionerv1.RegradeResponse{
609+
Applied: false,
610+
SkipReason: "backend has no per-role connection cap",
611+
}, nil
612+
}
613+
614+
// Connection cap comes from the shared plans registry, keeping it
615+
// consistent with the cap the agent API reports and the k8s backend
616+
// applies at provision time. -1 = unlimited; passed through verbatim.
617+
connLimit := regradeConnLimits.ConnectionsLimit(req.Tier, "postgres")
618+
619+
result, err := backend.Regrade(ctx, req.Token, req.ProviderResourceId, connLimit)
620+
if err != nil {
621+
return nil, mapError("RegradeResource.postgres", err)
622+
}
623+
624+
slog.Info("server.RegradeResource",
625+
"token", req.Token, "tier", req.Tier,
626+
"applied", result.Applied,
627+
"applied_conn_limit", result.AppliedConnLimit,
628+
"skip_reason", result.SkipReason,
629+
"request_id", req.RequestId)
630+
631+
return &provisionerv1.RegradeResponse{
632+
Applied: result.Applied,
633+
AppliedConnLimit: int32(result.AppliedConnLimit),
634+
SkipReason: result.SkipReason,
635+
}, nil
636+
}
637+
549638
// mapError converts backend errors to appropriate gRPC status codes.
550639
func mapError(op string, err error) error {
551640
if err == nil {

internal/server/server_test.go

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ type mockPostgresBackend struct {
2525
provision func(ctx context.Context, token, tier string) (*postgres.Credentials, error)
2626
storageBytes func(ctx context.Context, token, providerResourceID string) (int64, error)
2727
deprovision func(ctx context.Context, token, providerResourceID string) error
28+
regrade func(ctx context.Context, token, providerResourceID string, connLimit int) (postgres.RegradeResult, error)
2829
}
2930

3031
func (m *mockPostgresBackend) Provision(ctx context.Context, token, tier string) (*postgres.Credentials, error) {
@@ -52,6 +53,13 @@ func (m *mockPostgresBackend) Deprovision(ctx context.Context, token, id string)
5253
return nil
5354
}
5455

56+
func (m *mockPostgresBackend) Regrade(ctx context.Context, token, id string, connLimit int) (postgres.RegradeResult, error) {
57+
if m.regrade != nil {
58+
return m.regrade(ctx, token, id, connLimit)
59+
}
60+
return postgres.RegradeResult{Applied: false, SkipReason: "backend has no per-role connection cap"}, nil
61+
}
62+
5563
type mockRedisBackend struct {
5664
provision func(ctx context.Context, token, tier string) (*redis.Credentials, error)
5765
storageBytes func(ctx context.Context, token, providerResourceID string) (int64, error)
@@ -357,6 +365,55 @@ func TestGetStorageBytes_Storage_NilMinIOBackend_ReturnsZero(t *testing.T) {
357365
}
358366
}
359367

368+
// --- RegradeResource tests ---
369+
370+
func TestRegradeResource_EmptyToken_ReturnsInvalidArgument(t *testing.T) {
371+
srv := newTestServer()
372+
_, err := srv.RegradeResource(context.Background(), &provisionerv1.RegradeRequest{
373+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES,
374+
Tier: "pro",
375+
})
376+
assertCode(t, err, codes.InvalidArgument)
377+
}
378+
379+
func TestRegradeResource_NonPostgres_SkipsWithReason(t *testing.T) {
380+
srv := newTestServer()
381+
resp, err := srv.RegradeResource(context.Background(), &provisionerv1.RegradeRequest{
382+
Token: "tok-123",
383+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_REDIS,
384+
Tier: "pro",
385+
})
386+
if err != nil {
387+
t.Fatalf("unexpected error: %v", err)
388+
}
389+
if resp.Applied {
390+
t.Fatal("expected applied=false for non-postgres resource")
391+
}
392+
if resp.SkipReason != "unsupported resource type for regrade" {
393+
t.Fatalf("unexpected skip_reason: %q", resp.SkipReason)
394+
}
395+
}
396+
397+
func TestRegradeResource_NonK8sBackend_SkipsWithReason(t *testing.T) {
398+
// newTestServer wires the shared mockPostgresBackend, which is not a
399+
// *postgres.K8sBackend — the server should skip without touching it.
400+
srv := newTestServer()
401+
resp, err := srv.RegradeResource(context.Background(), &provisionerv1.RegradeRequest{
402+
Token: "tok-123",
403+
ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES,
404+
Tier: "pro",
405+
})
406+
if err != nil {
407+
t.Fatalf("unexpected error: %v", err)
408+
}
409+
if resp.Applied {
410+
t.Fatal("expected applied=false for non-k8s backend")
411+
}
412+
if resp.SkipReason != "backend has no per-role connection cap" {
413+
t.Fatalf("unexpected skip_reason: %q", resp.SkipReason)
414+
}
415+
}
416+
360417
// --- helper ---
361418

362419
func assertCode(t *testing.T, err error, want codes.Code) {

0 commit comments

Comments
 (0)