diff --git a/app.config.ts b/app.config.ts index 73d482d..154556c 100644 --- a/app.config.ts +++ b/app.config.ts @@ -129,7 +129,7 @@ export default ({ config }: ConfigContext): ExpoConfig => ({ [ "@sentry/react-native/expo", { - url: "https://sentry.io/", + url: "https://de.sentry.io/", project: "mapvault", organization: "patrick-alvarez", }, diff --git a/docs/database.md b/docs/database.md index 91cae8b..cce70fd 100644 --- a/docs/database.md +++ b/docs/database.md @@ -1,6 +1,6 @@ # Database Schema -9 tables in the `public` schema, all with RLS enabled. Supabase Auth manages `auth.users`; everything else lives here. +9 application tables in the `public` schema plus 1 internal infrastructure table (`drift_check_runs`), all with RLS enabled. Supabase Auth manages `auth.users`; everything else lives here. ## Entity Relationships @@ -171,6 +171,22 @@ Invite tokens for sharing maps with other users. **RLS:** Map members can SELECT invites for their maps. Owners can INSERT (enforced via `create-invite` Edge Function which checks premium entitlement). +## Infrastructure Tables + +These tables back internal machinery, not user-facing data. They are intentionally excluded from the entity-relationship diagram above. + +### drift_check_runs + +Mutex row for the `rc-entitlement-drift-check` Edge Function so overlapping cron fires don't double-report drift. See `docs/payments.md` → "Drift Health Check". + +| Column | Type | Constraints | Notes | +|--------|------|-------------|-------| +| job_name | text | PK | Stable string `'rc-entitlement-drift-check'` | +| started_at | timestamptz | NOT NULL, default `now()` | Set on every acquire | +| finished_at | timestamptz | nullable | NULL = run in flight; set by `release_drift_check_lock` | + +**RLS:** Enabled with **no policies** (default-deny). 
The `SECURITY DEFINER` helpers `try_acquire_drift_check_lock(text, interval)` and `release_drift_check_lock(text)` run as table owner and bypass RLS; `anon` and `authenticated` cannot read or write via PostgREST. + ## Triggers ### on_auth_user_created → `handle_new_user()` @@ -217,3 +233,5 @@ SECURITY DEFINER function that checks if the current user is a member of a map. | `20260223000001_cleanup_orphaned_places_on_delete.sql` | Add orphaned places cleanup to deletion trigger | | `20260304000001_freemium_roles_redesign.sql` | Rename `editor` → `contributor`, add `member` role, restrict RLS to owner/contributor writes, add CHECK constraints | | `20260305000001_fix_map_place_tags_cross_map.sql` | Fix cross-map tag validation in map_place_tags INSERT and DELETE RLS policies | +| `20260513000001_enable_pg_cron_and_drift_check_lock.sql` | Enable `pg_cron`; create `drift_check_runs` + `try_acquire_drift_check_lock` / `release_drift_check_lock` RPCs | +| `20260513000002_schedule_rc_entitlement_drift_check.sql` | Schedule `rc-entitlement-drift-check` cron job (every 6h at :17 UTC) with bearer from `vault.decrypted_secrets` | diff --git a/docs/deployment.md b/docs/deployment.md index aad2ace..83e5f3a 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -26,10 +26,10 @@ Deploy individual functions after changes: supabase functions deploy --no-verify-jwt ``` -Deploy all 5 at once: +Deploy all 7 at once: ```bash -for fn in create-map add-place accept-invite revenuecat-webhook delete-account; do +for fn in create-map add-place accept-invite create-invite revenuecat-webhook rc-entitlement-drift-check delete-account; do supabase functions deploy "$fn" --no-verify-jwt done ``` @@ -51,7 +51,9 @@ Server-side secrets (not in `.env`): | Secret | Purpose | |--------|---------| | `REVENUECAT_WEBHOOK_SECRET` | Authenticates RevenueCat webhook requests | -| `REVENUECAT_SECRET_API_KEY` | Admin API key for subscriber deletion | +| `REVENUECAT_SECRET_API_KEY` | Admin API key (used by 
`delete-account` and `rc-entitlement-drift-check`) | +| `REVENUECAT_PROJECT_ID` | RC project id (used by `rc-entitlement-drift-check`) | +| `RC_DRIFT_CHECK_INVOKE_SECRET` | Bearer for pg_cron → `rc-entitlement-drift-check`; must mirror `vault.secrets.rc_drift_check_invoke_secret`. See `docs/payments.md` → "Drift Health Check" for the first-time setup and rotation runbook. | `SUPABASE_URL` and `SUPABASE_SERVICE_ROLE_KEY` are auto-injected — no manual setup needed. diff --git a/docs/edge-functions.md b/docs/edge-functions.md index d4f4677..d649630 100644 --- a/docs/edge-functions.md +++ b/docs/edge-functions.md @@ -1,6 +1,6 @@ # Edge Functions Reference -6 Supabase Edge Functions that enforce business rules that can't be trusted to the client. All deployed with `--no-verify-jwt` and validate auth internally — most use `auth.getUser()` with a user Bearer token, except `revenuecat-webhook` which validates a shared webhook secret. +7 Supabase Edge Functions that enforce business rules that can't be trusted to the client. All deployed with `--no-verify-jwt` and validate auth internally — most use `auth.getUser()` with a user Bearer token, except `revenuecat-webhook` and `rc-entitlement-drift-check` which validate a shared bearer secret. ## Overview @@ -11,6 +11,7 @@ | `accept-invite` | Accept an invite token and join a map | Validates expiry, max uses, duplicates | | `create-invite` | Create an invite link for a map | Premium owners only | | `revenuecat-webhook` | Sync purchase events to entitlement | Maps RC events → `profiles.entitlement` | +| `rc-entitlement-drift-check` | Scheduled reconciliation of RC active entitlements vs `profiles.entitlement` | Every 6h at :17 UTC via `pg_cron`; Sentry alert on drift > 0 | | `delete-account` | Delete user and all associated data | RC cleanup (best-effort) + auth deletion | --- @@ -308,3 +309,55 @@ No request body required. 
`auth.users` (delete) → cascading cleanup via trigger handles all public schema tables See `docs/account-deletion.md` for the full deletion pipeline and what gets preserved vs deleted. + +--- + +## rc-entitlement-drift-check + +Out-of-band health check that walks every RevenueCat customer and reconciles their active entitlements against `profiles.entitlement`. Drift > 0 fires a Sentry event with a stable fingerprint so consecutive runs collapse into one issue; drift = 0 produces a JSON heartbeat log only. Invoked by `pg_cron` every 6 hours at `:17` past the hour (UTC). + +**Auth:** Invoke-secret bearer (NOT a user token) + +### Request + +``` +POST /functions/v1/rc-entitlement-drift-check +Authorization: Bearer <RC_DRIFT_CHECK_INVOKE_SECRET> +``` + +No request body. The cron migration (`20260513000002_schedule_rc_entitlement_drift_check.sql`) pulls the bearer live from `vault.decrypted_secrets` on each fire. + +### Responses + +| Status | Body | When | +|--------|------|------| +| 200 | `{ "drift_count": N }` | Run completed; `N == 0` is the healthy heartbeat (logs only). `N > 0` fires a single Sentry event with stable fingerprint, still 200 — drift is not an HTTP-level failure. 
| +| 200 | `{ "message": "Concurrent run skipped" }` | Another run was in flight (table-row mutex) | +| 401 | `{ "error": "Unauthorized" }` | Wrong/missing invoke secret; Sentry `rc_drift_check_auth_fail` fires | +| 500 | `{ "error": "Internal server error" }` | RC API failure, cursor parse failure, or missing env vars; Sentry exception fires | + +### Drift Categories + +| Tag | Meaning | Sentry level | +|---|---|---| +| `count_missing` (`drift_premium_missing`) | RC says active premium, Supabase says `free` — the 2026-05-12 outage class | `error` | +| `count_stale` (`drift_premium_stale`) | Supabase says `premium`, RC has no active premium | `warning` | +| `count_orphan` (`drift_orphan`) | RC active premium but no Supabase profile matches | `warning` | + +Each Sentry event includes up to 50 ids per category in `extra`; full totals are in the tag `count_*` values. + +### Concurrency + +A table-row mutex on `public.drift_check_runs` (default-deny RLS) prevents overlapping runs. Stale rows (`started_at` > 10 minutes ago with `finished_at IS NULL`) are replaced on the next acquire attempt — there is no background sweeper, but the next 6-hourly cron fire is the heal trigger, so a crashed run can't block the cron for more than one cycle. + +### Secrets Required + +- `RC_DRIFT_CHECK_INVOKE_SECRET` — bearer that pg_cron uses to invoke this function; must mirror `vault.secrets.rc_drift_check_invoke_secret` +- `REVENUECAT_SECRET_API_KEY` — RC v2 admin key (shared with `delete-account`) +- `REVENUECAT_PROJECT_ID` — RC project id (`proj18594bd9`) + +### Tables Written + +`drift_check_runs` (mutex only; never touches `profiles`) + +See `docs/payments.md` → "Drift Health Check" for the operator runbook, secret rotation, and first-time setup. 
diff --git a/docs/payments.md b/docs/payments.md index 5c07877..c653e07 100644 --- a/docs/payments.md +++ b/docs/payments.md @@ -66,6 +66,104 @@ Both keys are empty in development builds (`APP_VARIANT=development`), disabling The webhook is platform-agnostic — RevenueCat normalizes events from both Apple and Google into the same format. +### Drift Health Check + +The webhook is the realtime path. The drift health check is the out-of-band backstop for deliveries that never arrive — RC retry-queue expiry, network blip, deploy mid-delivery, or in-band Sentry outages. It runs every 6 hours at `:17` past the hour (UTC), via a `pg_cron` job that POSTs to the `rc-entitlement-drift-check` Edge Function. + +**What it does:** + +1. Lists every RevenueCat customer for the project (paginated via `starting_after`, 100 per page). +2. Reads `id, entitlement` from every `profiles` row. +3. Classifies each user into one of three drift categories. Healthy users are not reported. +4. If drift > 0, emits **one** Sentry event with a stable fingerprint so consecutive runs collapse into a single issue. + +**Drift categories:** + +| Category | Meaning | Sentry level | +|---|---|---| +| `drift_premium_missing` | RC says active premium, Supabase says `free`. **This is the 2026-05-12 outage class** — user paid but is locked out. | `error` | +| `drift_premium_stale` | Supabase says `premium`, RC has no active premium (refund/expiration didn't propagate, or a manual grant has no RC backing). | `warning` | +| `drift_orphan` | RC has active premium but no Supabase profile matches. Usually a deleted account whose RC record wasn't cleaned up, but worth eyeballing. | `warning` | + +The Sentry event's `extra` payload includes the first 50 affected ids per category. Use the `count_*` tags for full totals. 
+ +**Reading the function logs:** + +Every run that completes prints one heartbeat to the Edge Function logs, whether or not drift was found: + +```json +{"event":"drift_check_complete","drift_count":0,"count_missing":0,"count_stale":0,"count_orphan":0,"rc_customer_count":63,"supabase_profile_count":189,"run_at":"..."} +``` + +`mcp__supabase__get_logs --service edge-function` is the fastest way to find it. A missing heartbeat means the cron job didn't fire or the run ended early (401, 500, or a `concurrent_skip` log line), which is itself a signal worth investigating. + +**Operator runbook — drift event fires:** + +1. Open the Sentry issue. Look at `tags.count_missing` first; that's the urgent class. +2. Cross-check one affected id with `mcp__revenuecat__get-customer` and `select entitlement from profiles where id = '<app_user_id>'`. If they disagree as the event claims, the webhook is the prime suspect — same diagnostic chain as the 2026-05-12 incident. +3. Fix the underlying webhook problem (secret drift, dead-letter, missing event). For acute relief on a specific user, replay the RC event from the dashboard or manually `update profiles set entitlement = 'premium' where id = '<app_user_id>'`. +4. Once the next scheduled run logs `drift_count: 0`, **manually resolve the Sentry issue**. The fingerprint is stable, so the issue does not auto-resolve. + +**No allowlist policy:** there is intentionally no mechanism to mute a known-drifted user. If a user is drifted, it's a bug. If a beta tester ever needs grandfathered premium without RC backing, fix it by issuing them an RC entitlement (RC supports manual grants); do not add a Supabase-side exception. + +**Fate-sharing trade-off:** the drift check runs on Supabase, so a Supabase outage will take down the check at the same time as the webhook it backstops. The alternative — running the check as a GitHub Action — was rejected as too much new infrastructure for a small marginal robustness gain. Revisit if a future incident takes out Supabase scheduling specifically. 
+ +**Secret-rotation runbook (`rc_drift_check_invoke_secret`):** + +The bearer that pg_cron uses to invoke the Edge Function lives in two places. **Both must change in lockstep** — the 2026-05-12 outage was caused by exactly this kind of multi-location single-secret drift. + +```bash +NEW_SECRET=$(openssl rand -hex 32) +supabase secrets set RC_DRIFT_CHECK_INVOKE_SECRET="$NEW_SECRET" +``` + +Then in the Supabase SQL editor: + +```sql +select vault.update_secret( + (select id from vault.secrets where name = 'rc_drift_check_invoke_secret'), + '<value of $NEW_SECRET>' +); + +-- VERIFY the vault row actually changed. Vault function signatures have +-- varied across versions, and a silent no-op is exactly the failure mode +-- that motivated this whole feature. +select decrypted_secret = '<value of $NEW_SECRET>' as rotated_ok + from vault.decrypted_secrets + where name = 'rc_drift_check_invoke_secret'; +``` + +If `rotated_ok` is not `true`, the function-env and vault values are now out of sync. Stop, investigate, and re-rotate before the next cron fire. + +After both have changed and the vault row is verified, manually fire one run to confirm: `select cron.run_job((select jobid from cron.job where jobname = 'rc-entitlement-drift-check'));` and watch for a 200 + heartbeat in the function logs. + +**First-time setup (only once per environment):** + +```bash +# 1. Generate and set the function env var +INVOKE_SECRET=$(openssl rand -hex 32) +supabase secrets set RC_DRIFT_CHECK_INVOKE_SECRET="$INVOKE_SECRET" +supabase secrets set REVENUECAT_PROJECT_ID="proj18594bd9" +# REVENUECAT_SECRET_API_KEY is already set (used by delete-account) + +# 2. 
Deploy the function +supabase functions deploy rc-entitlement-drift-check --no-verify-jwt +``` + +Then in the Supabase SQL editor (Vault is not exposed via the supabase CLI): + +```sql +select vault.create_secret( + '<value of $INVOKE_SECRET>', + 'rc_drift_check_invoke_secret', + 'Bearer for the rc-entitlement-drift-check Edge Function' +); +``` + +**Order matters:** apply the migrations **after** the vault secret is created — `supabase db push`. If migrations land first, the cron job schedules immediately and the next `:17` fire concatenates a NULL bearer (the vault subquery returns NULL when the secret row doesn't exist; `'Bearer ' || NULL = NULL` in SQL). The Edge Function then sees either a missing or null Authorization header and returns 401, producing a spurious `rc_drift_check_auth_fail` Sentry event with `reason: mismatch` (or `reason: missing_secret` if the `RC_DRIFT_CHECK_INVOKE_SECRET` function env var is also unset) on every fire until the vault row is created. + +The cron job will start firing at the next `00:17`, `06:17`, `12:17`, or `18:17` UTC mark. + ### Paywall - **Annual-only** subscription at €9.99/year @@ -107,6 +205,9 @@ The webhook is platform-agnostic — RevenueCat normalizes events from both Appl - **Edge Function secrets** (set via dashboard or CLI): - `REVENUECAT_WEBHOOK_SECRET` — must match the Bearer token configured in RevenueCat webhook settings + - `REVENUECAT_SECRET_API_KEY` — RC v2 REST API key, used by `delete-account` and `rc-entitlement-drift-check` + - `REVENUECAT_PROJECT_ID` — RC project id, used by `rc-entitlement-drift-check` + - `RC_DRIFT_CHECK_INVOKE_SECRET` — Bearer for pg_cron → `rc-entitlement-drift-check`; must mirror `vault.secrets.rc_drift_check_invoke_secret` - `SUPABASE_URL` and `SUPABASE_SERVICE_ROLE_KEY` — auto-injected, no manual setup needed - **Deploy edge functions:** ```bash @@ -114,6 +215,7 @@ The webhook is platform-agnostic — RevenueCat normalizes events from both Appl supabase functions deploy create-map supabase functions deploy add-place supabase functions deploy create-invite + supabase functions deploy rc-entitlement-drift-check ``` ### Environment 
Variables @@ -123,6 +225,9 @@ The webhook is platform-agnostic — RevenueCat normalizes events from both Appl | `EXPO_PUBLIC_REVENUECAT_API_KEY` | `.env` + EAS secrets | RevenueCat Apple API key, read at build time | | `EXPO_PUBLIC_REVENUECAT_GOOGLE_API_KEY` | `.env` + EAS secrets | RevenueCat Google API key, read at build time | | `REVENUECAT_WEBHOOK_SECRET` | Supabase Edge Function secrets | Webhook auth, server-side only | +| `REVENUECAT_SECRET_API_KEY` | Supabase Edge Function secrets | RC v2 REST API admin key (`delete-account`, `rc-entitlement-drift-check`) | +| `REVENUECAT_PROJECT_ID` | Supabase Edge Function secrets | RC project id, used by `rc-entitlement-drift-check` | +| `RC_DRIFT_CHECK_INVOKE_SECRET` | Supabase Edge Function secrets **and** Supabase Vault | Bearer that pg_cron uses to invoke `rc-entitlement-drift-check`; rotate in both places together | --- diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index c8ad2a9..4ada47c 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -87,10 +87,22 @@ Freemium limits are enforced server-side in Edge Functions, so there's no client **Symptom:** Purchase succeeds in RevenueCat dashboard but `profiles.entitlement` stays `free`. **Check:** -1. Is the webhook URL correct in RevenueCat dashboard? (`https://.supabase.co/functions/v1/revenuecat-webhook`) -2. Does the Bearer token in RevenueCat match `REVENUECAT_WEBHOOK_SECRET` in Supabase secrets? -3. Is the Edge Function deployed? Check with: `curl -s -o /dev/null -w "%{http_code}" https://.supabase.co/functions/v1/revenuecat-webhook` -4. Check Supabase Edge Function logs for errors +1. **Sentry first** — search for an open `rc_entitlement_drift` or `revenuecat_webhook_auth_fail` issue. The scheduled drift check (`rc-entitlement-drift-check`, every 6h) is the fastest signal of in-band webhook failure. See `docs/payments.md` → "Drift Health Check" for the operator runbook. +2. Is the webhook URL correct in RevenueCat dashboard? 
(`https://<project-ref>.supabase.co/functions/v1/revenuecat-webhook`) +3. Does the Bearer token in RevenueCat match `REVENUECAT_WEBHOOK_SECRET` in Supabase secrets? +4. Is the Edge Function deployed? Check with: `curl -s -o /dev/null -w "%{http_code}" https://<project-ref>.supabase.co/functions/v1/revenuecat-webhook` +5. Check Supabase Edge Function logs for errors + +### Drift Check Alerts Firing + +**Symptom:** Sentry shows an open `rc_entitlement_drift` issue tagged `function: rc-entitlement-drift-check`. + +**Read the event:** `extra.drift_premium_missing` (highest priority — paid users locked out, same class as the 2026-05-12 outage), `extra.drift_premium_stale` (refund/expiration didn't propagate), and `extra.drift_orphan` (RC has an active-premium customer but no Supabase profile matches) list the affected `app_user_id`s. The `count_*` tags carry the full totals; `extra` arrays are capped at 50 ids each. + +**Check:** +- Cross-check one id from `drift_premium_missing` against `mcp__revenuecat__get-customer` and `select entitlement from profiles where id = '<app_user_id>'`. If they disagree as the event claims, the webhook is the prime suspect — same diagnostic chain as "Webhook Not Updating Entitlement" above. +- For acute relief: replay the RC event from the dashboard, or `update profiles set entitlement = 'premium' where id = '<app_user_id>'`. +- The Sentry issue uses a stable fingerprint and **does not auto-resolve**. Once the next 6h run logs `drift_count: 0`, manually resolve the issue. 
## Google Places API diff --git a/supabase/functions/rc-entitlement-drift-check/index.ts b/supabase/functions/rc-entitlement-drift-check/index.ts new file mode 100644 index 0000000..68e529a --- /dev/null +++ b/supabase/functions/rc-entitlement-drift-check/index.ts @@ -0,0 +1,286 @@ +import { serve } from "https://deno.land/std@0.168.0/http/server.ts"; +import { createClient } from "https://esm.sh/@supabase/supabase-js@2"; +import * as Sentry from "npm:@sentry/node"; + +Sentry.init({ + dsn: Deno.env.get("SENTRY_DSN"), + tracesSampleRate: 0, +}); + +const RC_API_BASE = "https://api.revenuecat.com/v2"; +const JOB_NAME = "rc-entitlement-drift-check"; +const MAX_EXTRA_IDS = 50; +const RC_PAGE_LIMIT = 100; +const PROFILE_PAGE_SIZE = 1000; // rows requested per profiles page (Supabase's default max-rows) + +interface RcEntitlement { + id: string; + lookup_key: string; +} + +interface RcCustomerActiveEntitlement { + entitlement_id: string; + // expires_at is intentionally not checked client-side — we trust RC's + // server-side active filter, which keeps in-grace-period entitlements active. + expires_at: number | null; +} + +interface RcCustomer { + id: string; + active_entitlements?: { + items?: RcCustomerActiveEntitlement[]; + }; +} + +interface RcListResponse<T> { + items: T[]; + next_page: string | null; +} + +serve(async (req) => { + const runAt = new Date().toISOString(); + + try { + // 1. Bearer auth + const authHeader = req.headers.get("Authorization"); + const invokeSecret = Deno.env.get("RC_DRIFT_CHECK_INVOKE_SECRET"); + + if (!invokeSecret || authHeader !== `Bearer ${invokeSecret}`) { + const reason = !invokeSecret ? "missing_secret" : "mismatch"; + Sentry.captureMessage("rc_drift_check_auth_fail", { + level: "error", + tags: { function: JOB_NAME, reason }, + }); + // Deno Edge isolates terminate when Response is returned; Sentry's + // async transport may drop queued events without an explicit flush. 
+ await Sentry.flush(2000); + return new Response( + JSON.stringify({ error: "Unauthorized" }), + { status: 401, headers: { "Content-Type": "application/json" } }, + ); + } + + const supabase = createClient( + Deno.env.get("SUPABASE_URL")!, + Deno.env.get("SUPABASE_SERVICE_ROLE_KEY")!, + ); + + // 2. Mutex: skip if another run is already in flight (per-job table row, + // stale entries auto-replaced after 10 minutes by the RPC). + const { data: lockAcquired, error: lockError } = await supabase.rpc( + "try_acquire_drift_check_lock", + { p_job_name: JOB_NAME }, + ); + if (lockError) { + throw new Error(`lock_acquire_failed: ${lockError.message}`); + } + if (!lockAcquired) { + console.log(JSON.stringify({ + event: "concurrent_skip", + job: JOB_NAME, + run_at: runAt, + })); + return new Response( + JSON.stringify({ message: "Concurrent run skipped" }), + { status: 200, headers: { "Content-Type": "application/json" } }, + ); + } + + try { + const projectId = Deno.env.get("REVENUECAT_PROJECT_ID"); + const rcKey = Deno.env.get("REVENUECAT_SECRET_API_KEY"); + if (!projectId || !rcKey) { + throw new Error("missing_revenuecat_env"); + } + + // 3. Resolve "premium" entitlement to its RC-internal id. The active + // entitlement objects on customers only carry the internal id, so we + // need this mapping to compare against MapVault's "premium" string. + const premiumEntitlementId = await resolvePremiumEntitlementId( + projectId, + rcKey, + ); + if (!premiumEntitlementId) { + throw new Error("premium_entitlement_not_found"); + } + + // 4. Walk every RC customer, collect those whose active entitlements + // include premium. Pagination via starting_after cursor. 
+ const rcPremiumIds = new Set<string>(); + let rcCustomerCount = 0; + let cursor: string | null = null; + do { + const url = new URL(`${RC_API_BASE}/projects/${projectId}/customers`); + url.searchParams.set("limit", String(RC_PAGE_LIMIT)); + if (cursor) url.searchParams.set("starting_after", cursor); + + const res = await fetch(url.toString(), { + headers: { Authorization: `Bearer ${rcKey}` }, + signal: AbortSignal.timeout(15_000), + }); + if (!res.ok) { + throw new Error(`rc_list_customers_${res.status}`); + } + const page = (await res.json()) as RcListResponse<RcCustomer>; + rcCustomerCount += page.items.length; + for (const customer of page.items) { + const items = customer.active_entitlements?.items ?? []; + if (items.some((e) => e.entitlement_id === premiumEntitlementId)) { + rcPremiumIds.add(customer.id); + } + } + if (page.next_page) { + const next = new URL(page.next_page, "https://api.revenuecat.com") + .searchParams.get("starting_after"); + if (!next) { + // RC said "more results exist" but didn't give us a cursor we + // understand. Refuse to silently undercount — that would give + // a false clean drift report. + throw new Error("rc_next_page_parse_failed"); + } + cursor = next; + } else { + cursor = null; + } + } while (cursor); + + // 5. Read Supabase entitlement state. PostgREST caps un-ranged selects + // (1000 rows by default on Supabase), so page with .range() until a page + // comes back empty. A truncated read would misreport real profiles as + // drift_orphan and hide drift_premium_stale rows. 
+ const profiles: { id: string; entitlement: string }[] = []; + for (;;) { + const { data: profilePage, error: profilesError } = await supabase + .from("profiles") + .select("id, entitlement") + .order("id") + .range(profiles.length, profiles.length + PROFILE_PAGE_SIZE - 1); + if (profilesError) { + throw new Error(`supabase_profiles_${profilesError.message}`); + } + if (!profilePage || profilePage.length === 0) break; + profiles.push(...profilePage); + } + const supabasePremiumIds = new Set<string>( + profiles + .filter((p) => p.entitlement === "premium") + .map((p) => p.id), + ); + const supabaseAllIds = new Set<string>( + profiles.map((p) => p.id), + ); + + // 6. Classify drift. See docs/payments.md "Drift Health Check" for the + // category definitions and what each one means operationally. 
+ const driftPremiumMissing: string[] = []; // RC premium, Supabase free + const driftPremiumStale: string[] = []; // Supabase premium, RC not premium + const driftOrphan: string[] = []; // RC premium with no Supabase profile + + for (const rcId of rcPremiumIds) { + if (!supabaseAllIds.has(rcId)) { + driftOrphan.push(rcId); + } else if (!supabasePremiumIds.has(rcId)) { + driftPremiumMissing.push(rcId); + } + } + for (const sbId of supabasePremiumIds) { + if (!rcPremiumIds.has(sbId)) { + driftPremiumStale.push(sbId); + } + } + + const countMissing = driftPremiumMissing.length; + const countStale = driftPremiumStale.length; + const countOrphan = driftOrphan.length; + const driftCount = countMissing + countStale + countOrphan; + + // 7. Single Sentry event when drift > 0; stable fingerprint collapses + // consecutive runs into one issue. No event on the healthy path — + // the heartbeat below is the only "still running" signal. + if (driftCount > 0) { + Sentry.captureMessage("rc_entitlement_drift", { + level: countMissing > 0 ? "error" : "warning", + fingerprint: ["rc-entitlement-drift"], + tags: { + function: JOB_NAME, + context: "rc_entitlement_drift", + count_missing: String(countMissing), + count_stale: String(countStale), + count_orphan: String(countOrphan), + }, + extra: { + drift_premium_missing: driftPremiumMissing.slice(0, MAX_EXTRA_IDS), + drift_premium_stale: driftPremiumStale.slice(0, MAX_EXTRA_IDS), + drift_orphan: driftOrphan.slice(0, MAX_EXTRA_IDS), + rc_customer_count: rcCustomerCount, + supabase_profile_count: supabaseAllIds.size, + run_at: runAt, + }, + }); + // Flush before the function returns so the Deno isolate doesn't + // tear down the Sentry transport mid-send. 
+ await Sentry.flush(2000); + } + + console.log(JSON.stringify({ + event: "drift_check_complete", + job: JOB_NAME, + run_at: runAt, + drift_count: driftCount, + count_missing: countMissing, + count_stale: countStale, + count_orphan: countOrphan, + rc_customer_count: rcCustomerCount, + supabase_profile_count: supabaseAllIds.size, + })); + + return new Response( + JSON.stringify({ drift_count: driftCount }), + { status: 200, headers: { "Content-Type": "application/json" } }, + ); + } finally { + // Always release the lock so the next scheduled run isn't blocked, + // even if the work above threw. The stale-after fallback in the RPC + // is the safety net for genuinely crashed runs. + const { error: releaseError } = await supabase.rpc( + "release_drift_check_lock", + { p_job_name: JOB_NAME }, + ); + if (releaseError) { + console.error("release_drift_check_lock failed:", releaseError.message); + } + } + } catch (err) { + console.error(`${JOB_NAME} error:`, err); + Sentry.captureException(err, { tags: { function: JOB_NAME } }); + // Flush before the 500 return so the exception event reaches Sentry + // before the Deno isolate tears down. + await Sentry.flush(2000); + return new Response( + JSON.stringify({ error: "Internal server error" }), + { status: 500, headers: { "Content-Type": "application/json" } }, + ); + } +}); + +async function resolvePremiumEntitlementId( + projectId: string, + rcKey: string, +): Promise<string | null> { + // MapVault has one entitlement today (lookup_key="premium"). If the project + // ever has more than 100 entitlements, this needs pagination via + // starting_after — but well before that point, the data model has changed + // enough that the drift check itself should be re-evaluated. 
+ const res = await fetch( + `${RC_API_BASE}/projects/${projectId}/entitlements?limit=100`, + { + headers: { Authorization: `Bearer ${rcKey}` }, + signal: AbortSignal.timeout(10_000), + }, + ); + if (!res.ok) { + throw new Error(`rc_list_entitlements_${res.status}`); + } + const data = (await res.json()) as RcListResponse<RcEntitlement>; + return data.items.find((e) => e.lookup_key === "premium")?.id ?? null; +} diff --git a/supabase/migrations/20260513000001_enable_pg_cron_and_drift_check_lock.sql b/supabase/migrations/20260513000001_enable_pg_cron_and_drift_check_lock.sql new file mode 100644 index 0000000..a4f9aca --- /dev/null +++ b/supabase/migrations/20260513000001_enable_pg_cron_and_drift_check_lock.sql @@ -0,0 +1,63 @@ +-- Enables pg_cron and creates the mutex table + RPCs used by the +-- rc-entitlement-drift-check Edge Function to skip overlapping runs. +-- +-- pg_net is already installed on this project; pg_cron is enabled here for +-- the first time so the next migration can schedule the drift check. + +create extension if not exists pg_cron; + +create table if not exists public.drift_check_runs ( + job_name text primary key, + started_at timestamptz not null default now(), + finished_at timestamptz +); + +-- RLS with no policies = default-deny. anon/authenticated cannot read or +-- write the lock row via PostgREST. The SECURITY DEFINER RPCs below run as +-- the table owner and bypass RLS, so the cron path still works. +alter table public.drift_check_runs enable row level security; + +-- Try to claim the lock for `p_job_name`. Returns true if the caller now owns +-- the run slot. Replaces stale rows (started > p_stale_after ago and never +-- finished) so a crashed run cannot block all future runs forever. +-- p_stale_after is the safety net for runs that died before calling release. 
+-- 10 minutes leaves headroom over the worst-case paginated run (RC fetch +-- pages * 15s timeout each + Supabase reads + Sentry capture) while still +-- self-healing well before the next 6-hour cron fire. +create or replace function public.try_acquire_drift_check_lock( + p_job_name text, + p_stale_after interval default interval '10 minutes' +) +returns boolean +language plpgsql +security definer +set search_path = public +as $$ +declare + v_rows_affected int; +begin + insert into public.drift_check_runs (job_name, started_at, finished_at) + values (p_job_name, now(), null) + on conflict (job_name) do update + set started_at = now(), finished_at = null + where drift_check_runs.finished_at is not null + or drift_check_runs.started_at < now() - p_stale_after; + + get diagnostics v_rows_affected = row_count; + return v_rows_affected > 0; +end; +$$; + +create or replace function public.release_drift_check_lock(p_job_name text) +returns void +language sql +security definer +set search_path = public +as $$ + update public.drift_check_runs + set finished_at = now() + where job_name = p_job_name; +$$; + +revoke all on function public.try_acquire_drift_check_lock(text, interval) from public, anon, authenticated; +revoke all on function public.release_drift_check_lock(text) from public, anon, authenticated; diff --git a/supabase/migrations/20260513000002_schedule_rc_entitlement_drift_check.sql b/supabase/migrations/20260513000002_schedule_rc_entitlement_drift_check.sql new file mode 100644 index 0000000..ad41f32 --- /dev/null +++ b/supabase/migrations/20260513000002_schedule_rc_entitlement_drift_check.sql @@ -0,0 +1,35 @@ +-- Schedules the rc-entitlement-drift-check Edge Function to run every 6h. 
+-- +-- PREREQUISITE (run once, out-of-band, before this migration): +-- +-- select vault.create_secret( +-- '<same value as RC_DRIFT_CHECK_INVOKE_SECRET>', +-- 'rc_drift_check_invoke_secret', +-- 'Bearer for the rc-entitlement-drift-check Edge Function' +-- ); +-- +-- The same bearer must also be set as the function env var +-- RC_DRIFT_CHECK_INVOKE_SECRET. See docs/payments.md "Drift Health Check" +-- for the deploy + rotation runbook. +-- +-- The function URL hardcodes the project ref. If the Supabase project is +-- ever migrated to a new ref, reschedule the job with the new URL. + +select cron.schedule( + 'rc-entitlement-drift-check', + '17 */6 * * *', -- UTC; offset from :00 to avoid bunching with other cron jobs + $$ + select net.http_post( + url := 'https://doycewmbehxdqfumdgke.supabase.co/functions/v1/rc-entitlement-drift-check', + headers := jsonb_build_object( + 'Content-Type', 'application/json', + 'Authorization', 'Bearer ' || ( + select decrypted_secret + from vault.decrypted_secrets + where name = 'rc_drift_check_invoke_secret' + ) + ), + body := '{}'::jsonb + ); + $$ +);