From 180f42ae39f44a5f0b2e932a5ec2557719640a26 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 22 Mar 2026 23:16:48 -0400 Subject: [PATCH 1/3] Create DATA.md data storage guide --- DATA.md | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 DATA.md diff --git a/DATA.md b/DATA.md new file mode 100644 index 0000000..0af3974 --- /dev/null +++ b/DATA.md @@ -0,0 +1,194 @@ +# Data Storage Guide + +## Project purpose + +This app does two things: + +1. Collects waitlist emails for Loop. +2. Optionally scans a signed-in Cornell user's Gmail metadata to collect unique sender email addresses (not message bodies) for listserv landscape analysis. + +## Storage locations + +### 1) Cloudflare D1 (primary persistent storage) + +- Database binding: `loop_extract_emails_prod` +- Database name: `loop-extract-emails-prod` +- Config file: `wrangler.jsonc` + +All server-side persistent data is stored in this D1 database. + +### 2) Browser localStorage (client-side job resume state) + +- Key: `loop_extract_emails_active_job` +- Value shape: `{ "jobId": string, "jobKey": string }` +- Used only to resume progress polling after refresh/navigation for an active extraction job. + +## Cloudflare account context + +This project is managed in a shared Cornell DTI Cloudflare account associated with `Dtiincubator@gmail.com`. To get access, create your own Cloudflare account; it can then be added as a member of this shared account. + +## Database tables and schemas + +## `waitlist_emails` + +Created by migration: `migrations/0002_init_waitlist_table.sql` + +Columns: + +- `id INTEGER PRIMARY KEY` +- `email TEXT NOT NULL UNIQUE` +- `created_at DATETIME DEFAULT CURRENT_TIMESTAMP` + +What it stores: + +- Waitlist signup email addresses. +- One row per unique email (duplicates ignored via `INSERT OR IGNORE`). 
+ +Write path: + +- `src/routes/emails.remote.ts` -> `storeWaitlistEmail` + +## `emails` + +Created by migration: `migrations/0001_init_emails_schema.sql` + +Columns: + +- `id INTEGER PRIMARY KEY` +- `email TEXT NOT NULL UNIQUE` + +What it stores: + +- Canonical unique sender email addresses extracted from Gmail `From` headers. +- Global dedupe table across users. + +Write path: + +- `src/lib/server/extraction-jobs.ts` -> `storeEmailsForUser` +- SQL: `INSERT OR IGNORE INTO emails (email) VALUES (?)` + +## `email_submissions` + +Created by migration: `migrations/0001_init_emails_schema.sql` + +Columns: + +- `email_id INTEGER NOT NULL` +- `user_hash TEXT NOT NULL` +- `PRIMARY KEY (email_id, user_hash)` +- `FOREIGN KEY (email_id) REFERENCES emails(id) ON DELETE CASCADE` + +Index: + +- `idx_email_submissions_user_hash ON email_submissions(user_hash)` + +What it stores: + +- Many-to-many link between a unique sender email (`email_id`) and a specific user mailbox hash (`user_hash`). +- This represents "this hashed user has received mail from this sender". + +Write path: + +- `src/lib/server/extraction-jobs.ts` -> `storeEmailsForUser` +- SQL: `INSERT OR IGNORE INTO email_submissions (email_id, user_hash) SELECT id, ? 
FROM emails WHERE email = ?` + +## `extraction_jobs` + +Created at runtime by application code (not migration): + +- `src/lib/server/extraction-jobs.ts` -> `ensureExtractionTables` + +Columns: + +- `id TEXT PRIMARY KEY` +- `job_key TEXT` +- `user_email TEXT NOT NULL` +- `user_hash TEXT NOT NULL` +- `access_token TEXT` +- `next_page_token TEXT` +- `status TEXT NOT NULL` (`pending` | `running` | `completed` | `failed`) +- `scanned_messages INTEGER NOT NULL DEFAULT 0` +- `estimated_total_messages INTEGER` +- `unique_senders INTEGER NOT NULL DEFAULT 0` +- `last_error TEXT` +- `created_at TEXT NOT NULL` +- `updated_at TEXT NOT NULL` + +Index: + +- `extraction_jobs_status_idx ON extraction_jobs(status, updated_at)` + +What it stores: + +- Progress and control state for background inbox extraction jobs. +- Temporary OAuth access token used while extraction runs. +- Cursor (`next_page_token`) for pagination across Gmail messages. +- User mailbox identity in plaintext (`user_email`) and hashed (`user_hash`). + +Lifecycle details: + +- On job creation: row inserted with `status='pending'`, token set, and estimated Gmail message count if available. +- During processing: status becomes `running`, scanned counts and unique sender counts are updated. +- On completion: status set to `completed`, `access_token` set to `NULL`. +- On failure: status set to `failed`, `access_token` and `job_key` set to `NULL`, `last_error` populated. + +## End-to-end data flow + +## A) Waitlist submission flow + +1. User enters email on the landing page. +2. Client calls remote command `storeWaitlistEmail`. +3. Server executes `INSERT OR IGNORE` into `waitlist_emails`. + +Stored data from this flow: + +- Waitlist email address (+ DB timestamp). + +## B) Gmail extraction flow + +1. User clicks sign-in; Google OAuth returns an access token (`gmail.readonly` scope). +2. Client calls remote command `startEmailExtraction` with that token. +3. 
Server: + - Fetches Gmail profile to determine `user_email` and total message estimate. + - Computes `user_hash = SHA-256(user_email + USER_HASH_SALT)`. + - Inserts an `extraction_jobs` row with the job ID (`id`), job key (`job_key`), and access token. +4. Background endpoint `/api/extraction/process` processes one page step at a time. +5. For each Gmail page: + - Retrieves message IDs. + - Batch-fetches message metadata headers (`From` only). + - Extracts normalized sender emails via regex. + - Filters out the mailbox owner's own email. + - Inserts into `emails` and `email_submissions` with `INSERT OR IGNORE` (existing rows are left unchanged). +6. Job status is polled by the client until `completed` or `failed`. +7. On completion/failure, the OAuth access token is cleared server-side. + +Stored data from this flow: + +- Sender email addresses (unique global set in `emails`). +- Sender-to-user-hash relationships in `email_submissions`. +- Job metadata/progress/errors in `extraction_jobs`. +- Local browser resume token pair (`jobId`, `jobKey`) in localStorage during active jobs. + +## Environment variables and secrets + +Expected runtime env vars (Cloudflare Worker environment): + +- `PUBLIC_GOOGLE_CLIENT_ID`: public OAuth client ID used by frontend sign-in. +- `GOOGLE_CLIENT_SECRET`: present in worker types; not directly referenced in current app code paths. +- `USER_HASH_SALT`: server-side salt used when hashing user emails. + +Important notes: + +- Do not commit real secret values in repository files. +- `USER_HASH_SALT` should remain secret and stable for consistent hashing. + +## Privacy notes + +- `user_email` is stored in plaintext in `extraction_jobs`. +- OAuth `access_token` is stored temporarily in `extraction_jobs` while processing and cleared on terminal states. +- `jobId` + `jobKey` gate read/process access to job status and processing steps. + +## Operational notes + +- New environments should apply migrations before first use. 
+- `wrangler.jsonc` currently points to a remote D1 DB, so local development can affect production-linked data unless bindings are changed. From b5c26ae56d3f0ada4d23661950ec76599407d639 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 22 Mar 2026 23:20:33 -0400 Subject: [PATCH 2/3] Create extraction_jobs_table with migration instead of runtime code --- DATA.md | 4 +-- .../0003_init_extraction_jobs_table.sql | 19 ++++++++++++ src/lib/server/extraction-jobs.ts | 30 ------------------- src/routes/api/extraction/process/+server.ts | 7 +---- src/routes/emails.remote.ts | 3 -- 5 files changed, 21 insertions(+), 42 deletions(-) create mode 100644 migrations/0003_init_extraction_jobs_table.sql diff --git a/DATA.md b/DATA.md index 0af3974..f442cd9 100644 --- a/DATA.md +++ b/DATA.md @@ -94,9 +94,7 @@ Write path: ## `extraction_jobs` -Created at runtime by application code (not migration): - -- `src/lib/server/extraction-jobs.ts` -> `ensureExtractionTables` +Created by migration: `migrations/0003_init_extraction_jobs_table.sql` Columns: diff --git a/migrations/0003_init_extraction_jobs_table.sql b/migrations/0003_init_extraction_jobs_table.sql new file mode 100644 index 0000000..57579f4 --- /dev/null +++ b/migrations/0003_init_extraction_jobs_table.sql @@ -0,0 +1,19 @@ +-- Migration number: 0003 2026-03-22T00:00:00.000Z +CREATE TABLE IF NOT EXISTS extraction_jobs ( + id TEXT PRIMARY KEY, + job_key TEXT, + user_email TEXT NOT NULL, + user_hash TEXT NOT NULL, + access_token TEXT, + next_page_token TEXT, + status TEXT NOT NULL, + scanned_messages INTEGER NOT NULL DEFAULT 0, + estimated_total_messages INTEGER, + unique_senders INTEGER NOT NULL DEFAULT 0, + last_error TEXT, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL +); + +CREATE INDEX IF NOT EXISTS extraction_jobs_status_idx + ON extraction_jobs(status, updated_at); diff --git a/src/lib/server/extraction-jobs.ts b/src/lib/server/extraction-jobs.ts index f4b976c..11e4f2a 100644 --- a/src/lib/server/extraction-jobs.ts +++ 
b/src/lib/server/extraction-jobs.ts @@ -66,36 +66,6 @@ const RETRYABLE_GMAIL_403_REASONS = new Set([ 'backenderror' ]); -export async function ensureExtractionTables(db: D1Database): Promise<void> { - await db - .prepare( - ` - CREATE TABLE IF NOT EXISTS extraction_jobs ( - id TEXT PRIMARY KEY, - job_key TEXT, - user_email TEXT NOT NULL, - user_hash TEXT NOT NULL, - access_token TEXT, - next_page_token TEXT, - status TEXT NOT NULL, - scanned_messages INTEGER NOT NULL DEFAULT 0, - estimated_total_messages INTEGER, - unique_senders INTEGER NOT NULL DEFAULT 0, - last_error TEXT, - created_at TEXT NOT NULL, - updated_at TEXT NOT NULL - ) - ` - ) - .run(); - - await db - .prepare( - 'CREATE INDEX IF NOT EXISTS extraction_jobs_status_idx ON extraction_jobs(status, updated_at)' - ) - .run(); -} - export async function createExtractionJob(params: { db: D1Database; salt: string; diff --git a/src/routes/api/extraction/process/+server.ts b/src/routes/api/extraction/process/+server.ts index d8a13cd..46de36a 100644 --- a/src/routes/api/extraction/process/+server.ts +++ b/src/routes/api/extraction/process/+server.ts @@ -1,10 +1,6 @@ import { json } from '@sveltejs/kit'; import type { RequestHandler } from './$types'; -import { - ensureExtractionTables, - processExtractionJobStep, - scheduleExtractionStep -} from '$lib/server/extraction-jobs'; +import { processExtractionJobStep, scheduleExtractionStep } from '$lib/server/extraction-jobs'; export const POST: RequestHandler = async (event) => { const db = event.platform?.env.loop_extract_emails_prod; @@ -26,7 +22,6 @@ export const POST: RequestHandler = async (event) => { return json({ ok: false, error: 'jobId and jobKey are required' }, { status: 400 }); } - await ensureExtractionTables(db); const { done } = await processExtractionJobStep({ db, jobId, jobKey }); if (!done) { diff --git a/src/routes/emails.remote.ts b/src/routes/emails.remote.ts index d224d1f..7409c06 100644 --- a/src/routes/emails.remote.ts +++ 
b/src/routes/emails.remote.ts @@ -2,7 +2,6 @@ import * as v from 'valibot'; import { command, getRequestEvent } from '$app/server'; import { createExtractionJob, - ensureExtractionTables, getExtractionStatus, scheduleExtractionStep } from '$lib/server/extraction-jobs'; @@ -17,7 +16,6 @@ export const startEmailExtraction = command( if (!db) throw new Error('D1 binding not configured'); if (!salt) throw new Error('USER_HASH_SALT not configured'); - await ensureExtractionTables(db); const { jobId, jobKey } = await createExtractionJob({ db, salt, accessToken }); scheduleExtractionStep(event, { jobId, jobKey }); @@ -33,7 +31,6 @@ export const getEmailExtractionStatus = command( const db = event.platform?.env.loop_extract_emails_prod; if (!db) throw new Error('D1 binding not configured'); - await ensureExtractionTables(db); return getExtractionStatus(db, jobId, jobKey); } ); From 38e63c42dfd04c283ff29f12897d1f01e1406bf2 Mon Sep 17 00:00:00 2001 From: ben Date: Wed, 8 Apr 2026 22:14:00 -0400 Subject: [PATCH 3/3] Mark account on worker connection --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2e33c02..6dace13 100644 --- a/README.md +++ b/README.md @@ -27,4 +27,4 @@ pnpm build You can preview the production build with `pnpm preview`. -> For deployment, a [Cloudflare Worker](https://workers.cloudflare.com/) is configured to automatically track the `main` branch. +> For deployment, a [Cloudflare Worker](https://workers.cloudflare.com/) is configured to automatically track the `main` branch. This worker is deployed under the Cloudflare account associated with `Dtiincubator@gmail.com`.