From 180f42ae39f44a5f0b2e932a5ec2557719640a26 Mon Sep 17 00:00:00 2001
From: ben <koppe.development@gmail.com>
Date: Sun, 22 Mar 2026 23:16:48 -0400
Subject: [PATCH 1/3] Create DATA.md data storage guide

---
 DATA.md | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 194 insertions(+)
 create mode 100644 DATA.md

diff --git a/DATA.md b/DATA.md
new file mode 100644
index 0000000..0af3974
--- /dev/null
+++ b/DATA.md
@@ -0,0 +1,194 @@
+# Data Storage Guide
+
+## Project purpose
+
+This app does two things:
+
+1. Collects waitlist emails for Loop.
+2. Optionally scans a signed-in Cornell user's Gmail metadata to collect unique sender email addresses (not message bodies) for listserv landscape analysis.
+
+## Storage locations
+
+### 1) Cloudflare D1 (primary persistent storage)
+
+- Database binding: `loop_extract_emails_prod`
+- Database name: `loop-extract-emails-prod`
+- Config file: `wrangler.jsonc`
+
+All server-side persistent data is stored in this D1 database.
+
+### 2) Browser localStorage (client-side job resume state)
+
+- Key: `loop_extract_emails_active_job`
+- Value shape: `{ "jobId": string, "jobKey": string }`
+- Used only to resume progress polling after refresh/navigation for an active extraction job.
+
+## Cloudflare account context
+
+This project is managed in a shared Cornell DTI Cloudflare account associated with `Dtiincubator@gmail.com`. To get access, create your own Cloudflare account and ask to be added to the shared account as a member.
+
+## Database tables and schemas
+
+## `waitlist_emails`
+
+Created by migration: `migrations/0002_init_waitlist_table.sql`
+
+Columns:
+
+- `id INTEGER PRIMARY KEY`
+- `email TEXT NOT NULL UNIQUE`
+- `created_at DATETIME DEFAULT CURRENT_TIMESTAMP`
+
+What it stores:
+
+- Waitlist signup email addresses.
+- One row per unique email (duplicates ignored via `INSERT OR IGNORE`).
+
+Write path:
+
+- `src/routes/emails.remote.ts` -> `storeWaitlistEmail`
+
+## `emails`
+
+Created by migration: `migrations/0001_init_emails_schema.sql`
+
+Columns:
+
+- `id INTEGER PRIMARY KEY`
+- `email TEXT NOT NULL UNIQUE`
+
+What it stores:
+
+- Canonical unique sender email addresses extracted from Gmail `From` headers.
+- Global dedupe table across users.
+
+Write path:
+
+- `src/lib/server/extraction-jobs.ts` -> `storeEmailsForUser`
+- SQL: `INSERT OR IGNORE INTO emails (email) VALUES (?)`
+
+## `email_submissions`
+
+Created by migration: `migrations/0001_init_emails_schema.sql`
+
+Columns:
+
+- `email_id INTEGER NOT NULL`
+- `user_hash TEXT NOT NULL`
+- `PRIMARY KEY (email_id, user_hash)`
+- `FOREIGN KEY (email_id) REFERENCES emails(id) ON DELETE CASCADE`
+
+Index:
+
+- `idx_email_submissions_user_hash ON email_submissions(user_hash)`
+
+What it stores:
+
+- Many-to-many link between a unique sender email (`email_id`) and a specific user mailbox hash (`user_hash`).
+- This represents "this hashed user has received mail from this sender".
+
+Write path:
+
+- `src/lib/server/extraction-jobs.ts` -> `storeEmailsForUser`
+- SQL: `INSERT OR IGNORE INTO email_submissions (email_id, user_hash) SELECT id, ? FROM emails WHERE email = ?`
+
+## `extraction_jobs`
+
+Created at runtime by application code (not migration):
+
+- `src/lib/server/extraction-jobs.ts` -> `ensureExtractionTables`
+
+Columns:
+
+- `id TEXT PRIMARY KEY`
+- `job_key TEXT`
+- `user_email TEXT NOT NULL`
+- `user_hash TEXT NOT NULL`
+- `access_token TEXT`
+- `next_page_token TEXT`
+- `status TEXT NOT NULL` (`pending` | `running` | `completed` | `failed`)
+- `scanned_messages INTEGER NOT NULL DEFAULT 0`
+- `estimated_total_messages INTEGER`
+- `unique_senders INTEGER NOT NULL DEFAULT 0`
+- `last_error TEXT`
+- `created_at TEXT NOT NULL`
+- `updated_at TEXT NOT NULL`
+
+Index:
+
+- `extraction_jobs_status_idx ON extraction_jobs(status, updated_at)`
+
+What it stores:
+
+- Progress and control state for background inbox extraction jobs.
+- Temporary OAuth access token used while extraction runs.
+- Cursor (`next_page_token`) for pagination across Gmail messages.
+- User mailbox identity in plaintext (`user_email`) and hashed (`user_hash`).
+
+Lifecycle details:
+
+- On job creation: row inserted with `status='pending'`, token set, and estimated Gmail message count if available.
+- During processing: status becomes `running`, scanned counts and unique sender counts are updated.
+- On completion: status set to `completed`, `access_token` set to `NULL`.
+- On failure: status set to `failed`, `access_token` and `job_key` set to `NULL`, `last_error` populated.
+
+## End-to-end data flow
+
+## A) Waitlist submission flow
+
+1. User enters email on the landing page.
+2. Client calls remote command `storeWaitlistEmail`.
+3. Server executes `INSERT OR IGNORE` into `waitlist_emails`.
+
+Stored data from this flow:
+
+- Waitlist email address (+ DB timestamp).
+
+## B) Gmail extraction flow
+
+1. User clicks sign-in; Google OAuth returns an access token (`gmail.readonly` scope).
+2. Client calls remote command `startEmailExtraction` with that token.
+3. Server:
+   - Fetches Gmail profile to determine `user_email` and total message estimate.
+   - Computes `user_hash = SHA-256(user_email + USER_HASH_SALT)`.
+   - Inserts an `extraction_jobs` row with job IDs and token.
+4. Background endpoint `/api/extraction/process` processes one page step at a time.
+5. For each Gmail page:
+   - Retrieves message IDs.
+   - Batch-fetches message metadata headers (`From` only).
+   - Extracts normalized sender emails via regex.
+   - Filters out the mailbox owner's own email.
+   - Inserts into `emails` and `email_submissions` with `INSERT OR IGNORE` (rows that already exist are left unchanged).
+6. Job status is polled by the client until `completed` or `failed`.
+7. On completion/failure, token is cleared server-side.
+
+Stored data from this flow:
+
+- Sender email addresses (unique global set in `emails`).
+- Sender-to-user-hash relationships in `email_submissions`.
+- Job metadata/progress/errors in `extraction_jobs`.
+- Local browser resume token pair (`jobId`, `jobKey`) in localStorage during active jobs.
+
+## Environment variables and secrets
+
+Expected runtime env vars (Cloudflare Worker environment):
+
+- `PUBLIC_GOOGLE_CLIENT_ID`: public OAuth client ID used by frontend sign-in.
+- `GOOGLE_CLIENT_SECRET`: present in worker types; not directly referenced in current app code paths.
+- `USER_HASH_SALT`: server-side salt used when hashing user emails.
+
+Important notes:
+
+- Do not commit real secret values in repository files.
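+- `USER_HASH_SALT` must remain secret and must not change: `user_hash` is derived from it, so a different salt would give the same mailbox a new hash that no longer matches its existing `email_submissions` rows.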
+
+## Privacy notes
+
+- `user_email` is stored in plaintext in `extraction_jobs`.
+- OAuth `access_token` is stored temporarily in `extraction_jobs` while processing and cleared on terminal states.
+- `jobId` + `jobKey` gate read/process access to job status and processing steps.
+
+## Operational notes
+
+- Before first use, a new environment must have every migration in `migrations/` applied to its D1 database (`wrangler d1 migrations apply <database-name>`).
+- `wrangler.jsonc` currently points to the remote `loop-extract-emails-prod` D1 database, so local development can read and modify production data unless the binding is changed.

From b5c26ae56d3f0ada4d23661950ec76599407d639 Mon Sep 17 00:00:00 2001
From: ben <koppe.development@gmail.com>
Date: Sun, 22 Mar 2026 23:20:33 -0400
Subject: [PATCH 2/3] Create extraction_jobs table with migration instead of
 runtime code

---
 DATA.md                                       |  4 +--
 .../0003_init_extraction_jobs_table.sql       | 19 ++++++++++++
 src/lib/server/extraction-jobs.ts             | 30 -------------------
 src/routes/api/extraction/process/+server.ts  |  7 +----
 src/routes/emails.remote.ts                   |  3 --
 5 files changed, 21 insertions(+), 42 deletions(-)
 create mode 100644 migrations/0003_init_extraction_jobs_table.sql

diff --git a/DATA.md b/DATA.md
index 0af3974..f442cd9 100644
--- a/DATA.md
+++ b/DATA.md
@@ -94,9 +94,7 @@ Write path:
 
 ## `extraction_jobs`
 
-Created at runtime by application code (not migration):
-
-- `src/lib/server/extraction-jobs.ts` -> `ensureExtractionTables`
+Created by migration: `migrations/0003_init_extraction_jobs_table.sql`
 
 Columns:
 
diff --git a/migrations/0003_init_extraction_jobs_table.sql b/migrations/0003_init_extraction_jobs_table.sql
new file mode 100644
index 0000000..57579f4
--- /dev/null
+++ b/migrations/0003_init_extraction_jobs_table.sql
@@ -0,0 +1,19 @@
+-- Migration number: 0003 	 2026-03-22T00:00:00.000Z
+CREATE TABLE IF NOT EXISTS extraction_jobs (
+  id TEXT PRIMARY KEY,
+  job_key TEXT,
+  user_email TEXT NOT NULL,
+  user_hash TEXT NOT NULL,
+  access_token TEXT,
+  next_page_token TEXT,
+  status TEXT NOT NULL,
+  scanned_messages INTEGER NOT NULL DEFAULT 0,
+  estimated_total_messages INTEGER,
+  unique_senders INTEGER NOT NULL DEFAULT 0,
+  last_error TEXT,
+  created_at TEXT NOT NULL,
+  updated_at TEXT NOT NULL
+);
+
+CREATE INDEX IF NOT EXISTS extraction_jobs_status_idx
+  ON extraction_jobs(status, updated_at);
diff --git a/src/lib/server/extraction-jobs.ts b/src/lib/server/extraction-jobs.ts
index f4b976c..11e4f2a 100644
--- a/src/lib/server/extraction-jobs.ts
+++ b/src/lib/server/extraction-jobs.ts
@@ -66,36 +66,6 @@ const RETRYABLE_GMAIL_403_REASONS = new Set([
 	'backenderror'
 ]);
 
-export async function ensureExtractionTables(db: D1Database): Promise<void> {
-	await db
-		.prepare(
-			`
-            CREATE TABLE IF NOT EXISTS extraction_jobs (
-                id TEXT PRIMARY KEY,
-                job_key TEXT,
-                user_email TEXT NOT NULL,
-                user_hash TEXT NOT NULL,
-                access_token TEXT,
-                next_page_token TEXT,
-                status TEXT NOT NULL,
-                scanned_messages INTEGER NOT NULL DEFAULT 0,
-                estimated_total_messages INTEGER,
-                unique_senders INTEGER NOT NULL DEFAULT 0,
-                last_error TEXT,
-                created_at TEXT NOT NULL,
-                updated_at TEXT NOT NULL
-            )
-            `
-		)
-		.run();
-
-	await db
-		.prepare(
-			'CREATE INDEX IF NOT EXISTS extraction_jobs_status_idx ON extraction_jobs(status, updated_at)'
-		)
-		.run();
-}
-
 export async function createExtractionJob(params: {
 	db: D1Database;
 	salt: string;
diff --git a/src/routes/api/extraction/process/+server.ts b/src/routes/api/extraction/process/+server.ts
index d8a13cd..46de36a 100644
--- a/src/routes/api/extraction/process/+server.ts
+++ b/src/routes/api/extraction/process/+server.ts
@@ -1,10 +1,6 @@
 import { json } from '@sveltejs/kit';
 import type { RequestHandler } from './$types';
-import {
-	ensureExtractionTables,
-	processExtractionJobStep,
-	scheduleExtractionStep
-} from '$lib/server/extraction-jobs';
+import { processExtractionJobStep, scheduleExtractionStep } from '$lib/server/extraction-jobs';
 
 export const POST: RequestHandler = async (event) => {
 	const db = event.platform?.env.loop_extract_emails_prod;
@@ -26,7 +22,6 @@ export const POST: RequestHandler = async (event) => {
 		return json({ ok: false, error: 'jobId and jobKey are required' }, { status: 400 });
 	}
 
-	await ensureExtractionTables(db);
 	const { done } = await processExtractionJobStep({ db, jobId, jobKey });
 
 	if (!done) {
diff --git a/src/routes/emails.remote.ts b/src/routes/emails.remote.ts
index d224d1f..7409c06 100644
--- a/src/routes/emails.remote.ts
+++ b/src/routes/emails.remote.ts
@@ -2,7 +2,6 @@ import * as v from 'valibot';
 import { command, getRequestEvent } from '$app/server';
 import {
 	createExtractionJob,
-	ensureExtractionTables,
 	getExtractionStatus,
 	scheduleExtractionStep
 } from '$lib/server/extraction-jobs';
@@ -17,7 +16,6 @@ export const startEmailExtraction = command(
 		if (!db) throw new Error('D1 binding not configured');
 		if (!salt) throw new Error('USER_HASH_SALT not configured');
 
-		await ensureExtractionTables(db);
 		const { jobId, jobKey } = await createExtractionJob({ db, salt, accessToken });
 
 		scheduleExtractionStep(event, { jobId, jobKey });
@@ -33,7 +31,6 @@ export const getEmailExtractionStatus = command(
 		const db = event.platform?.env.loop_extract_emails_prod;
 		if (!db) throw new Error('D1 binding not configured');
 
-		await ensureExtractionTables(db);
 		return getExtractionStatus(db, jobId, jobKey);
 	}
 );

From 38e63c42dfd04c283ff29f12897d1f01e1406bf2 Mon Sep 17 00:00:00 2001
From: ben <koppe.development@gmail.com>
Date: Wed, 8 Apr 2026 22:14:00 -0400
Subject: [PATCH 3/3] Mark account on worker connection

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2e33c02..6dace13 100644
--- a/README.md
+++ b/README.md
@@ -27,4 +27,4 @@ pnpm build
 
 You can preview the production build with `pnpm preview`.
 
-> For deployment, a [Cloudflare Worker](https://workers.cloudflare.com/) is configured to automatically track the `main` branch.
+> For deployment, a [Cloudflare Worker](https://workers.cloudflare.com/) is configured to automatically track the `main` branch. This Worker belongs to the shared Cloudflare account associated with `Dtiincubator@gmail.com`.