perf: optimize scraper DB queries

caiopizzol · caiopizzol · commit 8b60a6d1cd27 · 2026-03-12T06:20:10.000-03:00
- Transfer md5 hashes instead of full URLs for processedUrls loading
  (~6s vs ~14s, 70% less data over the wire)
- Batch cross-crawl dup inserts (100 per round trip vs 1)
- Add covering indexes on (status, source_url) and (crawl_id, source_url)
  for index-only scans
- Add upsertDuplicateBatch to DbClient for bulk dup record creation
diff --git a/db/migrations/005_add_covering_indexes.sql b/db/migrations/005_add_covering_indexes.sql
@@ -0,0 +1,10 @@
+-- Covering indexes for common scraper queries
+-- Allows index-only scans (no table lookups) for URL loading
+
+-- Covers: SELECT source_url FROM documents WHERE status IN ('uploaded', 'duplicate')
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_documents_status_url
+  ON documents(status, source_url);
+
+-- Covers: SELECT source_url FROM documents WHERE crawl_id = $1
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_documents_crawl_url
+  ON documents(crawl_id, source_url);
diff --git a/db/schema.sql b/db/schema.sql
@@ -51,6 +51,10 @@ CREATE INDEX IF NOT EXISTS idx_documents_status ON documents(status);
 CREATE INDEX IF NOT EXISTS idx_documents_crawl_id ON documents(crawl_id);
 CREATE INDEX IF NOT EXISTS idx_documents_source_url ON documents(source_url);
 
+-- Covering indexes for scraper URL loading (index-only scans)
+CREATE INDEX IF NOT EXISTS idx_documents_status_url ON documents(status, source_url);
+CREATE INDEX IF NOT EXISTS idx_documents_crawl_url ON documents(crawl_id, source_url);
+
 -- Indexes for extraction/embedding/classification queries
 CREATE INDEX IF NOT EXISTS idx_documents_extracted ON documents(extracted_at) WHERE extracted_at IS NOT NULL;
 CREATE INDEX IF NOT EXISTS idx_documents_embedded ON documents(embedded_at) WHERE embedded_at IS NOT NULL;
diff --git a/packages/scraper/scraper.ts b/packages/scraper/scraper.ts
@@ -22,20 +22,26 @@ import {
 } from "@docx-corpus/shared";
 import { computeHash, extractFilename, validateDocx } from "./validation";
 
+function md5Url(url: string): string {
+  const hasher = new Bun.CryptoHasher("md5");
+  hasher.update(url);
+  return hasher.digest("hex");
+}
+
 interface ProcessContext {
   db: Awaited<ReturnType<typeof createDb>>;
   storage: Storage;
   config: Config;
   crawlId: string;
   stats: { saved: number; skipped: number; failed: number };
   rateLimiter: RateLimiter;
-  processedUrls: Set<string>;
+  processedHashes: Set<string>;
   force?: boolean;
   onError?: (status: number, url: string, message: string) => void;
 }
 
 async function processRecord(record: CdxRecord, ctx: ProcessContext) {
-  const { db, storage, config, crawlId, stats, rateLimiter, processedUrls, onError } = ctx;
+  const { db, storage, config, crawlId, stats, rateLimiter, processedHashes, onError } = ctx;
 
   // Download from WARC
   let result: WarcResult;
@@ -65,7 +71,7 @@ async function processRecord(record: CdxRecord, ctx: ProcessContext) {
   await db.deleteDocument(`failed-${urlHash}`, crawlId);
 
   // Mark URL as processed so duplicate CDX entries are skipped
-  processedUrls.add(record.url);
+  processedHashes.add(md5Url(record.url));
 
   // Validate
   const validation = validateDocx(result.content);
@@ -185,15 +191,12 @@ export async function scrape(options: ScrapeOptions) {
   // Initialize database
   const db = await createDb(config.database.url);
 
-  // Pre-load processed URLs for fast duplicate checking
+  // Pre-load processed URL hashes for fast duplicate checking
   // Failed URLs are excluded so they get retried (different WARC capture might succeed)
+  // Uses md5 hashes instead of full URLs to reduce network transfer (~6s vs ~14s)
   section("Loading");
-  const uploadedUrls = force ? new Set<string>() : await db.getUploadedUrls();
-  keyValue("Uploaded", `${uploadedUrls.size} URLs`);
-  const duplicateUrls = force ? new Set<string>() : await db.getDuplicateUrls();
-  keyValue("Duplicate", `${duplicateUrls.size} URLs`);
-  const processedUrls = new Set([...uploadedUrls, ...duplicateUrls]);
-  keyValue("Total", `${processedUrls.size} known URLs`);
+  const processedHashes = force ? new Set<string>() : await db.getProcessedUrlHashes();
+  keyValue("Processed", `${processedHashes.size} hashes (uploaded + duplicate)`);
 
   // Aggregate stats across all crawls
   const totalStats = { saved: 0, skipped: 0, failed: 0 };
@@ -272,25 +275,39 @@ export async function scrape(options: ScrapeOptions) {
 
     const tasks: Set<Promise<void>> = new Set();
 
+    // Batch cross-crawl dup inserts to avoid per-record DB round trips
+    type DupRecord = { id: string; sourceUrl: string; crawlId: string; filename: string };
+    const pendingDups: DupRecord[] = [];
+    const DUP_BATCH_SIZE = 100;
+
+    const flushDups = async () => {
+      if (pendingDups.length === 0) return;
+      const batch = pendingDups.splice(0);
+      await db.upsertDuplicateBatch(batch);
+    };
+
     try {
       for await (const record of streamCdxFromR2(config, crawlId)) {
         stats.discovered++;
 
         // Fast skip: URL already processed (outside downloadLimit for instant throughput)
-        if (!force && processedUrls.has(record.url)) {
+        if (!force && processedHashes.has(md5Url(record.url))) {
           stats.skipped++;
-          // Create cross-crawl dup record if URL exists in DB but not under this crawl
+          // Queue cross-crawl dup record if URL exists in DB but not under this crawl
           if (!crawlUrls.has(record.url)) {
-            const crawlScopedHash = await computeHash(new TextEncoder().encode(record.url + crawlId));
-            await db.upsertDocument({
+            const hasher = new Bun.CryptoHasher("sha256");
+            hasher.update(record.url + crawlId);
+            const crawlScopedHash = hasher.digest("hex");
+            pendingDups.push({
               id: `dup-${crawlScopedHash}`,
-              source_url: record.url,
-              crawl_id: crawlId,
-              original_filename: extractFilename(record.url),
-              status: "duplicate",
-              error_message: "cross-crawl duplicate",
+              sourceUrl: record.url,
+              crawlId,
+              filename: extractFilename(record.url),
             });
             crawlUrls.add(record.url);
+            if (pendingDups.length >= DUP_BATCH_SIZE) {
+              await flushDups();
+            }
           }
           updateProgress();
           continue;
@@ -308,7 +325,7 @@ export async function scrape(options: ScrapeOptions) {
               crawlId,
               stats,
               rateLimiter,
-              processedUrls,
+              processedHashes,
               force,
               onError,
             });
@@ -334,6 +351,8 @@ export async function scrape(options: ScrapeOptions) {
       prevLineCount = 0;
     }
 
+    // Flush remaining cross-crawl dup records
+    await flushDups();
     await Promise.all(tasks);
     clearLines(prevLineCount);
 
diff --git a/packages/shared/db.ts b/packages/shared/db.ts
@@ -82,6 +82,8 @@ export interface DbClient {
   deleteDocument(id: string, crawlId?: string): Promise<void>;
   getDocument(id: string): Promise<DocumentRecord | null>;
   getDocumentByUrl(url: string): Promise<DocumentRecord | null>;
+  getProcessedUrlHashes(): Promise<Set<string>>;
+  upsertDuplicateBatch(records: { id: string; sourceUrl: string; crawlId: string; filename: string }[]): Promise<void>;
   getUploadedUrls(): Promise<Set<string>>;
   getFailedUrls(): Promise<Set<string>>;
   getDuplicateUrls(): Promise<Set<string>>;
@@ -185,6 +187,30 @@ export async function createDb(databaseUrl: string): Promise<DbClient> {
       return rows[0] || null;
     },
 
+    async getProcessedUrlHashes() {
+      const rows = await sql<{ h: string }[]>`
+        SELECT md5(source_url) as h FROM documents WHERE status IN ('uploaded', 'duplicate')
+      `;
+      return new Set(rows.map((r) => r.h));
+    },
+
+    async upsertDuplicateBatch(records) {
+      if (records.length === 0) return;
+      const values = records.map((_, i) => {
+        const b = i * 6;
+        return `($${b + 1}, $${b + 2}, $${b + 3}, $${b + 4}, $${b + 5}, $${b + 6})`;
+      }).join(", ");
+      const params = records.flatMap((r) => [
+        r.id, r.sourceUrl, r.crawlId, r.filename, "duplicate", "cross-crawl duplicate",
+      ]);
+      await sql.unsafe(
+        `INSERT INTO documents (id, source_url, crawl_id, original_filename, status, error_message)
+         VALUES ${values}
+         ON CONFLICT (id) DO NOTHING`,
+        params
+      );
+    },
+
     async getUploadedUrls() {
       const rows = await sql<{ source_url: string }[]>`
         SELECT source_url FROM documents WHERE status = 'uploaded'