Commit bcc9ea3

feat: scraper dedup improvements and failed URL retry

- Add cross-crawl duplicate detection (creates dup records when a URL exists under a different crawl)
- Content-dedup now matches any existing hash, not just `uploaded` status
- Clean up stale `failed-*` records when a WARC retry succeeds
- Track processedUrls after a successful download to prevent same-URL CDX duplicates
- Failed URLs are no longer skipped on re-runs (a different WARC capture might succeed)
- Add deleteDocument and getUrlsForCrawl to DbClient
- Document the dedup model in CLAUDE.md
1 parent 2638a65 · commit bcc9ea3

3 files changed

Lines changed: 92 additions & 19 deletions

CLAUDE.md

Lines changed: 41 additions & 0 deletions
```diff
@@ -29,6 +29,47 @@ Each stage writes to the same PostgreSQL database (`documents` table):
 3. **Embed** (TS) — Google API → pgvector (`embedding`, `embedded_at`)
 4. **Classify** (Python) — ModernBERT → labels (`document_type`, `document_topic`)
 
+## Scraper deduplication
+
+The scraper maintains **exact parity** between CDX URLs and database records: every URL in a crawl's CDX files has exactly one record in the `documents` table under that `crawl_id`.
+
+### Document statuses
+
+- `uploaded` — valid .docx saved to R2; ID is `{contentHash}`
+- `failed` — WARC download failed or content is not valid .docx; ID is `failed-{urlHash}` (download error) or `{contentHash}` (validation error)
+- `duplicate` — same content already exists under a different URL; ID is `dup-{urlHash}`
+
+### ID scheme
+
+IDs are content-addressed for storage mapping (`documents/{id}.docx`):
+
+| Scenario | ID | Reason |
+|---|---|---|
+| Uploaded | `{sha256(content)}` | Maps to R2 storage key |
+| Download failed | `failed-{sha256(url)}` | No content available, so use URL hash |
+| Validation failed | `{sha256(content)}` | Content exists but isn't valid .docx |
+| Content duplicate | `dup-{sha256(url)}` | Content hash would collide with the original |
+
+### Dedup paths
+
+The scraper handles three dedup scenarios, in order:
+
+1. **URL-dedup** (instant, no download) — the URL is already in the `processedUrls` Set (loaded from all crawls at startup). If the URL exists under a different `crawl_id`, the scraper creates a cross-crawl `duplicate` record under the current crawl; if it already exists under the current crawl, it silently skips.
+2. **Content-dedup** (requires WARC download) — the URL is new but its content hash matches an existing document. The scraper creates a `duplicate` record pointing to the original.
+3. **Same-URL retry** (within the same crawl) — the same URL appears multiple times in the CDX files (different WARC captures). After a successful WARC download, the URL is added to `processedUrls` so subsequent entries are skipped. Failed downloads do NOT add to `processedUrls`, allowing a retry from a different WARC capture.
+
+### Stale record cleanup
+
+When a WARC download succeeds, the scraper deletes any previous `failed-{urlHash}` record for that URL. This prevents duplicate records when a URL fails on one attempt but succeeds on a later retry (the failed and successful records have different IDs).
+
+### Re-run safety
+
+Running the scraper on the same crawl again is safe:
+
+- With `--force`: everything is re-downloaded, but `source_url` is checked before creating dup records, so existing records aren't duplicated
+- Without `--force`: all existing URLs are in `processedUrls` and skipped instantly
+
 ## Database
 
 Single `documents` table in PostgreSQL (NeonDB) with pgvector. All pipeline stages write to this table.
```
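
For concreteness, a minimal sketch of how the three ID forms could be derived. It assumes `computeHash` is a SHA-256 hex digest over raw bytes — consistent with how it is called in `scraper.ts` — though the helper itself is not part of this commit:

```ts
// Sketch only — treating computeHash as SHA-256 → hex is an assumption;
// the real helper lives elsewhere in the scraper package.
async function computeHash(data: Uint8Array): Promise<string> {
  const digest = await crypto.subtle.digest("SHA-256", data);
  return Array.from(new Uint8Array(digest), (b) =>
    b.toString(16).padStart(2, "0"),
  ).join("");
}

// Uploaded / validation-failed: content-addressed, doubles as the R2 key
// documents/{id}.docx.
const contentId = (content: Uint8Array) => computeHash(content);

// Download failed: no content to hash, so the URL is hashed instead.
const failedId = async (url: string) =>
  `failed-${await computeHash(new TextEncoder().encode(url))}`;

// Content duplicate: the content hash would collide with the original
// record, so the ID is derived from the URL with a dup- prefix.
const dupId = async (url: string) =>
  `dup-${await computeHash(new TextEncoder().encode(url))}`;
```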

packages/scraper/scraper.ts

Lines changed: 38 additions & 19 deletions
```diff
@@ -29,13 +29,13 @@ interface ProcessContext {
   crawlId: string;
   stats: { saved: number; skipped: number; failed: number };
   rateLimiter: RateLimiter;
-  uploadedUrls: Set<string>;
+  processedUrls: Set<string>;
   force?: boolean;
   onError?: (status: number, url: string, message: string) => void;
 }
 
 async function processRecord(record: CdxRecord, ctx: ProcessContext) {
-  const { db, storage, config, crawlId, stats, rateLimiter, onError } = ctx;
+  const { db, storage, config, crawlId, stats, rateLimiter, processedUrls, onError } = ctx;
 
   // Download from WARC
   let result: WarcResult;
@@ -59,6 +59,13 @@ async function processRecord(record: CdxRecord, ctx: ProcessContext) {
     return;
   }
 
+  // Clean up any previous failed-* record for this URL (from a prior failed attempt)
+  const urlHash = await computeHash(new TextEncoder().encode(record.url));
+  await db.deleteDocument(`failed-${urlHash}`);
+
+  // Mark URL as processed so duplicate CDX entries are skipped
+  processedUrls.add(record.url);
+
   // Validate
   const validation = validateDocx(result.content);
   if (!validation.isValid) {
@@ -82,20 +89,16 @@ async function processRecord(record: CdxRecord, ctx: ProcessContext) {
 
   // Check if already exists by hash (different URL, same file content)
   const existingByHash = await db.getDocument(hash);
-  if (existingByHash && existingByHash.status === "uploaded") {
+  if (existingByHash && existingByHash.source_url !== record.url) {
     stats.skipped++;
-    // Only create duplicate record if it's actually a different URL
-    if (existingByHash.source_url !== record.url) {
-      const urlHash = await computeHash(new TextEncoder().encode(record.url));
-      await db.upsertDocument({
-        id: `dup-${urlHash}`,
-        source_url: record.url,
-        crawl_id: crawlId,
-        original_filename: extractFilename(record.url),
-        status: "duplicate",
-        error_message: `duplicate content of ${hash}`,
-      });
-    }
+    await db.upsertDocument({
+      id: `dup-${urlHash}`,
+      source_url: record.url,
+      crawl_id: crawlId,
+      original_filename: extractFilename(record.url),
+      status: "duplicate",
+      error_message: `duplicate content of ${hash}`,
+    });
     return;
   }
 
@@ -176,17 +179,20 @@ export async function scrape(options: ScrapeOptions) {
   // Initialize database
   const db = await createDb(config.database.url);
 
-  // Pre-load processed URLs for fast duplicate checking (includes uploaded, failed, and duplicate)
+  // Pre-load processed URLs for fast duplicate checking
+  // Failed URLs are excluded so they get retried (different WARC capture might succeed)
   const uploadedUrls = force ? new Set<string>() : await db.getUploadedUrls();
-  const failedUrls = force ? new Set<string>() : await db.getFailedUrls();
   const duplicateUrls = force ? new Set<string>() : await db.getDuplicateUrls();
-  const processedUrls = new Set([...uploadedUrls, ...failedUrls, ...duplicateUrls]);
+  const processedUrls = new Set([...uploadedUrls, ...duplicateUrls]);
 
   // Aggregate stats across all crawls
   const totalStats = { saved: 0, skipped: 0, failed: 0 };
 
   // Process each crawl
   for (const crawlId of resolvedCrawlIds) {
+    // Pre-load URLs already tracked under this crawl for cross-crawl dedup
+    const crawlUrls = force ? new Set<string>() : await db.getUrlsForCrawl(crawlId);
+
     const crawlStartTime = Date.now();
 
     // Per-crawl stats
@@ -264,6 +270,19 @@
       // Check duplicates BEFORE rate limiting (instant skip)
       if (!force && processedUrls.has(record.url)) {
         stats.skipped++;
+        // Create cross-crawl dup record if URL exists in DB but not under this crawl
+        if (!crawlUrls.has(record.url)) {
+          const urlHash = await computeHash(new TextEncoder().encode(record.url));
+          await db.upsertDocument({
+            id: `dup-${urlHash}`,
+            source_url: record.url,
+            crawl_id: crawlId,
+            original_filename: extractFilename(record.url),
+            status: "duplicate",
+            error_message: "cross-crawl duplicate",
+          });
+          crawlUrls.add(record.url);
+        }
         updateProgress();
         return;
       }
@@ -276,7 +295,7 @@
         crawlId,
         stats,
         rateLimiter,
-        uploadedUrls,
+        processedUrls,
         force,
         onError,
       });
```
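
The subtle interaction here: a failed download writes a `failed-{urlHash}` record but deliberately does not add the URL to `processedUrls`, so a later CDX entry for the same URL (a different WARC capture) gets another attempt. On success, the stale failure record must be deleted explicitly, because the successful record lives under a different, content-addressed ID. A simplified sketch of that lifecycle, reusing the assumed `computeHash` helper from above (the URL, crawl id, env var, and bytes are placeholders):

```ts
import { createDb } from "../shared/db"; // path assumed from the repo layout

const db = await createDb(process.env.DATABASE_URL!); // placeholder env var
const url = "https://example.org/minutes.docx"; // placeholder URL
const crawlId = "crawl-2024-05"; // placeholder crawl id
const urlHash = await computeHash(new TextEncoder().encode(url));

// Attempt 1: the WARC capture is broken → record the failure, but do NOT
// add the URL to processedUrls, so a later capture can retry it.
await db.upsertDocument({
  id: `failed-${urlHash}`,
  source_url: url,
  crawl_id: crawlId,
  status: "failed",
  error_message: "WARC download failed",
});

// Attempt 2, from a different WARC capture, succeeds: delete the stale
// failure record, then store under the content-addressed ID.
await db.deleteDocument(`failed-${urlHash}`);
const docxBytes = new Uint8Array(); // placeholder for the downloaded content
await db.upsertDocument({
  id: await computeHash(docxBytes),
  source_url: url,
  crawl_id: crawlId,
  status: "uploaded",
});
```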

packages/shared/db.ts

Lines changed: 13 additions & 0 deletions
```diff
@@ -79,11 +79,13 @@ export interface LLMClassificationData {
 export interface DbClient {
   // Scraping methods (existing)
   upsertDocument(doc: Partial<DocumentRecord> & { id: string }): Promise<void>;
+  deleteDocument(id: string): Promise<void>;
   getDocument(id: string): Promise<DocumentRecord | null>;
   getDocumentByUrl(url: string): Promise<DocumentRecord | null>;
   getUploadedUrls(): Promise<Set<string>>;
   getFailedUrls(): Promise<Set<string>>;
   getDuplicateUrls(): Promise<Set<string>>;
+  getUrlsForCrawl(crawlId: string): Promise<Set<string>>;
   getDocumentsByStatus(status: DocumentStatus, limit?: number): Promise<DocumentRecord[]>;
   getStats(): Promise<{ status: string; count: number }[]>;
   getAllDocuments(limit?: number): Promise<DocumentRecord[]>;
@@ -161,6 +163,10 @@ export async function createDb(databaseUrl: string): Promise<DbClient> {
       }
     },
 
+    async deleteDocument(id: string) {
+      await sql`DELETE FROM documents WHERE id = ${id}`;
+    },
+
     async getDocument(id: string) {
       const rows = await sql<DocumentRecord[]>`
         SELECT * FROM documents WHERE id = ${id}
@@ -196,6 +202,13 @@
       return new Set(rows.map((r) => r.source_url));
     },
 
+    async getUrlsForCrawl(crawlId: string) {
+      const rows = await sql<{ source_url: string }[]>`
+        SELECT source_url FROM documents WHERE crawl_id = ${crawlId}
+      `;
+      return new Set(rows.map((r) => r.source_url));
+    },
+
     async getDocumentsByStatus(status: DocumentStatus, limit = 100) {
       return sql<DocumentRecord[]>`
         SELECT * FROM documents WHERE status = ${status} LIMIT ${limit}
```
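
A short usage sketch for the two new `DbClient` methods, assuming a reachable database (the connection string, crawl id, and record id below are placeholders):

```ts
import { createDb } from "./db";

const db = await createDb(process.env.DATABASE_URL!); // placeholder env var

// URLs already recorded under a given crawl — the scraper uses this set to
// decide whether a known URL still needs a cross-crawl dup- record here.
const crawlUrls = await db.getUrlsForCrawl("crawl-2024-05"); // placeholder id
console.log(`crawl already tracks ${crawlUrls.size} URLs`);

// Delete a record by ID, as the scraper does for stale failed-* records
// once a retry succeeds.
await db.deleteDocument("failed-0123abcd"); // placeholder id
```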
