
Commit e65ed85

feat: add --retry-failed flag, skip failed URLs by default
Default runs now skip all known URLs (uploaded + duplicate + failed) for fast scans. Use --retry-failed to re-download only failures, or --force to re-process everything from scratch.
1 parent 4061de6 commit e65ed85

5 files changed: 34 additions & 17 deletions
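In practice the three modes map onto the CLI like this (a sketch against the `runScrape(args)` entry point shown in `apps/cli/commands/scrape.ts` below; the import path and top-level await context are assumptions):

```ts
import { runScrape } from "./apps/cli/commands/scrape"; // path is an assumption

// Default: every known URL (uploaded + duplicate + failed) is skipped instantly
await runScrape([]);

// Re-download only the URLs that previously failed
await runScrape(["--retry-failed"]);

// Re-process everything from scratch
await runScrape(["--force"]);
```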

CLAUDE.md

Lines changed: 5 additions & 4 deletions
```diff
@@ -54,11 +54,11 @@ IDs are content-addressed for storage mapping (`documents/{id}.docx`):

 The scraper handles three dedup scenarios in order:

-1. **URL-dedup** (instant, no download) — URL already in `processedUrls` Set (loaded from all crawls at startup). If the URL exists under a different `crawl_id`, creates a cross-crawl `duplicate` record under the current crawl. If already under the current crawl, silently skips.
+1. **URL-dedup** (instant, no download) — URL already in `processedHashes` Set (md5 hashes loaded from all crawls at startup). Includes uploaded, duplicate, AND failed URLs by default. If the URL exists under a different `crawl_id`, creates a cross-crawl `duplicate` record under the current crawl. If already under the current crawl, silently skips.

 2. **Content-dedup** (requires WARC download) — URL is new but content hash matches an existing document. Creates a `duplicate` record pointing to the original.

-3. **Same-URL retry** (within same crawl) — Same URL appears multiple times in CDX files (different WARC captures). After a successful WARC download, the URL is added to `processedUrls` so subsequent entries are skipped. Failed downloads do NOT add to `processedUrls`, allowing retry from a different WARC capture.
+3. **Same-URL retry** (within same crawl) — Same URL appears multiple times in CDX files (different WARC captures). After a successful WARC download, the URL is added to `processedHashes` so subsequent entries are skipped.

 ### Stale record cleanup

@@ -67,8 +67,9 @@ When a WARC download succeeds, the scraper deletes any previous `failed-{urlHash
 ### Re-run safety

 Running the scraper on the same crawl again is safe:
-- `--force` flag: re-downloads everything but checks `source_url` before creating dup records, so existing records aren't duplicated
-- Without `--force`: all existing URLs are in `processedUrls` and skipped instantly
+- `--force`: re-downloads everything from scratch
+- `--retry-failed`: re-downloads only previously failed URLs
+- Default: all known URLs (uploaded + duplicate + failed) are skipped instantly

 ## Database

```
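The URL-dedup fast path described above reduces to a set lookup on the URL's md5 hash. A minimal sketch, assuming a hypothetical `shouldSkipUrl` helper (the real check lives in `packages/scraper/scraper.ts`; the pre-loaded hashes come from Postgres `md5(source_url)`, which yields the same lowercase hex as Node's md5 digest):

```ts
import { createHash } from "node:crypto";

// Sketch: decide whether a CDX entry can be skipped without downloading its WARC.
// processedHashes is the Set<string> pre-loaded via getProcessedUrlHashes() at startup.
function shouldSkipUrl(url: string, processedHashes: Set<string>): boolean {
  const urlHash = createHash("md5").update(url).digest("hex");
  return processedHashes.has(urlHash); // hit means uploaded, duplicate, or failed in a default run
}
```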

apps/cli/cli.test.ts

Lines changed: 3 additions & 0 deletions
```diff
@@ -5,6 +5,7 @@ interface ParsedFlags {
   crawlCount?: number;
   verbose?: boolean;
   force?: boolean;
+  retryFailed?: boolean;
 }

 function parseFlags(args: string[]): ParsedFlags {
@@ -35,6 +36,8 @@ function parseFlags(args: string[]): ParsedFlags {
       flags.verbose = true;
     } else if (arg === "--force" || arg === "-f") {
       flags.force = true;
+    } else if (arg === "--retry-failed") {
+      flags.retryFailed = true;
     }
   }

```
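Since the test file carries its own copy of the parser, the new flag can be exercised with a one-line assertion; a sketch (the test runner and test name are assumptions, only `parseFlags` comes from the diff above):

```ts
import { expect, test } from "bun:test"; // runner is an assumption

test("parses --retry-failed", () => {
  // parseFlags is the local copy shown in the diff above
  expect(parseFlags(["--retry-failed"]).retryFailed).toBe(true);
});
```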

apps/cli/commands/scrape.ts

Lines changed: 7 additions & 2 deletions
```diff
@@ -5,6 +5,7 @@ interface ParsedFlags {
   crawlCount?: number;
   verbose?: boolean;
   force?: boolean;
+  retryFailed?: boolean;
 }

 function parseFlags(args: string[]): ParsedFlags {
@@ -35,6 +36,8 @@ function parseFlags(args: string[]): ParsedFlags {
       flags.verbose = true;
     } else if (arg === "--force" || arg === "-f") {
       flags.force = true;
+    } else if (arg === "--retry-failed") {
+      flags.retryFailed = true;
     }
   }

@@ -52,8 +55,9 @@ Options
     <n>              Latest n crawls (e.g., --crawl 3)
     <id>             Single crawl ID
     <id>,<id>        Comma-separated list
-  --force            Re-process URLs already in database
-  --verbose          Show detailed logs for debugging
+  --force            Re-process all URLs from scratch
+  --retry-failed     Re-download only previously failed URLs
+  --verbose          Show detailed logs for debugging

 Environment Variables
   CRAWL_ID           Common Crawl index ID (e.g., CC-MAIN-2025-51)
@@ -101,6 +105,7 @@ export async function runScrape(args: string[]) {
     config,
     verbose: flags.verbose,
     force: flags.force,
+    retryFailed: flags.retryFailed,
     crawlIds,
     version: VERSION,
   });
```

packages/scraper/scraper.ts

Lines changed: 10 additions & 6 deletions
```diff
@@ -136,12 +136,13 @@ export interface ScrapeOptions {
   config: Config;
   verbose?: boolean;
   force?: boolean;
+  retryFailed?: boolean;
   crawlIds?: string[];
   version: string;
 }

 export async function scrape(options: ScrapeOptions) {
-  const { config, verbose, force, crawlIds, version } = options;
+  const { config, verbose, force, retryFailed, crawlIds, version } = options;
   const startTime = Date.now();
   const useCloud = hasCloudflareCredentials(config);

@@ -175,6 +176,7 @@ export async function scrape(options: ScrapeOptions) {
   keyValue("Crawl(s)", resolvedCrawlIds.join(", "));
   keyValue("Workers", config.crawl.concurrency);
   if (force) keyValue("Force", "re-process all URLs");
+  if (retryFailed) keyValue("Retry", "re-download failed URLs");
   if (verbose) keyValue("Verbose", "enabled");
   blank();

@@ -191,12 +193,14 @@ export async function scrape(options: ScrapeOptions) {
   // Initialize database
   const db = await createDb(config.database.url);

-  // Pre-load processed URL hashes for fast duplicate checking
-  // Failed URLs are excluded so they get retried (different WARC capture might succeed)
-  // Uses md5 hashes instead of full URLs to reduce network transfer (~6s vs ~14s)
+  // Pre-load known URL hashes for fast skip checking
+  // --retry-failed: exclude failed URLs so they get re-downloaded
+  // --force: skip loading entirely, re-process all URLs
   section("Loading");
-  const processedHashes = force ? new Set<string>() : await db.getProcessedUrlHashes();
-  keyValue("Processed", `${processedHashes.size} hashes (uploaded + duplicate)`);
+  const processedHashes = force
+    ? new Set<string>()
+    : await db.getProcessedUrlHashes({ excludeFailed: retryFailed });
+  keyValue("Processed", `${processedHashes.size} hashes`);

   // Aggregate stats across all crawls
   const totalStats = { saved: 0, skipped: 0, failed: 0 };
```

packages/shared/db.ts

Lines changed: 9 additions & 5 deletions
```diff
@@ -82,7 +82,7 @@ export interface DbClient {
   deleteDocument(id: string, crawlId?: string): Promise<void>;
   getDocument(id: string): Promise<DocumentRecord | null>;
   getDocumentByUrl(url: string): Promise<DocumentRecord | null>;
-  getProcessedUrlHashes(): Promise<Set<string>>;
+  getProcessedUrlHashes(opts?: { excludeFailed?: boolean }): Promise<Set<string>>;
   upsertDuplicateBatch(records: { id: string; sourceUrl: string; crawlId: string; filename: string }[]): Promise<void>;
   getUploadedUrls(): Promise<Set<string>>;
   getFailedUrls(): Promise<Set<string>>;
@@ -187,10 +187,14 @@ export async function createDb(databaseUrl: string): Promise<DbClient> {
       return rows[0] || null;
     },

-    async getProcessedUrlHashes() {
-      const rows = await sql<{ h: string }[]>`
-        SELECT md5(source_url) as h FROM documents WHERE status IN ('uploaded', 'duplicate')
-      `;
+    async getProcessedUrlHashes(opts?: { excludeFailed?: boolean }) {
+      const rows = opts?.excludeFailed
+        ? await sql<{ h: string }[]>`
+            SELECT md5(source_url) as h FROM documents WHERE status IN ('uploaded', 'duplicate')
+          `
+        : await sql<{ h: string }[]>`
+            SELECT md5(source_url) as h FROM documents
+          `;
       return new Set(rows.map((r) => r.h));
     },

```
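A short usage sketch of the widened signature (the import path and `DATABASE_URL` are assumptions standing in for `config.database.url`; the rest matches the diff):

```ts
import { createDb } from "./packages/shared/db"; // path is an assumption

const db = await createDb(process.env.DATABASE_URL ?? "");

// Default run: hashes for every known URL, failed ones included, so all are skipped
const allKnown = await db.getProcessedUrlHashes();

// --retry-failed run: uploaded + duplicate only, so failed URLs fall through and get retried
const retryable = await db.getProcessedUrlHashes({ excludeFailed: true });
```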
