
Commit e65ed85

feat: add --retry-failed flag, skip failed URLs by default
Default runs now skip all known URLs (uploaded + duplicate + failed) for fast scans. Use --retry-failed to re-download only failures, or --force to re-process everything from scratch.
1 parent 4061de6 commit e65ed85

5 files changed: 34 additions & 17 deletions
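In practice the three modes map onto the CLI like this (a sketch against the `runScrape(args)` entry point shown in `apps/cli/commands/scrape.ts` below; the import path and top-level await context are assumptions):

```ts
import { runScrape } from "./apps/cli/commands/scrape"; // path is an assumption

// Default: every known URL (uploaded + duplicate + failed) is skipped instantly
await runScrape([]);

// Re-download only the URLs that previously failed
await runScrape(["--retry-failed"]);

// Re-process everything from scratch
await runScrape(["--force"]);
```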

CLAUDE.md

Lines changed: 5 additions & 4 deletions
```diff
@@ -54,11 +54,11 @@ IDs are content-addressed for storage mapping (`documents/{id}.docx`):

 The scraper handles three dedup scenarios in order:

-1. **URL-dedup** (instant, no download) — URL already in `processedUrls` Set (loaded from all crawls at startup). If the URL exists under a different `crawl_id`, creates a cross-crawl `duplicate` record under the current crawl. If already under the current crawl, silently skips.
+1. **URL-dedup** (instant, no download) — URL already in `processedHashes` Set (md5 hashes loaded from all crawls at startup). Includes uploaded, duplicate, AND failed URLs by default. If the URL exists under a different `crawl_id`, creates a cross-crawl `duplicate` record under the current crawl. If already under the current crawl, silently skips.

 2. **Content-dedup** (requires WARC download) — URL is new but content hash matches an existing document. Creates a `duplicate` record pointing to the original.

-3. **Same-URL retry** (within same crawl) — Same URL appears multiple times in CDX files (different WARC captures). After a successful WARC download, the URL is added to `processedUrls` so subsequent entries are skipped. Failed downloads do NOT add to `processedUrls`, allowing retry from a different WARC capture.
+3. **Same-URL retry** (within same crawl) — Same URL appears multiple times in CDX files (different WARC captures). After a successful WARC download, the URL is added to `processedHashes` so subsequent entries are skipped.

 ### Stale record cleanup

@@ -67,8 +67,9 @@ When a WARC download succeeds, the scraper deletes any previous `failed-{urlHash
 ### Re-run safety

 Running the scraper on the same crawl again is safe:
-- `--force` flag: re-downloads everything but checks `source_url` before creating dup records, so existing records aren't duplicated
-- Without `--force`: all existing URLs are in `processedUrls` and skipped instantly
+- `--force`: re-downloads everything from scratch
+- `--retry-failed`: re-downloads only previously failed URLs
+- Default: all known URLs (uploaded + duplicate + failed) are skipped instantly

 ## Database

```
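The URL-dedup fast path described above reduces to a set lookup on the URL's md5 hash. A minimal sketch, assuming a hypothetical `shouldSkipUrl` helper (the real check lives in `packages/scraper/scraper.ts`; the pre-loaded hashes come from Postgres `md5(source_url)`, which yields the same lowercase hex as Node's md5 digest):

```ts
import { createHash } from "node:crypto";

// Sketch: decide whether a CDX entry can be skipped without downloading its WARC.
// processedHashes is the Set<string> pre-loaded via getProcessedUrlHashes() at startup.
function shouldSkipUrl(url: string, processedHashes: Set<string>): boolean {
  const urlHash = createHash("md5").update(url).digest("hex");
  return processedHashes.has(urlHash); // hit means uploaded, duplicate, or failed in a default run
}
```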

apps/cli/cli.test.ts

Lines changed: 3 additions & 0 deletions
```diff
@@ -5,6 +5,7 @@ interface ParsedFlags {
   crawlCount?: number;
   verbose?: boolean;
   force?: boolean;
+  retryFailed?: boolean;
 }

 function parseFlags(args: string[]): ParsedFlags {
@@ -35,6 +36,8 @@ function parseFlags(args: string[]): ParsedFlags {
       flags.verbose = true;
     } else if (arg === "--force" || arg === "-f") {
       flags.force = true;
+    } else if (arg === "--retry-failed") {
+      flags.retryFailed = true;
     }
   }

```
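Since the test file carries its own copy of the parser, the new flag can be exercised with a one-line assertion; a sketch (the test runner and test name are assumptions, only `parseFlags` comes from the diff above):

```ts
import { expect, test } from "bun:test"; // runner is an assumption

test("parses --retry-failed", () => {
  // parseFlags is the local copy shown in the diff above
  expect(parseFlags(["--retry-failed"]).retryFailed).toBe(true);
});
```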

apps/cli/commands/scrape.ts

Lines changed: 7 additions & 2 deletions
```diff
@@ -5,6 +5,7 @@ interface ParsedFlags {
   crawlCount?: number;
   verbose?: boolean;
   force?: boolean;
+  retryFailed?: boolean;
 }

 function parseFlags(args: string[]): ParsedFlags {
@@ -35,6 +36,8 @@ function parseFlags(args: string[]): ParsedFlags {
       flags.verbose = true;
     } else if (arg === "--force" || arg === "-f") {
       flags.force = true;
+    } else if (arg === "--retry-failed") {
+      flags.retryFailed = true;
     }
   }

@@ -52,8 +55,9 @@ Options
     <n>              Latest n crawls (e.g., --crawl 3)
     <id>             Single crawl ID
     <id>,<id>        Comma-separated list
-  --force            Re-process URLs already in database
-  --verbose          Show detailed logs for debugging
+  --force            Re-process all URLs from scratch
+  --retry-failed     Re-download only previously failed URLs
+  --verbose          Show detailed logs for debugging

 Environment Variables
   CRAWL_ID           Common Crawl index ID (e.g., CC-MAIN-2025-51)
@@ -101,6 +105,7 @@ export async function runScrape(args: string[]) {
     config,
     verbose: flags.verbose,
     force: flags.force,
+    retryFailed: flags.retryFailed,
     crawlIds,
     version: VERSION,
   });
```

packages/scraper/scraper.ts

Lines changed: 10 additions & 6 deletions
```diff
@@ -136,12 +136,13 @@ export interface ScrapeOptions {
   config: Config;
   verbose?: boolean;
   force?: boolean;
+  retryFailed?: boolean;
   crawlIds?: string[];
   version: string;
 }

 export async function scrape(options: ScrapeOptions) {
-  const { config, verbose, force, crawlIds, version } = options;
+  const { config, verbose, force, retryFailed, crawlIds, version } = options;
   const startTime = Date.now();
   const useCloud = hasCloudflareCredentials(config);

@@ -175,6 +176,7 @@ export async function scrape(options: ScrapeOptions) {
   keyValue("Crawl(s)", resolvedCrawlIds.join(", "));
   keyValue("Workers", config.crawl.concurrency);
   if (force) keyValue("Force", "re-process all URLs");
+  if (retryFailed) keyValue("Retry", "re-download failed URLs");
   if (verbose) keyValue("Verbose", "enabled");
   blank();

@@ -191,12 +193,14 @@ export async function scrape(options: ScrapeOptions) {
   // Initialize database
   const db = await createDb(config.database.url);

-  // Pre-load processed URL hashes for fast duplicate checking
-  // Failed URLs are excluded so they get retried (different WARC capture might succeed)
-  // Uses md5 hashes instead of full URLs to reduce network transfer (~6s vs ~14s)
+  // Pre-load known URL hashes for fast skip checking
+  // --retry-failed: exclude failed URLs so they get re-downloaded
+  // --force: skip loading entirely, re-process all URLs
   section("Loading");
-  const processedHashes = force ? new Set<string>() : await db.getProcessedUrlHashes();
-  keyValue("Processed", `${processedHashes.size} hashes (uploaded + duplicate)`);
+  const processedHashes = force
+    ? new Set<string>()
+    : await db.getProcessedUrlHashes({ excludeFailed: retryFailed });
+  keyValue("Processed", `${processedHashes.size} hashes`);

   // Aggregate stats across all crawls
   const totalStats = { saved: 0, skipped: 0, failed: 0 };
```

packages/shared/db.ts

Lines changed: 9 additions & 5 deletions
```diff
@@ -82,7 +82,7 @@ export interface DbClient {
   deleteDocument(id: string, crawlId?: string): Promise<void>;
   getDocument(id: string): Promise<DocumentRecord | null>;
   getDocumentByUrl(url: string): Promise<DocumentRecord | null>;
-  getProcessedUrlHashes(): Promise<Set<string>>;
+  getProcessedUrlHashes(opts?: { excludeFailed?: boolean }): Promise<Set<string>>;
   upsertDuplicateBatch(records: { id: string; sourceUrl: string; crawlId: string; filename: string }[]): Promise<void>;
   getUploadedUrls(): Promise<Set<string>>;
   getFailedUrls(): Promise<Set<string>>;
@@ -187,10 +187,14 @@ export async function createDb(databaseUrl: string): Promise<DbClient> {
       return rows[0] || null;
     },

-    async getProcessedUrlHashes() {
-      const rows = await sql<{ h: string }[]>`
-        SELECT md5(source_url) as h FROM documents WHERE status IN ('uploaded', 'duplicate')
-      `;
+    async getProcessedUrlHashes(opts?: { excludeFailed?: boolean }) {
+      const rows = opts?.excludeFailed
+        ? await sql<{ h: string }[]>`
+            SELECT md5(source_url) as h FROM documents WHERE status IN ('uploaded', 'duplicate')
+          `
+        : await sql<{ h: string }[]>`
+            SELECT md5(source_url) as h FROM documents
+          `;
       return new Set(rows.map((r) => r.h));
     },

```
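A short usage sketch of the widened signature (the import path and `DATABASE_URL` are assumptions standing in for `config.database.url`; the rest matches the diff):

```ts
import { createDb } from "./packages/shared/db"; // path is an assumption

const db = await createDb(process.env.DATABASE_URL ?? "");

// Default run: hashes for every known URL, failed ones included, so all are skipped
const allKnown = await db.getProcessedUrlHashes();

// --retry-failed run: uploaded + duplicate only, so failed URLs fall through and get retried
const retryable = await db.getProcessedUrlHashes({ excludeFailed: true });
```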
