
Commit 3d768df

fix: skip duplicate record when re-scraping same URL with --force
Only create a duplicate record when the content hash matches content already stored under a different URL. This prevents spurious duplicate records when re-running the same crawl with --force.
1 parent 72e733e commit 3d768df

1 file changed: 12 additions & 9 deletions

packages/scraper/scraper.ts
@@ -84,15 +84,18 @@ async function processRecord(record: CdxRecord, ctx: ProcessContext) {
   const existingByHash = await db.getDocument(hash);
   if (existingByHash && existingByHash.status === "uploaded") {
     stats.skipped++;
-    const urlHash = await computeHash(new TextEncoder().encode(record.url));
-    await db.upsertDocument({
-      id: `dup-${urlHash}`,
-      source_url: record.url,
-      crawl_id: crawlId,
-      original_filename: extractFilename(record.url),
-      status: "duplicate",
-      error_message: `duplicate content of ${hash}`,
-    });
+    // Only create duplicate record if it's actually a different URL
+    if (existingByHash.source_url !== record.url) {
+      const urlHash = await computeHash(new TextEncoder().encode(record.url));
+      await db.upsertDocument({
+        id: `dup-${urlHash}`,
+        source_url: record.url,
+        crawl_id: crawlId,
+        original_filename: extractFilename(record.url),
+        status: "duplicate",
+        error_message: `duplicate content of ${hash}`,
+      });
+    }
     return;
   }
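
For context, here is a self-contained TypeScript sketch of the guarded path. The in-memory db, the DocumentRow shape, and the computeHash stub are stand-ins invented for illustration (the real helpers live elsewhere in packages/scraper and are not part of this commit); only the existingByHash.source_url !== record.url guard mirrors the change above. Stats tracking and the crawl_id/original_filename fields are omitted for brevity.

// A minimal, runnable sketch of the new duplicate guard. Assumes a runtime
// with Web Crypto available as a global (modern Node, Deno, or a browser).

interface DocumentRow {
  id: string;
  source_url: string;
  status: string;
  error_message?: string;
}

// In-memory stand-in for the real database helpers. For simplicity the
// content hash doubles as the document id here.
const store = new Map<string, DocumentRow>();

const db = {
  async getDocument(id: string): Promise<DocumentRow | undefined> {
    return store.get(id);
  },
  async upsertDocument(doc: DocumentRow): Promise<void> {
    store.set(doc.id, doc);
  },
};

// Hex-encoded SHA-256, mirroring how computeHash is called in the diff.
async function computeHash(data: Uint8Array): Promise<string> {
  const digest = await crypto.subtle.digest("SHA-256", data);
  return Array.from(new Uint8Array(digest))
    .map((b) => b.toString(16).padStart(2, "0"))
    .join("");
}

async function maybeRecordDuplicate(url: string, hash: string): Promise<void> {
  const existingByHash = await db.getDocument(hash);
  if (existingByHash && existingByHash.status === "uploaded") {
    // The guard added by this commit: the same URL re-scraped with --force
    // is not a duplicate; only a different URL with identical content is.
    if (existingByHash.source_url !== url) {
      const urlHash = await computeHash(new TextEncoder().encode(url));
      await db.upsertDocument({
        id: `dup-${urlHash}`,
        source_url: url,
        status: "duplicate",
        error_message: `duplicate content of ${hash}`,
      });
    }
  }
}

// Usage: seed one uploaded document, then re-process the same URL (no
// dup- row) and a different URL with identical content (dup- row created).
async function demo(): Promise<void> {
  const hash = "abc123";
  store.set(hash, { id: hash, source_url: "https://example.com/a.pdf", status: "uploaded" });

  await maybeRecordDuplicate("https://example.com/a.pdf", hash);
  await maybeRecordDuplicate("https://example.com/b.pdf", hash);

  console.log([...store.keys()].length); // 2: the original plus one dup- record
}

demo().catch(console.error);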
