Commit bf83dcd

fix: remove dup batching and add unhandled rejection handler
- Simplify cross-crawl dup inserts (inline instead of batched)
- Add process.on('unhandledRejection') to surface silent crashes
1 parent 324e4ea commit bf83dcd
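
For context on the second change, here is a minimal standalone sketch (not part of scraper.ts) of the Node.js behavior the new listener addresses: a promise rejection that is never awaited or caught will, by default, terminate the process with little context, whereas a registered "unhandledRejection" listener lets the error be logged and surfaced instead.

// Standalone sketch, independent of this repo: without the listener,
// Node.js (v15+) exits on an unhandled promise rejection.
process.on("unhandledRejection", (reason) => {
  console.error("[FATAL] Unhandled rejection:", reason);
});

// A fire-and-forget async call whose rejection would otherwise go unhandled.
async function mightFail(): Promise<void> {
  throw new Error("simulated failure");
}

mightFail(); // intentionally not awaited; the listener above logs the rejection

Registering the listener inside scrape() (as the diff does) scopes it to the crawl run; the sketch above only shows the same mechanism in isolation.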

1 file changed: 12 additions & 24 deletions

packages/scraper/scraper.ts

@@ -140,6 +140,11 @@ export async function scrape(options: ScrapeOptions) {
   const startTime = Date.now();
   const useCloud = hasCloudflareCredentials(config);
 
+  // Catch unhandled rejections so they don't silently crash the process
+  process.on("unhandledRejection", (err) => {
+    console.error("\n[FATAL] Unhandled rejection:", err);
+  });
+
   header("docx-corpus", version);
 
   // Resolve crawl IDs: from param, config, or fetch latest
@@ -267,25 +272,6 @@ export async function scrape(options: ScrapeOptions) {
 
   const tasks: Set<Promise<void>> = new Set();
 
-  // Batch cross-crawl dup inserts for performance
-  const pendingDups: { id: string; url: string; filename: string }[] = [];
-  const DUP_BATCH_SIZE = 100;
-
-  const flushDups = async () => {
-    if (pendingDups.length === 0) return;
-    const batch = pendingDups.splice(0);
-    for (const dup of batch) {
-      await db.upsertDocument({
-        id: dup.id,
-        source_url: dup.url,
-        crawl_id: crawlId,
-        original_filename: dup.filename,
-        status: "duplicate",
-        error_message: "cross-crawl duplicate",
-      });
-    }
-  };
-
   try {
     for await (const record of streamCdxFromR2(config, crawlId)) {
       if (stats.saved >= batchSize) break;
@@ -295,15 +281,18 @@ export async function scrape(options: ScrapeOptions) {
       // Fast skip: URL already processed (outside downloadLimit for instant throughput)
       if (!force && processedUrls.has(record.url)) {
         stats.skipped++;
+        // Create cross-crawl dup record if URL exists in DB but not under this crawl
         if (!crawlUrls.has(record.url)) {
           const crawlScopedHash = await computeHash(new TextEncoder().encode(record.url + crawlId));
-          pendingDups.push({
+          await db.upsertDocument({
             id: `dup-${crawlScopedHash}`,
-            url: record.url,
-            filename: extractFilename(record.url),
+            source_url: record.url,
+            crawl_id: crawlId,
+            original_filename: extractFilename(record.url),
+            status: "duplicate",
+            error_message: "cross-crawl duplicate",
           });
           crawlUrls.add(record.url);
-          if (pendingDups.length >= DUP_BATCH_SIZE) await flushDups();
         }
         updateProgress();
         continue;
@@ -348,7 +337,6 @@ export async function scrape(options: ScrapeOptions) {
     }
 
     await Promise.all(tasks);
-    await flushDups();
     clearLines(prevLineCount);
 
     // Accumulate totals
