@@ -140,6 +140,11 @@ export async function scrape(options: ScrapeOptions) {
   const startTime = Date.now();
   const useCloud = hasCloudflareCredentials(config);
 
+  // Catch unhandled rejections so they don't silently crash the process
+  process.on("unhandledRejection", (err) => {
+    console.error("\n[FATAL] Unhandled rejection:", err);
+  });
+
   header("docx-corpus", version);
 
   // Resolve crawl IDs: from param, config, or fetch latest
@@ -267,25 +272,6 @@ export async function scrape(options: ScrapeOptions) {
 
   const tasks: Set<Promise<void>> = new Set();
 
-  // Batch cross-crawl dup inserts for performance
-  const pendingDups: { id: string; url: string; filename: string }[] = [];
-  const DUP_BATCH_SIZE = 100;
-
-  const flushDups = async () => {
-    if (pendingDups.length === 0) return;
-    const batch = pendingDups.splice(0);
-    for (const dup of batch) {
-      await db.upsertDocument({
-        id: dup.id,
-        source_url: dup.url,
-        crawl_id: crawlId,
-        original_filename: dup.filename,
-        status: "duplicate",
-        error_message: "cross-crawl duplicate",
-      });
-    }
-  };
-
   try {
     for await (const record of streamCdxFromR2(config, crawlId)) {
       if (stats.saved >= batchSize) break;
@@ -295,15 +281,18 @@ export async function scrape(options: ScrapeOptions) {
       // Fast skip: URL already processed (outside downloadLimit for instant throughput)
       if (!force && processedUrls.has(record.url)) {
         stats.skipped++;
+        // Create cross-crawl dup record if URL exists in DB but not under this crawl
         if (!crawlUrls.has(record.url)) {
           const crawlScopedHash = await computeHash(new TextEncoder().encode(record.url + crawlId));
-          pendingDups.push({
+          await db.upsertDocument({
             id: `dup-${crawlScopedHash}`,
-            url: record.url,
-            filename: extractFilename(record.url),
+            source_url: record.url,
+            crawl_id: crawlId,
+            original_filename: extractFilename(record.url),
+            status: "duplicate",
+            error_message: "cross-crawl duplicate",
           });
           crawlUrls.add(record.url);
-          if (pendingDups.length >= DUP_BATCH_SIZE) await flushDups();
         }
         updateProgress();
         continue;
@@ -348,7 +337,6 @@ export async function scrape(options: ScrapeOptions) {
     }
 
     await Promise.all(tasks);
-    await flushDups();
     clearLines(prevLineCount);
 
     // Accumulate totals
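
For reference, a minimal standalone sketch of the process-level handler added in the first hunk. The behavior described is standard Node.js (v15+); the log format mirrors the diff, and the simulated rejection is purely illustrative, not part of the scraper.

```ts
// Without a listener, Node.js (v15+) terminates the process on an unhandled
// promise rejection; registering one turns it into a logged error so a long
// scrape run isn't killed by a single stray rejection.
process.on("unhandledRejection", (err) => {
  console.error("\n[FATAL] Unhandled rejection:", err);
});

// Illustrative only: a rejected promise with no .catch() now logs instead of
// crashing the process.
Promise.reject(new Error("simulated task failure"));
```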