1- import { PlaywrightCrawler , Configuration , purgeDefaultStorages , type PlaywrightCrawlingContext } from 'crawlee' ;
1+ import { PlaywrightCrawler , Configuration , type PlaywrightCrawlingContext } from 'crawlee' ;
22import { v4 as uuidv4 } from 'uuid' ;
33import { scanPage } from '../scanner/engine' ;
44import { parseAxeResults } from '../scanner/result-parser' ;
@@ -84,8 +84,14 @@ export async function startCrawl(
8484 } ) ;
8585 emitProgress ( crawlId , completedPages , onProgress ) ;
8686
87- // Prevent crawlee from writing state to disk
88- Configuration . getGlobalConfig ( ) . set ( 'persistStorage' , false ) ;
87+ // Each crawl gets its own Configuration so crawlee uses a fresh,
88+ // isolated request queue. Without this, the default singleton queue
89+ // retains "already handled" URLs from previous crawls, causing
90+ // subsequent crawls to immediately shut down with 0 results.
91+ const crawlConfig = new Configuration ( {
92+ persistStorage : false ,
93+ storageClientOptions : { localDataDirectory : `/tmp/crawlee-${ crawlId } ` } ,
94+ } ) ;
8995
9096 // Track depth per URL
9197 const urlDepth = new Map < string , number > ( ) ;
@@ -96,6 +102,7 @@ export async function startCrawl(
96102 maxConcurrency : config . concurrency ,
97103 requestHandlerTimeoutSecs : 60 ,
98104 navigationTimeoutSecs : 30 ,
105+ configuration : crawlConfig ,
99106 launchContext : {
100107 launchOptions : {
101108 headless : true ,
@@ -309,8 +316,6 @@ export async function startCrawl(
309316 } finally {
310317 activeAbortControllers . delete ( crawlId ) ;
311318 clearRobotsCache ( ) ;
312- // Purge crawlee's internal storage to prevent stale state across crawl runs
313- await purgeDefaultStorages ( ) ;
314319 }
315320}
316321