1- import { PlaywrightCrawler , Configuration , type PlaywrightCrawlingContext } from 'crawlee' ;
1+ import { PlaywrightCrawler , Configuration , RequestQueue , purgeDefaultStorages , type PlaywrightCrawlingContext } from 'crawlee' ;
22import { v4 as uuidv4 } from 'uuid' ;
33import { scanPage } from '../scanner/engine' ;
44import { parseAxeResults } from '../scanner/result-parser' ;
@@ -43,6 +43,9 @@ export async function startCrawl(
4343 // We update this on the first request so domain boundary checks use the post-redirect hostname.
4444 let effectiveSeedUrl = seedUrl ;
4545
46+ // Per-crawl request queue — declared here so it can be dropped in finally
47+ let requestQueue : Awaited < ReturnType < typeof RequestQueue . open > > | undefined ;
48+
4649 try {
4750 // Phase: discovering
4851 updateCrawl ( crawlId , { status : 'discovering' , progress : 5 , message : 'Fetching robots.txt and sitemaps...' } ) ;
@@ -84,14 +87,14 @@ export async function startCrawl(
8487 } ) ;
8588 emitProgress ( crawlId , completedPages , onProgress ) ;
8689
87- // Each crawl gets its own Configuration so crawlee uses a fresh ,
88- // isolated request queue. Without this, the default singleton queue
89- // retains "already handled" URLs from previous crawls, causing
90- // subsequent crawls to immediately shut down with 0 results.
91- const crawlConfig = new Configuration ( {
92- persistStorage : false ,
93- storageClientOptions : { localDataDirectory : `/tmp/crawlee- ${ crawlId } ` } ,
94- } ) ;
90+ // Purge default storages before each crawl to clear any stale state ,
91+ // then open a uniquely-named RequestQueue so this crawl is fully isolated
92+ // from previous runs. Without this, crawlee's default singleton queue
93+ // retains "already handled" URLs, causing subsequent crawls to shut down
94+ // with 0 results.
95+ Configuration . getGlobalConfig ( ) . set ( 'persistStorage' , false ) ;
96+ await purgeDefaultStorages ( ) ;
97+ requestQueue = await RequestQueue . open ( `crawl-${crawlId}` ) ;
9598
9699 // Track depth per URL
97100 const urlDepth = new Map < string , number > ( ) ;
@@ -102,7 +105,7 @@ export async function startCrawl(
102105 maxConcurrency : config . concurrency ,
103106 requestHandlerTimeoutSecs : 60 ,
104107 navigationTimeoutSecs : 30 ,
105- configuration : crawlConfig ,
108+ requestQueue ,
106109 launchContext : {
107110 launchOptions : {
108111 headless : true ,
@@ -316,6 +319,10 @@ export async function startCrawl(
316319 } finally {
317320 activeAbortControllers . delete ( crawlId ) ;
318321 clearRobotsCache ( ) ;
322+ // Drop the per-crawl request queue to free memory
323+ if ( requestQueue ) {
324+ await requestQueue . drop ( ) . catch ( ( ) => { } ) ;
325+ }
319326 }
320327}
321328
0 commit comments