Skip to content

Commit e9728a1

Browse files
Merge pull request #35 from devopsabcs-engineering/feature/fix-crawl-isolated-queue
fix(crawler): isolate crawlee request queue per crawl
2 parents a40ab11 + f5db1d3 commit e9728a1

2 files changed

Lines changed: 13 additions & 9 deletions

File tree

src/lib/crawler/__tests__/site-crawler.test.ts

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,9 @@ vi.mock('crawlee', () => ({
     this.run = vi.fn().mockResolvedValue(undefined);
     return this;
   }),
-  Configuration: {
-    getGlobalConfig: vi.fn().mockReturnValue({ set: vi.fn() }),
-  },
-  purgeDefaultStorages: vi.fn().mockResolvedValue(undefined),
+  Configuration: vi.fn().mockImplementation(function () {
+    return { get: vi.fn(), set: vi.fn() };
+  }),
 }));

 vi.mock('uuid', () => ({ v4: vi.fn().mockReturnValue('mock-page-id') }));

src/lib/crawler/site-crawler.ts

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
-import { PlaywrightCrawler, Configuration, purgeDefaultStorages, type PlaywrightCrawlingContext } from 'crawlee';
+import { PlaywrightCrawler, Configuration, type PlaywrightCrawlingContext } from 'crawlee';
 import { v4 as uuidv4 } from 'uuid';
 import { scanPage } from '../scanner/engine';
 import { parseAxeResults } from '../scanner/result-parser';
@@ -84,8 +84,14 @@ export async function startCrawl(
   });
   emitProgress(crawlId, completedPages, onProgress);

-  // Prevent crawlee from writing state to disk
-  Configuration.getGlobalConfig().set('persistStorage', false);
+  // Each crawl gets its own Configuration so crawlee uses a fresh,
+  // isolated request queue. Without this, the default singleton queue
+  // retains "already handled" URLs from previous crawls, causing
+  // subsequent crawls to immediately shut down with 0 results.
+  const crawlConfig = new Configuration({
+    persistStorage: false,
+    storageClientOptions: { localDataDirectory: `/tmp/crawlee-${crawlId}` },
+  });

   // Track depth per URL
   const urlDepth = new Map<string, number>();
@@ -96,6 +102,7 @@ export async function startCrawl(
     maxConcurrency: config.concurrency,
     requestHandlerTimeoutSecs: 60,
     navigationTimeoutSecs: 30,
+    configuration: crawlConfig,
     launchContext: {
       launchOptions: {
         headless: true,
@@ -309,8 +316,6 @@ export async function startCrawl(
   } finally {
     activeAbortControllers.delete(crawlId);
     clearRobotsCache();
-    // Purge crawlee's internal storage to prevent stale state across crawl runs
-    await purgeDefaultStorages();
   }
 }

0 commit comments

Comments (0)