
Commit a4bef9d

Merge pull request #36 from devopsabcs-engineering/feature/fix-crawl-requestqueue
fix(crawler): use per-crawl RequestQueue for isolation (crawlee v3 compatible)
2 parents: e9728a1 + 111d6d2

2 files changed: 26 additions & 13 deletions


src/lib/crawler/__tests__/site-crawler.test.ts

Lines changed: 9 additions & 3 deletions
```diff
@@ -10,9 +10,15 @@ vi.mock('crawlee', () => ({
     this.run = vi.fn().mockResolvedValue(undefined);
     return this;
   }),
-  Configuration: vi.fn().mockImplementation(function () {
-    return { get: vi.fn(), set: vi.fn() };
-  }),
+  Configuration: {
+    getGlobalConfig: vi.fn().mockReturnValue({ set: vi.fn() }),
+  },
+  RequestQueue: {
+    open: vi.fn().mockResolvedValue({
+      drop: vi.fn().mockResolvedValue(undefined),
+    }),
+  },
+  purgeDefaultStorages: vi.fn().mockResolvedValue(undefined),
 }));
 
 vi.mock('uuid', () => ({ v4: vi.fn().mockReturnValue('mock-page-id') }));
```
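The mock's new shape mirrors how the production code (see the second file below) now consumes crawlee's API: statics and a top-level function instead of constructors. A minimal sketch of that surface, illustrative only and not part of the commit; `exerciseCrawleeSurface` is a hypothetical name:

```ts
// Sketch (not part of this commit): the crawlee surface the updated mocks
// must cover. Each call below corresponds to one mocked member.
import { Configuration, RequestQueue, purgeDefaultStorages } from 'crawlee';

async function exerciseCrawleeSurface(crawlId: string): Promise<void> {
  // Configuration is used via the static global-config accessor, not `new`.
  Configuration.getGlobalConfig().set('persistStorage', false);

  // purgeDefaultStorages is a top-level async function.
  await purgeDefaultStorages();

  // RequestQueue is opened via its static open(); the result exposes drop().
  const queue = await RequestQueue.open(`crawl-${crawlId}`);
  await queue.drop();
}
```

Mocking `Configuration` as a plain object with `getGlobalConfig` (rather than a constructor returning `{ get, set }`) is what keeps the test double aligned with this call pattern.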

src/lib/crawler/site-crawler.ts

Lines changed: 17 additions & 10 deletions
```diff
@@ -1,4 +1,4 @@
-import { PlaywrightCrawler, Configuration, type PlaywrightCrawlingContext } from 'crawlee';
+import { PlaywrightCrawler, Configuration, RequestQueue, purgeDefaultStorages, type PlaywrightCrawlingContext } from 'crawlee';
 import { v4 as uuidv4 } from 'uuid';
 import { scanPage } from '../scanner/engine';
 import { parseAxeResults } from '../scanner/result-parser';
@@ -43,6 +43,9 @@ export async function startCrawl(
   // We update this on the first request so domain boundary checks use the post-redirect hostname.
   let effectiveSeedUrl = seedUrl;
 
+  // Per-crawl request queue — declared here so it can be dropped in finally
+  let requestQueue: Awaited<ReturnType<typeof RequestQueue.open>> | undefined;
+
   try {
     // Phase: discovering
     updateCrawl(crawlId, { status: 'discovering', progress: 5, message: 'Fetching robots.txt and sitemaps...' });
@@ -84,14 +87,14 @@
     });
     emitProgress(crawlId, completedPages, onProgress);
 
-    // Each crawl gets its own Configuration so crawlee uses a fresh,
-    // isolated request queue. Without this, the default singleton queue
-    // retains "already handled" URLs from previous crawls, causing
-    // subsequent crawls to immediately shut down with 0 results.
-    const crawlConfig = new Configuration({
-      persistStorage: false,
-      storageClientOptions: { localDataDirectory: `/tmp/crawlee-${crawlId}` },
-    });
+    // Purge default storages before each crawl to clear any stale state,
+    // then open a uniquely-named RequestQueue so this crawl is fully isolated
+    // from previous runs. Without this, crawlee's default singleton queue
+    // retains "already handled" URLs, causing subsequent crawls to shut down
+    // with 0 results.
+    Configuration.getGlobalConfig().set('persistStorage', false);
+    await purgeDefaultStorages();
+    requestQueue = await RequestQueue.open(`crawl-${crawlId}`);
 
     // Track depth per URL
     const urlDepth = new Map<string, number>();
@@ -102,7 +105,7 @@
       maxConcurrency: config.concurrency,
       requestHandlerTimeoutSecs: 60,
       navigationTimeoutSecs: 30,
-      configuration: crawlConfig,
+      requestQueue,
       launchContext: {
         launchOptions: {
           headless: true,
@@ -316,6 +319,10 @@
   } finally {
     activeAbortControllers.delete(crawlId);
     clearRobotsCache();
+    // Drop the per-crawl request queue to free memory
+    if (requestQueue) {
+      await requestQueue.drop().catch(() => {});
+    }
   }
 }
```