
Commit 14170a1

Merge pull request #28 from devopsabcs-engineering/feature/fix-crawl-seed-cap
fix(crawler): cap seed URLs to prevent crawlee premature termination
2 parents: 2b364ac + a0e6a4b

2 files changed: 38 additions & 2 deletions

src/lib/crawler/__tests__/site-crawler.test.ts

Lines changed: 27 additions & 0 deletions
@@ -216,6 +216,33 @@ describe('site-crawler', () => {
       const abortUpdate = calls.find(([, data]) => 'abortController' in data);
       expect(abortUpdate).toBeDefined();
     });
+
+    it('caps seed URLs below maxPages to prevent crawlee premature termination', async () => {
+      // Simulate a large sitemap that returns more URLs than maxPages
+      const sitemapUrls = Array.from({ length: 200 }, (_, i) => `https://example.com/page${i}`);
+      vi.mocked(discoverSitemapUrls).mockResolvedValueOnce(sitemapUrls);
+
+      const crawlee = await import('crawlee');
+      let capturedRunArg: string[] = [];
+      vi.mocked(crawlee.PlaywrightCrawler).mockImplementationOnce(function (this: Record<string, unknown>, options: Record<string, unknown>) {
+        _capturedOptions = options;
+        this.run = vi.fn().mockImplementation((urls: string[]) => {
+          capturedRunArg = urls;
+          return Promise.resolve();
+        });
+        return this;
+      });
+
+      const config = { ...defaultConfig, maxPages: 50 };
+      await startCrawl('crawl-cap', 'https://example.com', config);
+
+      // Should pass fewer than maxPages seed URLs to prevent the
+      // crawler from hitting maxRequestsPerCrawl during the enqueue phase
+      expect(capturedRunArg.length).toBeLessThan(config.maxPages);
+      expect(capturedRunArg.length).toBe(config.maxPages - 1);
+      // The original seed URL must always be included first
+      expect(capturedRunArg[0]).toBe('https://example.com');
+    });
   });
 
   describe('cancelCrawl', () => {
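
A note on the mocking pattern in this test: since startCrawl constructs the crawler with new, the test intercepts the PlaywrightCrawler constructor itself. A classic function expression (rather than an arrow function) is used so that `this` refers to the instance under construction, which lets the mock attach a stubbed run() that records its arguments. Below is a minimal self-contained sketch of the same pattern, assuming vitest; CrawlerLike is a hypothetical stand-in for the mocked class:

import { vi, it, expect } from 'vitest';

// Hypothetical stand-in; in the test above, vi.mock('crawlee') swaps in a
// mock of PlaywrightCrawler that is intercepted the same way.
interface CrawlerLike {
  run(urls: string[]): Promise<void>;
}

it('a constructor mock can capture the argument passed to run()', async () => {
  let captured: string[] = [];
  // A classic `function` is used so `this` is the instance created by `new`;
  // an arrow function would capture the enclosing `this` instead.
  const MockCrawler = vi.fn(function (this: CrawlerLike) {
    this.run = vi.fn(async (urls: string[]) => {
      captured = urls;
    });
  }) as unknown as new () => CrawlerLike;

  const crawler = new MockCrawler();
  await crawler.run(['https://example.com']);
  expect(captured).toEqual(['https://example.com']);
});

The real test uses mockImplementationOnce, which scopes the stub to a single construction, so the other tests in the file keep their default mock.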

src/lib/crawler/site-crawler.ts

Lines changed: 11 additions & 2 deletions
@@ -263,8 +263,17 @@ export async function startCrawl(
     },
   });
 
-  // Run the crawler with seed URLs
-  await crawler.run(Array.from(seedUrls));
+  // Run the crawler with seed URLs.
+  // Cap the seed list to (maxPages - 1) so crawlee's internal request counter
+  // does not reach maxRequestsPerCrawl during the initial enqueue phase.
+  // When the limit is hit before the processing loop starts, the crawler
+  // terminates immediately with 0 pages processed (observed with large
+  // sitemaps such as ontario.ca and microsoft.com).
+  const primarySeed = normalizeUrl(seedUrl);
+  const remainingSeeds = Array.from(seedUrls).filter(u => u !== primarySeed);
+  const maxSeeds = Math.max(1, config.maxPages - 1);
+  const cappedSeeds = [primarySeed, ...remainingSeeds].slice(0, maxSeeds);
+  await crawler.run(cappedSeeds);
 
   // Aggregation phase
   updateCrawl(crawlId, { status: 'aggregating', progress: 95, message: 'Aggregating results...' });
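
Read as a pure function, the new logic normalizes the starting URL, keeps it first, and truncates the seed list to at most maxPages - 1 entries (but never below one). A minimal sketch of the capping rule; capSeeds is a hypothetical helper, since the commit inlines this logic in startCrawl and uses the project's own normalizeUrl:

// Hypothetical helper illustrating the capping rule from the diff above.
function capSeeds(primarySeed: string, seedUrls: Set<string>, maxPages: number): string[] {
  const remaining = Array.from(seedUrls).filter(u => u !== primarySeed);
  // Reserve at least one request slot for crawling discovered links, so the
  // enqueue phase alone can never reach maxRequestsPerCrawl; keep >= 1 seed.
  const maxSeeds = Math.max(1, maxPages - 1);
  return [primarySeed, ...remaining].slice(0, maxSeeds);
}

// Worked example mirroring the new test: a 200-URL sitemap with maxPages = 50.
const sitemap = new Set(
  Array.from({ length: 200 }, (_, i) => `https://example.com/page${i}`),
);
const seeds = capSeeds('https://example.com', sitemap, 50);
console.log(seeds.length); // 49, i.e. maxPages - 1
console.log(seeds[0]);     // 'https://example.com' (the primary seed stays first)

Assuming maxRequestsPerCrawl is set to maxPages, seeding 49 requests leaves the counter below the limit when run() finishes enqueuing, so the handler loop actually starts and can enqueue the remaining pages itself.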
