@@ -216,6 +216,33 @@ describe('site-crawler', () => {
216216 const abortUpdate = calls . find ( ( [ , data ] ) => 'abortController' in data ) ;
217217 expect ( abortUpdate ) . toBeDefined ( ) ;
218218 } ) ;
219+
/**
 * Regression test: when sitemap discovery yields more URLs than `maxPages`,
 * `startCrawl` must hand the crawler's `run()` exactly `maxPages - 1` seed
 * URLs, with the originally requested URL in first position.
 */
it('caps seed URLs below maxPages to prevent crawlee premature termination', async () => {
  // Simulate a large sitemap that returns more URLs than maxPages.
  // The fixture URLs carry no trailing whitespace so they are well-formed.
  const sitemapUrls = Array.from({ length: 200 }, (_, i) => `https://example.com/page${i}`);
  vi.mocked(discoverSitemapUrls).mockResolvedValueOnce(sitemapUrls);

  // Swap the crawler constructor for this single test so the argument that
  // reaches run() can be captured and inspected after the crawl completes.
  const crawlee = await import('crawlee');
  let capturedRunArg: string[] = [];
  vi.mocked(crawlee.PlaywrightCrawler).mockImplementationOnce(function (
    this: Record<string, unknown>,
    options: Record<string, unknown>,
  ) {
    // Mirror the constructor options into the suite-level capture variable.
    _capturedOptions = options;
    this.run = vi.fn().mockImplementation((urls: string[]) => {
      capturedRunArg = urls;
      return Promise.resolve();
    });
    return this;
  });

  const config = { ...defaultConfig, maxPages: 50 };
  await startCrawl('crawl-cap', 'https://example.com', config);

  // Should pass fewer than maxPages seed URLs to prevent the
  // crawler from hitting maxRequestsPerCrawl during the enqueue phase.
  expect(capturedRunArg.length).toBeLessThan(config.maxPages);
  expect(capturedRunArg.length).toBe(config.maxPages - 1);
  // The original seed URL must always be included first.
  expect(capturedRunArg[0]).toBe('https://example.com');
});
219246 } ) ;
220247
221248 describe ( 'cancelCrawl' , ( ) => {
0 commit comments