@@ -20,7 +20,7 @@ import {
2020 RequestList ,
2121 RequestQueueV2 ,
2222} from '@crawlee/http' ;
23- import { discoverValidSitemaps , parseSitemap } from '@crawlee/utils' ;
23+ import { discoverValidSitemaps , parseSitemap , sleep } from '@crawlee/utils' ;
2424import type { ApifyEnv } from 'apify' ;
2525import { Actor } from 'apify' ;
2626
@@ -44,6 +44,7 @@ const SCHEMA = JSON.parse(
4444) ;
4545
// Batch size of 25; the consuming code is outside this excerpt — presumably the number of requests handled per batch, TODO confirm.
4646const REQUESTS_BATCH_SIZE = 25 ;
// Time limit, in milliseconds, for sitemap discovery: _initializeAsync races the discovery promise against sleep() of this duration and logs a warning if discovery has not produced a result in time.
47+ const SITEMAP_DISCOVERY_TIMEOUT_MILLIS = 30_000 ;
4748
// NOTE(review): usage not visible here — presumably a 0..1 threshold for the event-loop overload ratio; confirm against the consumer.
4849const MAX_EVENT_LOOP_OVERLOADED_RATIO = 0.9 ;
// NOTE(review): usage not visible here — presumably the storage key of a flag recording that the request queue was initialized; confirm.
4950const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED' ;
@@ -108,16 +109,29 @@ export class CrawlerSetup implements CrawlerSetupOptions {
108109 }
109110
110111 private async _initializeAsync ( ) {
111- const discoveredSitemaps = new Set (
112- await Array . fromAsync (
113- discoverValidSitemaps (
114- this . input . startUrls
115- . map ( ( x ) => x . url )
116- . filter ( ( x ) => x !== undefined ) ,
117- { proxyUrl : await this . proxyConfiguration ?. newUrl ( ) } ,
118- ) ,
112+ const discoveryPromise = Array . fromAsync (
113+ discoverValidSitemaps (
114+ this . input . startUrls
115+ . map ( ( x ) => x . url )
116+ . filter ( ( x ) => x !== undefined ) ,
117+ { proxyUrl : await this . proxyConfiguration ?. newUrl ( ) } ,
119118 ) ,
120119 ) ;
120+ const discovered = await Promise . race ( [
121+ discoveryPromise ,
122+ sleep ( SITEMAP_DISCOVERY_TIMEOUT_MILLIS ) ,
123+ ] ) ;
124+ if ( ! discovered ) {
125+ log . warning (
126+ `Sitemap discovery timed out after ${ Math . round (
127+ SITEMAP_DISCOVERY_TIMEOUT_MILLIS / 1000 ,
128+ ) } s, continuing without sitemaps.`,
129+ ) ;
130+ }
131+ const discoveredSitemaps =
132+ discovered && discovered . length > 0
133+ ? new Set ( discovered )
134+ : new Set < string > ( ) ;
121135 if ( discoveredSitemaps . size === 0 ) {
122136 throw await Actor . fail (
123137 'No valid sitemaps were discovered from the provided startUrls.' ,
0 commit comments