@@ -54,6 +54,15 @@ const gunzip = promisify(zlibGunzip);
// NOTE(review): presumably an upper bound on the share of samples in which the
// event loop may be overloaded — its consumer is outside this excerpt; confirm.
const MAX_EVENT_LOOP_OVERLOADED_RATIO = 0.9;
// NOTE(review): looks like a storage/state key marking the request queue as
// already initialized — confirm against where it is read.
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';
5656
/**
 * Outcome of a single sitemap discovery attempt.
 *
 * Both fields are optional: an attempt that hits the discovery timeout carries
 * neither, a failed attempt carries only `error`, and a completed attempt
 * carries `discovered` (which may be an empty list).
 */
type SitemapDiscoveryAttempt = {
    /** Sitemap URLs collected by the attempt, when it finished in time. */
    discovered?: string[];
    /** Whatever value was thrown while the attempt was running. */
    error?: unknown;
};
61+
/**
 * Final outcome of sitemap discovery: the fields of the attempt whose result
 * is being reported, plus a flag telling the caller whether the proxy should
 * be dropped for the remainder of the run.
 */
type SitemapDiscoveryResult = SitemapDiscoveryAttempt & {
    /** `true` when sitemaps were found only by the attempt made without a proxy. */
    disableProxyForRun: boolean;
};
65+
5766const NOOP_COOKIE_JAR = {
5867 async getCookies ( ) {
5968 return [ ] ;
@@ -184,6 +193,71 @@ export class CrawlerSetup {
184193 return anyProxy as ProxyConfiguration ;
185194 }
186195
/**
 * Returns the `url` of every start-URL request in the input, leaving out
 * requests whose `url` is `undefined`.
 */
private _getStartUrls(): string[] {
    const candidateUrls = this.input.startUrls.map(({ url }) => url);
    return candidateUrls.filter(
        (candidate): candidate is string => candidate !== undefined,
    );
}
201+
/**
 * Runs one sitemap discovery pass over `startUrls`, racing it against a
 * `SITEMAP_DISCOVERY_TIMEOUT_MILLIS` sleep.
 *
 * This method never rejects: anything thrown during the pass is returned in
 * the `error` field instead.
 *
 * @param startUrls URLs handed to `discoverValidSitemaps`.
 * @param proxyUrl Optional proxy URL forwarded to the discovery call; omitted
 *   for a direct (proxy-less) attempt.
 * @returns `{ discovered }` with the collected sitemap URLs, `{ discovered: undefined }`
 *   when the timeout won the race, or `{ error }` when the pass threw.
 */
private async _discoverSitemapsWithTimeout(
    startUrls: string[],
    proxyUrl?: string,
): Promise<SitemapDiscoveryAttempt> {
    try {
        // Drain the async sitemap iterator into an array so it can be raced as a single promise.
        const discovery = Array.fromAsync(
            discoverValidSitemaps(startUrls, {
                proxyUrl,
                httpClient: this.sitemapHttpClient,
            } as any),
        );
        const timeout = sleep(SITEMAP_DISCOVERY_TIMEOUT_MILLIS);

        // The sleep resolves with no value, so a nullish winner means "timed out".
        const winner = await Promise.race<string[] | void>([
            discovery,
            timeout,
        ]);
        return { discovered: winner ?? undefined };
    } catch (error) {
        return { error };
    }
}
223+
/**
 * Discovers sitemaps for `startUrls`, first through a proxy URL obtained from
 * `this.proxyConfiguration` (if any) and, when that yields nothing, once more
 * without a proxy.
 *
 * The proxy-less retry is a best-effort fallback:
 * - if it finds sitemaps, its result is returned with `disableProxyForRun: true`;
 * - if it throws while the proxied attempt ended without an error (empty list
 *   or timeout), the proxied result is kept so that a failing fallback cannot
 *   turn a recoverable "no sitemaps" outcome into a fatal error for the caller;
 * - otherwise the retry's result is returned as-is with `disableProxyForRun: false`.
 *
 * @param startUrls URLs handed to each discovery attempt.
 * @returns The fields of the chosen attempt plus the `disableProxyForRun` flag.
 */
private async _discoverSitemaps(
    startUrls: string[],
): Promise<SitemapDiscoveryResult> {
    const hasSitemaps = (urls?: string[]): boolean =>
        Boolean(urls && urls.length > 0);

    const discoveryProxyUrl = await this.proxyConfiguration?.newUrl();
    const proxyAttempt = await this._discoverSitemapsWithTimeout(
        startUrls,
        discoveryProxyUrl,
    );

    // Without a proxy URL the first attempt already was the direct one, so
    // there is nothing to fall back to; likewise when the proxy found sitemaps.
    if (!discoveryProxyUrl || hasSitemaps(proxyAttempt.discovered)) {
        return {
            ...proxyAttempt,
            disableProxyForRun: false,
        };
    }

    log.warning(
        'Sitemap discovery through proxy failed or returned no sitemaps. Retrying once without proxy.',
    );

    const noProxyAttempt =
        await this._discoverSitemapsWithTimeout(startUrls);

    // A failed fallback must not replace a proxied attempt that ended cleanly:
    // the caller rethrows `error`, which would abort a run that could simply
    // continue without sitemaps.
    if (noProxyAttempt.error && !proxyAttempt.error) {
        const reason =
            noProxyAttempt.error instanceof Error
                ? noProxyAttempt.error.message
                : String(noProxyAttempt.error);
        log.warning(
            `Sitemap discovery without proxy failed (${reason}). Keeping the result of the proxied attempt.`,
        );
        return {
            ...proxyAttempt,
            disableProxyForRun: false,
        };
    }

    return {
        ...noProxyAttempt,
        disableProxyForRun: hasSitemaps(noProxyAttempt.discovered),
    };
}
260+
187261 private async _initializeAsync ( ) {
188262 // Proxy configuration
189263 const proxyConfiguration = ( await Actor . createProxyConfiguration (
@@ -192,28 +266,32 @@ export class CrawlerSetup {
192266 this . proxyConfiguration =
193267 this . _wrapProxyConfiguration ( proxyConfiguration ) ;
194268
195- const discoveryPromise = Array . fromAsync (
196- discoverValidSitemaps (
197- this . input . startUrls
198- . map ( ( x ) => x . url )
199- . filter ( ( x ) => x !== undefined ) ,
200- {
201- proxyUrl : await this . proxyConfiguration ?. newUrl ( ) ,
202- httpClient : this . sitemapHttpClient ,
203- } as any ,
204- ) ,
205- ) ;
206- const discovered = await Promise . race < string [ ] | void > ( [
207- discoveryPromise ,
208- sleep ( SITEMAP_DISCOVERY_TIMEOUT_MILLIS ) ,
209- ] ) ;
210- if ( ! discovered ) {
269+ const startUrls = this . _getStartUrls ( ) ;
270+ const {
271+ discovered,
272+ error : discoveryError ,
273+ disableProxyForRun,
274+ } = await this . _discoverSitemaps ( startUrls ) ;
275+
276+ if ( disableProxyForRun ) {
277+ log . warning (
278+ 'Sitemap discovery succeeded only without proxy. Disabling proxy for the rest of this run.' ,
279+ ) ;
280+ this . proxyConfiguration = undefined ;
281+ }
282+
283+ if ( ! discovered && ! discoveryError ) {
211284 log . warning (
212285 `Sitemap discovery timed out after ${ Math . round (
213286 SITEMAP_DISCOVERY_TIMEOUT_MILLIS / 1000 ,
214- ) } s, continuing without sitemaps .`,
287+ ) } s.`,
215288 ) ;
216289 }
290+
291+ if ( discoveryError ) {
292+ throw discoveryError ;
293+ }
294+
217295 const discoveredSitemaps =
218296 discovered && discovered . length > 0
219297 ? new Set ( discovered )
0 commit comments