Skip to content

Commit bff9c69

Browse files
Merge pull request #242 from apify/fix/retry-discover-sitemaps-wo-proxy
fix(sitemap-extractor): retry discovery once without proxy
2 parents 6e3ea98 + bfd6f1c commit bff9c69

1 file changed

Lines changed: 95 additions & 17 deletions

File tree

packages/actor-scraper/sitemap-scraper/src/internals/crawler_setup.ts

Lines changed: 95 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -54,6 +54,15 @@ const gunzip = promisify(zlibGunzip);
5454
const MAX_EVENT_LOOP_OVERLOADED_RATIO = 0.9;
5555
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';
5656

57+
/**
 * Outcome of a single sitemap discovery attempt.
 *
 * Both fields are optional: an attempt may carry discovered URLs, an error,
 * or neither.
 */
interface SitemapDiscoveryAttempt {
    /** Sitemap URLs collected by the attempt, when it produced any result. */
    discovered?: string[];
    /** Value caught while the attempt was running, if it threw. */
    error?: unknown;
}

/**
 * Outcome of the overall discovery procedure: the fields of one attempt plus
 * a flag telling the caller whether to drop the proxy for the rest of the run.
 */
interface SitemapDiscoveryResult extends SitemapDiscoveryAttempt {
    disableProxyForRun: boolean;
}
};
65+
5766
const NOOP_COOKIE_JAR = {
5867
async getCookies() {
5968
return [];
@@ -184,6 +193,71 @@ export class CrawlerSetup {
184193
return anyProxy as ProxyConfiguration;
185194
}
186195

196+
/**
 * Collects the `url` of every entry in `this.input.startUrls`, skipping
 * entries whose `url` is `undefined`. Input order is preserved.
 */
private _getStartUrls() {
    const definedUrls: string[] = [];
    for (const startRequest of this.input.startUrls) {
        const candidate = startRequest.url;
        if (candidate !== undefined) {
            definedUrls.push(candidate);
        }
    }
    return definedUrls;
}
201+
202+
/**
 * Runs one sitemap discovery attempt, racing it against a sleep of
 * `SITEMAP_DISCOVERY_TIMEOUT_MILLIS`.
 *
 * The returned promise does not reject: anything thrown inside the attempt
 * is captured and handed back in the `error` field.
 *
 * @param startUrls URLs forwarded to `discoverValidSitemaps`.
 * @param proxyUrl Optional proxy URL forwarded in the discovery options.
 * @returns `{ discovered }` holding the collected sitemap URLs, or
 *   `{ discovered: undefined }` when the race settles without a value
 *   (presumably the sleep finishing first), or `{ error }` when something
 *   threw.
 */
private async _discoverSitemapsWithTimeout(
    startUrls: string[],
    proxyUrl?: string,
): Promise<SitemapDiscoveryAttempt> {
    try {
        const discoveryOptions = {
            proxyUrl,
            httpClient: this.sitemapHttpClient,
        } as any;
        const collectedSitemaps = Array.fromAsync(
            discoverValidSitemaps(startUrls, discoveryOptions),
        );
        const timeout = sleep(SITEMAP_DISCOVERY_TIMEOUT_MILLIS);

        const raceOutcome = await Promise.race<string[] | void>([
            collectedSitemaps,
            timeout,
        ]);
        // Normalise a missing race value to `undefined`.
        return { discovered: raceOutcome ?? undefined };
    } catch (caught) {
        return { error: caught };
    }
}
223+
224+
/**
 * Discovers sitemaps for the start URLs, first through a proxy URL obtained
 * from `this.proxyConfiguration` (when there is one) and, if that attempt
 * errored or yielded no sitemaps, once more without a proxy.
 *
 * The no-proxy retry is best-effort: it only replaces the proxied outcome
 * when it actually improves on it. If the retry throws while the proxied
 * attempt ended without an error (empty list or timeout), the proxied outcome
 * is returned, so a failed fallback cannot turn a clean run into an error.
 *
 * @param startUrls URLs to run sitemap discovery for.
 * @returns The chosen attempt's fields plus `disableProxyForRun`, which is
 *   `true` only when sitemaps were found exclusively by the no-proxy retry.
 */
private async _discoverSitemaps(
    startUrls: string[],
): Promise<SitemapDiscoveryResult> {
    const foundSitemaps = (attempt: SitemapDiscoveryAttempt): boolean =>
        attempt.discovered !== undefined && attempt.discovered.length > 0;

    const discoveryProxyUrl = await this.proxyConfiguration?.newUrl();
    const proxyAttempt = await this._discoverSitemapsWithTimeout(
        startUrls,
        discoveryProxyUrl,
    );

    // A retry only makes sense when a proxy was actually used and the
    // proxied attempt either failed or came back empty-handed.
    const proxyDiscoveryFailed =
        Boolean(discoveryProxyUrl) &&
        (Boolean(proxyAttempt.error) || !foundSitemaps(proxyAttempt));

    if (!proxyDiscoveryFailed) {
        return {
            ...proxyAttempt,
            disableProxyForRun: false,
        };
    }

    log.warning(
        'Sitemap discovery through proxy failed or returned no sitemaps. Retrying once without proxy.',
    );

    const noProxyAttempt =
        await this._discoverSitemapsWithTimeout(startUrls);

    if (foundSitemaps(noProxyAttempt)) {
        return {
            ...noProxyAttempt,
            disableProxyForRun: true,
        };
    }

    if (noProxyAttempt.error && !proxyAttempt.error) {
        // The fallback itself failed, but the proxied attempt finished
        // cleanly (no sitemaps or timeout). Keep the proxied outcome rather
        // than surfacing an error from the best-effort retry.
        const { error: retryError } = noProxyAttempt;
        log.warning(
            'Sitemap discovery retry without proxy failed. Keeping the result of the proxied attempt.',
            {
                error:
                    retryError instanceof Error
                        ? retryError.message
                        : String(retryError),
            },
        );
        return {
            ...proxyAttempt,
            disableProxyForRun: false,
        };
    }

    return {
        ...noProxyAttempt,
        disableProxyForRun: false,
    };
}
260+
187261
private async _initializeAsync() {
188262
// Proxy configuration
189263
const proxyConfiguration = (await Actor.createProxyConfiguration(
@@ -192,28 +266,32 @@ export class CrawlerSetup {
192266
this.proxyConfiguration =
193267
this._wrapProxyConfiguration(proxyConfiguration);
194268

195-
const discoveryPromise = Array.fromAsync(
196-
discoverValidSitemaps(
197-
this.input.startUrls
198-
.map((x) => x.url)
199-
.filter((x) => x !== undefined),
200-
{
201-
proxyUrl: await this.proxyConfiguration?.newUrl(),
202-
httpClient: this.sitemapHttpClient,
203-
} as any,
204-
),
205-
);
206-
const discovered = await Promise.race<string[] | void>([
207-
discoveryPromise,
208-
sleep(SITEMAP_DISCOVERY_TIMEOUT_MILLIS),
209-
]);
210-
if (!discovered) {
269+
const startUrls = this._getStartUrls();
270+
const {
271+
discovered,
272+
error: discoveryError,
273+
disableProxyForRun,
274+
} = await this._discoverSitemaps(startUrls);
275+
276+
if (disableProxyForRun) {
277+
log.warning(
278+
'Sitemap discovery succeeded only without proxy. Disabling proxy for the rest of this run.',
279+
);
280+
this.proxyConfiguration = undefined;
281+
}
282+
283+
if (!discovered && !discoveryError) {
211284
log.warning(
212285
`Sitemap discovery timed out after ${Math.round(
213286
SITEMAP_DISCOVERY_TIMEOUT_MILLIS / 1000,
214-
)}s, continuing without sitemaps.`,
287+
)}s.`,
215288
);
216289
}
290+
291+
if (discoveryError) {
292+
throw discoveryError;
293+
}
294+
217295
const discoveredSitemaps =
218296
discovered && discovered.length > 0
219297
? new Set(discovered)

0 commit comments

Comments
 (0)