Skip to content

Commit d374eba

Browse files
Merge pull request #230 from apify/fix/timeout-for-discoverysitemaps
fix: add timeout to discoverValidSitemaps
2 parents 56ac23d + ad49229 commit d374eba

1 file changed

Lines changed: 23 additions & 9 deletions

File tree

packages/actor-scraper/sitemap-scraper/src/internals/crawler_setup.ts

Lines changed: 23 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ import {
2020
RequestList,
2121
RequestQueueV2,
2222
} from '@crawlee/http';
23-
import { discoverValidSitemaps, parseSitemap } from '@crawlee/utils';
23+
import { discoverValidSitemaps, parseSitemap, sleep } from '@crawlee/utils';
2424
import type { ApifyEnv } from 'apify';
2525
import { Actor } from 'apify';
2626

@@ -44,6 +44,7 @@ const SCHEMA = JSON.parse(
4444
);
4545

4646
const REQUESTS_BATCH_SIZE = 25;
47+
const SITEMAP_DISCOVERY_TIMEOUT_MILLIS = 30_000;
4748

4849
const MAX_EVENT_LOOP_OVERLOADED_RATIO = 0.9;
4950
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';
@@ -108,16 +109,29 @@ export class CrawlerSetup implements CrawlerSetupOptions {
108109
}
109110

110111
private async _initializeAsync() {
111-
const discoveredSitemaps = new Set(
112-
await Array.fromAsync(
113-
discoverValidSitemaps(
114-
this.input.startUrls
115-
.map((x) => x.url)
116-
.filter((x) => x !== undefined),
117-
{ proxyUrl: await this.proxyConfiguration?.newUrl() },
118-
),
112+
const discoveryPromise = Array.fromAsync(
113+
discoverValidSitemaps(
114+
this.input.startUrls
115+
.map((x) => x.url)
116+
.filter((x) => x !== undefined),
117+
{ proxyUrl: await this.proxyConfiguration?.newUrl() },
119118
),
120119
);
120+
const discovered = await Promise.race([
121+
discoveryPromise,
122+
sleep(SITEMAP_DISCOVERY_TIMEOUT_MILLIS),
123+
]);
124+
if (!discovered) {
125+
log.warning(
126+
`Sitemap discovery timed out after ${Math.round(
127+
SITEMAP_DISCOVERY_TIMEOUT_MILLIS / 1000,
128+
)}s, continuing without sitemaps.`,
129+
);
130+
}
131+
const discoveredSitemaps =
132+
discovered && discovered.length > 0
133+
? new Set(discovered)
134+
: new Set<string>();
121135
if (discoveredSitemaps.size === 0) {
122136
throw await Actor.fail(
123137
'No valid sitemaps were discovered from the provided startUrls.',

0 commit comments

Comments
 (0)