Commit 1d2528b

fix: honor custom respectRobotsTxtFile userAgent in enqueueLinks (apify#3578)
1 parent c0b9b50 commit 1d2528b

4 files changed

Lines changed: 146 additions & 3 deletions

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 1 addition & 0 deletions
@@ -1734,6 +1734,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
         return enqueueLinks({
             requestQueue,
             robotsTxtFile: await this.getRobotsTxtFileForUrl(request!.url),
+            respectRobotsTxtFile: this.respectRobotsTxtFile,
             onSkippedRequest,
             limit: this.calculateEnqueuedRequestLimit(options.limit),

packages/core/src/enqueue_links/enqueue_links.ts

Lines changed: 12 additions & 2 deletions
@@ -182,6 +182,13 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
      */
     robotsTxtFile?: Pick<RobotsTxtFile, 'isAllowed'>;
 
+    /**
+     * Mirrors {@apilink BasicCrawlerOptions.respectRobotsTxtFile}: pass `false` to disable filtering or
+     * `{ userAgent }` to evaluate rules for a specific user-agent. Defaults to `*` when
+     * {@apilink EnqueueLinksOptions.robotsTxtFile|`robotsTxtFile`} is provided.
+     */
+    respectRobotsTxtFile?: boolean | { userAgent?: string };
+
     /**
      * When a request is skipped for some reason, you can use this callback to act on it.
      * This is currently fired for requests skipped
@@ -296,6 +303,7 @@ export async function enqueueLinks(
             urls: ow.array.ofType(ow.string),
             requestQueue: ow.object.hasKeys('addRequestsBatched'),
             robotsTxtFile: ow.optional.object.hasKeys('isAllowed'),
+            respectRobotsTxtFile: ow.optional.any(ow.boolean, ow.object.exactShape({ userAgent: ow.optional.string })),
             onSkippedRequest: ow.optional.function,
             forefront: ow.optional.boolean,
             skipNavigation: ow.optional.boolean,
@@ -422,11 +430,13 @@ export async function enqueueLinks(
 
     let requestOptions = createRequestOptions(urls, options);
 
-    if (robotsTxtFile) {
+    if (robotsTxtFile && options.respectRobotsTxtFile !== false) {
+        const robotsUserAgent =
+            typeof options.respectRobotsTxtFile === 'object' ? (options.respectRobotsTxtFile.userAgent ?? '*') : '*';
         const skippedRequests: RequestOptions[] = [];
 
         requestOptions = requestOptions.filter((request) => {
-            if (robotsTxtFile.isAllowed(request.url)) {
+            if (robotsTxtFile.isAllowed(request.url, robotsUserAgent)) {
                 return true;
             }
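
The same option is accepted by the standalone enqueueLinks() exported from @crawlee/core, alongside robotsTxtFile. A short sketch, assuming a request queue is already open; the robots.txt rules and URLs are illustrative and mirror the tests below:

import { enqueueLinks, RequestQueue } from '@crawlee/core';
import { RobotsTxtFile } from '@crawlee/utils';

const requestQueue = await RequestQueue.open();

// Everything is disallowed for '*' except /yes; 'MyCrawler' gets its own group.
const robotsTxtFile = RobotsTxtFile.from(
    'https://example.com/robots.txt',
    `User-agent: *
Disallow: /
Allow: /yes

User-agent: MyCrawler
Disallow: /no
Allow: /my-crawler
`,
);

// Rules are evaluated for 'MyCrawler', so only the first URL is enqueued here.
await enqueueLinks({
    urls: ['https://example.com/my-crawler/page', 'https://example.com/no'],
    requestQueue,
    robotsTxtFile,
    respectRobotsTxtFile: { userAgent: 'MyCrawler' },
});

// Passing `false` disables the filtering entirely, even though robotsTxtFile is provided.
await enqueueLinks({
    urls: ['https://example.com/no'],
    requestQueue,
    robotsTxtFile,
    respectRobotsTxtFile: false,
});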

test/core/crawlers/basic_crawler.test.ts

Lines changed: 73 additions & 0 deletions
@@ -1722,6 +1722,79 @@ describe('BasicCrawler', () => {
         expect(addRequestsBatchedSpy).toHaveBeenCalledOnce();
     });
 
+    test('enqueueLinks should respect custom user-agent robots.txt rules', async () => {
+        const requestQueue = await RequestQueue.open();
+        const visitedUrls: string[] = [];
+
+        const crawler = new (class MockedRobotsTxtCrawler extends BasicCrawler {
+            override async getRobotsTxtFileForUrl(_: string) {
+                return RobotsTxtFile.from(
+                    'http://example.com/robots.txt',
+                    `User-agent: *
+Disallow: /
+Allow: /yes
+
+User-agent: MyCrawler
+Disallow: /no
+Allow: /my-crawler
+`,
+                );
+            }
+        })({
+            requestQueue,
+            maxConcurrency: 1,
+            respectRobotsTxtFile: { userAgent: 'MyCrawler' },
+            requestHandler: async (context) => {
+                visitedUrls.push(context.request.url);
+
+                if (context.request.label) {
+                    return;
+                }
+
+                await context.enqueueLinks({
+                    urls: [
+                        'http://example.com/yes',
+                        'http://example.com/no',
+                        'http://example.com/no-globally',
+                        'http://example.com/my-crawler/anything',
+                    ],
+                    label: 'child',
+                });
+            },
+        });
+
+        await crawler.run(['http://example.com/start']);
+
+        expect(visitedUrls).toEqual([
+            'http://example.com/start',
+            'http://example.com/yes',
+            'http://example.com/my-crawler/anything',
+        ]);
+    });
+
+    test('enqueueLinks forwards respectRobotsTxtFile.userAgent to the robots.txt check', async () => {
+        const requestQueue = await RequestQueue.open();
+        const isAllowedSpy = vitest.fn(() => true);
+
+        const crawler = new (class MockedRobotsTxtCrawler extends BasicCrawler {
+            override async getRobotsTxtFileForUrl(_: string) {
+                return { isAllowed: isAllowedSpy } as unknown as RobotsTxtFile;
+            }
+        })({
+            requestQueue,
+            maxConcurrency: 1,
+            respectRobotsTxtFile: { userAgent: 'MyCrawler' },
+            requestHandler: async (context) => {
+                if (context.request.label) return;
+                await context.enqueueLinks({ urls: ['http://example.com/child'], label: 'child' });
+            },
+        });
+
+        await crawler.run(['http://example.com/start']);
+
+        expect(isAllowedSpy).toHaveBeenCalledWith('http://example.com/child', 'MyCrawler');
+    });
+
     test('enqueueLinks should respect maxRequestsPerCrawl', async () => {
         const requestQueue = await RequestQueue.open();
         const addRequestsBatchedSpy = vitest.spyOn(requestQueue, 'addRequestsBatched');

test/core/enqueue_links/enqueue_links.test.ts

Lines changed: 60 additions & 1 deletion
@@ -1,4 +1,5 @@
 import { type AddRequestsBatchedOptions, cheerioCrawlerEnqueueLinks } from '@crawlee/cheerio';
+import { enqueueLinks } from '@crawlee/core';
 import { launchPlaywright } from '@crawlee/playwright';
 import type { RequestQueueOperationOptions, Source } from '@crawlee/puppeteer';
 import {
@@ -8,7 +9,7 @@ import {
     launchPuppeteer,
     RequestQueue,
 } from '@crawlee/puppeteer';
-import { type CheerioRoot } from '@crawlee/utils';
+import { type CheerioRoot, RobotsTxtFile } from '@crawlee/utils';
 import { load } from 'cheerio';
 import type { Browser as PlaywrightBrowser, Page as PlaywrightPage } from 'playwright';
 import type { Browser as PuppeteerBrowser, Page as PuppeteerPage } from 'puppeteer';
@@ -1027,4 +1028,62 @@ describe('enqueueLinks()', () => {
             }
         });
     });
+
+    describe('respectRobotsTxtFile option', () => {
+        const robotsTxtFile = RobotsTxtFile.from(
+            'http://example.com/robots.txt',
+            `User-agent: *
+Disallow: /
+Allow: /yes
+
+User-agent: MyCrawler
+Disallow: /no
+Allow: /my-crawler
+`,
+        );
+
+        const urls = [
+            'http://example.com/yes',
+            'http://example.com/no',
+            'http://example.com/no-globally',
+            'http://example.com/my-crawler/anything',
+        ];
+
+        test('defaults to the catch-all user-agent when not provided', async () => {
+            const { enqueued, requestQueue } = createRequestQueueMock();
+
+            await enqueueLinks({ urls, requestQueue, robotsTxtFile });
+
+            expect(enqueued.map((r) => r.url)).toEqual(['http://example.com/yes']);
+        });
+
+        test('applies rules for the configured user-agent', async () => {
+            const { enqueued, requestQueue } = createRequestQueueMock();
+
+            await enqueueLinks({
+                urls,
+                requestQueue,
+                robotsTxtFile,
+                respectRobotsTxtFile: { userAgent: 'MyCrawler' },
+            });
+
+            expect(enqueued.map((r) => r.url)).toEqual([
+                'http://example.com/yes',
+                'http://example.com/my-crawler/anything',
+            ]);
+        });
+
+        test('skips filtering when set to false even if robotsTxtFile is provided', async () => {
+            const { enqueued, requestQueue } = createRequestQueueMock();
+
+            await enqueueLinks({
+                urls,
+                requestQueue,
+                robotsTxtFile,
+                respectRobotsTxtFile: false,
+            });
+
+            expect(enqueued.map((r) => r.url)).toEqual(urls);
+        });
+    });
 });
