diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts
index 850dda046f97..54fd7ef66f3a 100644
--- a/packages/basic-crawler/src/internals/basic-crawler.ts
+++ b/packages/basic-crawler/src/internals/basic-crawler.ts
@@ -1734,6 +1734,7 @@ export class BasicCrawler;
+    /**
+     * Mirrors {@apilink BasicCrawlerOptions.respectRobotsTxtFile}: pass `false` to disable filtering or
+     * `{ userAgent }` to evaluate rules for a specific user-agent. Defaults to `*` when
+     * {@apilink EnqueueLinksOptions.robotsTxtFile|`robotsTxtFile`} is provided.
+     */
+    respectRobotsTxtFile?: boolean | { userAgent?: string };
+
     /**
      * When a request is skipped for some reason, you can use this callback to act on it.
      * This is currently fired for requests skipped
@@ -296,6 +303,7 @@ export async function enqueueLinks(
         urls: ow.array.ofType(ow.string),
         requestQueue: ow.object.hasKeys('addRequestsBatched'),
         robotsTxtFile: ow.optional.object.hasKeys('isAllowed'),
+        respectRobotsTxtFile: ow.optional.any(ow.boolean, ow.object.exactShape({ userAgent: ow.optional.string })),
         onSkippedRequest: ow.optional.function,
         forefront: ow.optional.boolean,
         skipNavigation: ow.optional.boolean,
@@ -422,11 +430,13 @@ export async function enqueueLinks(
     let requestOptions = createRequestOptions(urls, options);
 
-    if (robotsTxtFile) {
+    if (robotsTxtFile && options.respectRobotsTxtFile !== false) {
+        const robotsUserAgent =
+            typeof options.respectRobotsTxtFile === 'object' ? (options.respectRobotsTxtFile.userAgent ?? '*') : '*';
         const skippedRequests: RequestOptions[] = [];
 
         requestOptions = requestOptions.filter((request) => {
-            if (robotsTxtFile.isAllowed(request.url)) {
+            if (robotsTxtFile.isAllowed(request.url, robotsUserAgent)) {
                 return true;
             }
diff --git a/test/core/crawlers/basic_crawler.test.ts b/test/core/crawlers/basic_crawler.test.ts
index 16410e298f50..57323e591fa9 100644
--- a/test/core/crawlers/basic_crawler.test.ts
+++ b/test/core/crawlers/basic_crawler.test.ts
@@ -1722,6 +1722,79 @@ describe('BasicCrawler', () => {
         expect(addRequestsBatchedSpy).toHaveBeenCalledOnce();
     });
 
+    test('enqueueLinks should respect custom user-agent robots.txt rules', async () => {
+        const requestQueue = await RequestQueue.open();
+        const visitedUrls: string[] = [];
+
+        const crawler = new (class MockedRobotsTxtCrawler extends BasicCrawler {
+            override async getRobotsTxtFileForUrl(_: string) {
+                return RobotsTxtFile.from(
+                    'http://example.com/robots.txt',
+                    `User-agent: *
+                    Disallow: /
+                    Allow: /yes
+
+                    User-agent: MyCrawler
+                    Disallow: /no
+                    Allow: /my-crawler
+                    `,
+                );
+            }
+        })({
+            requestQueue,
+            maxConcurrency: 1,
+            respectRobotsTxtFile: { userAgent: 'MyCrawler' },
+            requestHandler: async (context) => {
+                visitedUrls.push(context.request.url);
+
+                if (context.request.label) {
+                    return;
+                }
+
+                await context.enqueueLinks({
+                    urls: [
+                        'http://example.com/yes',
+                        'http://example.com/no',
+                        'http://example.com/no-globally',
+                        'http://example.com/my-crawler/anything',
+                    ],
+                    label: 'child',
+                });
+            },
+        });
+
+        await crawler.run(['http://example.com/start']);
+
+        expect(visitedUrls).toEqual([
+            'http://example.com/start',
+            'http://example.com/yes',
+            'http://example.com/my-crawler/anything',
+        ]);
+    });
+
+    test('enqueueLinks forwards respectRobotsTxtFile.userAgent to the robots.txt check', async () => {
+        const requestQueue = await RequestQueue.open();
+        const isAllowedSpy = vitest.fn(() => true);
+
+        const crawler = new (class MockedRobotsTxtCrawler extends BasicCrawler {
+            override async getRobotsTxtFileForUrl(_: string) {
+                return { isAllowed: isAllowedSpy } as unknown as RobotsTxtFile;
+            }
+        })({
+            requestQueue,
+            maxConcurrency: 1,
+            respectRobotsTxtFile: { userAgent: 'MyCrawler' },
+            requestHandler: async (context) => {
+                if (context.request.label) return;
+                await context.enqueueLinks({ urls: ['http://example.com/child'], label: 'child' });
+            },
+        });
+
+        await crawler.run(['http://example.com/start']);
+
+        expect(isAllowedSpy).toHaveBeenCalledWith('http://example.com/child', 'MyCrawler');
+    });
+
     test('enqueueLinks should respect maxRequestsPerCrawl', async () => {
         const requestQueue = await RequestQueue.open();
         const addRequestsBatchedSpy = vitest.spyOn(requestQueue, 'addRequestsBatched');
diff --git a/test/core/enqueue_links/enqueue_links.test.ts b/test/core/enqueue_links/enqueue_links.test.ts
index cede47fec3c6..f16589718d52 100644
--- a/test/core/enqueue_links/enqueue_links.test.ts
+++ b/test/core/enqueue_links/enqueue_links.test.ts
@@ -1,4 +1,5 @@
 import { type AddRequestsBatchedOptions, cheerioCrawlerEnqueueLinks } from '@crawlee/cheerio';
+import { enqueueLinks } from '@crawlee/core';
 import { launchPlaywright } from '@crawlee/playwright';
 import type { RequestQueueOperationOptions, Source } from '@crawlee/puppeteer';
 import {
@@ -8,7 +9,7 @@ import {
     launchPuppeteer,
     RequestQueue,
 } from '@crawlee/puppeteer';
-import { type CheerioRoot } from '@crawlee/utils';
+import { type CheerioRoot, RobotsTxtFile } from '@crawlee/utils';
 import { load } from 'cheerio';
 import type { Browser as PlaywrightBrowser, Page as PlaywrightPage } from 'playwright';
 import type { Browser as PuppeteerBrowser, Page as PuppeteerPage } from 'puppeteer';
@@ -1027,4 +1028,62 @@ describe('enqueueLinks()', () => {
             }
         });
     });
+
+    describe('respectRobotsTxtFile option', () => {
+        const robotsTxtFile = RobotsTxtFile.from(
+            'http://example.com/robots.txt',
+            `User-agent: *
+            Disallow: /
+            Allow: /yes
+
+            User-agent: MyCrawler
+            Disallow: /no
+            Allow: /my-crawler
+            `,
+        );
+
+        const urls = [
+            'http://example.com/yes',
+            'http://example.com/no',
+            'http://example.com/no-globally',
+            'http://example.com/my-crawler/anything',
+        ];
+
+        test('defaults to the catch-all user-agent when not provided', async () => {
+            const { enqueued, requestQueue } = createRequestQueueMock();
+
+            await enqueueLinks({ urls, requestQueue, robotsTxtFile });
+
+            expect(enqueued.map((r) => r.url)).toEqual(['http://example.com/yes']);
+        });
+
+        test('applies rules for the configured user-agent', async () => {
+            const { enqueued, requestQueue } = createRequestQueueMock();
+
+            await enqueueLinks({
+                urls,
+                requestQueue,
+                robotsTxtFile,
+                respectRobotsTxtFile: { userAgent: 'MyCrawler' },
+            });
+
+            expect(enqueued.map((r) => r.url)).toEqual([
+                'http://example.com/yes',
+                'http://example.com/my-crawler/anything',
+            ]);
+        });
+
+        test('skips filtering when set to false even if robotsTxtFile is provided', async () => {
+            const { enqueued, requestQueue } = createRequestQueueMock();
+
+            await enqueueLinks({
+                urls,
+                requestQueue,
+                robotsTxtFile,
+                respectRobotsTxtFile: false,
+            });
+
+            expect(enqueued.map((r) => r.url)).toEqual(urls);
+        });
+    });
 });
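
Usage sketch (not part of the patch): a minimal example of calling the standalone enqueueLinks helper with the new option, assuming only the @crawlee/core and @crawlee/utils exports already exercised by the tests above; the robots.txt rules and URLs are illustrative placeholders.

import { enqueueLinks, RequestQueue } from '@crawlee/core';
import { RobotsTxtFile } from '@crawlee/utils';

// Illustrative robots.txt with a catch-all group and a MyCrawler-specific group,
// mirroring the rules used in the tests above.
const robotsTxtFile = RobotsTxtFile.from(
    'http://example.com/robots.txt',
    `User-agent: *
Disallow: /
Allow: /yes

User-agent: MyCrawler
Disallow: /no
Allow: /my-crawler
`,
);

const requestQueue = await RequestQueue.open();

// Evaluate the robots.txt rules for the 'MyCrawler' user-agent instead of the default '*'.
await enqueueLinks({
    requestQueue,
    robotsTxtFile,
    respectRobotsTxtFile: { userAgent: 'MyCrawler' },
    urls: ['http://example.com/yes', 'http://example.com/no', 'http://example.com/my-crawler/anything'],
});

// Or opt out of robots.txt filtering entirely, even though robotsTxtFile is provided.
await enqueueLinks({
    requestQueue,
    robotsTxtFile,
    respectRobotsTxtFile: false,
    urls: ['http://example.com/no-globally'],
});

Crawler subclasses that set respectRobotsTxtFile: { userAgent } in their options get the same behaviour through the contextual enqueueLinks, as covered by the basic_crawler.test.ts cases above.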