1 change: 1 addition & 0 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -1734,6 +1734,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
         return enqueueLinks({
             requestQueue,
             robotsTxtFile: await this.getRobotsTxtFileForUrl(request!.url),
+            respectRobotsTxtFile: this.respectRobotsTxtFile,
             onSkippedRequest,
             limit: this.calculateEnqueuedRequestLimit(options.limit),

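In effect, this forwards the crawler-level `respectRobotsTxtFile` setting into the contextual `enqueueLinks()` call, so user-agent-specific robots.txt rules configured on the crawler now apply to enqueued links as well. A hypothetical configuration sketch (not part of the diff; the URLs are illustrative):

```ts
import { BasicCrawler } from '@crawlee/basic-crawler';

const crawler = new BasicCrawler({
    // Crawler-level setting, now forwarded to contextual enqueueLinks() calls.
    respectRobotsTxtFile: { userAgent: 'MyCrawler' },
    async requestHandler({ enqueueLinks }) {
        // Candidate links are filtered against the robots.txt rules for
        // `MyCrawler`, using the robots.txt file fetched for this request's origin.
        await enqueueLinks({ urls: ['http://example.com/next'] });
    },
});
```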
14 changes: 12 additions & 2 deletions packages/core/src/enqueue_links/enqueue_links.ts
@@ -182,6 +182,13 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
      */
     robotsTxtFile?: Pick<RobotsTxtFile, 'isAllowed'>;

+    /**
+     * Mirrors {@apilink BasicCrawlerOptions.respectRobotsTxtFile}: pass `false` to disable the robots.txt
+     * filtering, or `{ userAgent }` to evaluate the rules for a specific user-agent. The user-agent defaults
+     * to `*` when {@apilink EnqueueLinksOptions.robotsTxtFile|`robotsTxtFile`} is provided.
+     */
+    respectRobotsTxtFile?: boolean | { userAgent?: string };
+
     /**
      * When a request is skipped for some reason, you can use this callback to act on it.
      * This is currently fired for requests skipped
@@ -296,6 +303,7 @@ export async function enqueueLinks(
         urls: ow.array.ofType(ow.string),
         requestQueue: ow.object.hasKeys('addRequestsBatched'),
         robotsTxtFile: ow.optional.object.hasKeys('isAllowed'),
+        respectRobotsTxtFile: ow.optional.any(ow.boolean, ow.object.exactShape({ userAgent: ow.optional.string })),
         onSkippedRequest: ow.optional.function,
         forefront: ow.optional.boolean,
         skipNavigation: ow.optional.boolean,
@@ -422,11 +430,13 @@ export async function enqueueLinks(

     let requestOptions = createRequestOptions(urls, options);

-    if (robotsTxtFile) {
+    if (robotsTxtFile && options.respectRobotsTxtFile !== false) {
+        const robotsUserAgent =
+            typeof options.respectRobotsTxtFile === 'object' ? (options.respectRobotsTxtFile.userAgent ?? '*') : '*';
         const skippedRequests: RequestOptions[] = [];

         requestOptions = requestOptions.filter((request) => {
-            if (robotsTxtFile.isAllowed(request.url)) {
+            if (robotsTxtFile.isAllowed(request.url, robotsUserAgent)) {
                 return true;
             }

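For orientation, a hypothetical usage sketch of the standalone `enqueueLinks()` call with the new option (based only on the API shown in this diff; the robots.txt contents and URLs are illustrative):

```ts
import { enqueueLinks, RequestQueue } from '@crawlee/core';
import { RobotsTxtFile } from '@crawlee/utils';

const requestQueue = await RequestQueue.open();
const robotsTxtFile = RobotsTxtFile.from(
    'http://example.com/robots.txt',
    'User-agent: *\nDisallow: /private\n',
);

// Evaluate the rules for a specific user-agent instead of the default `*`.
await enqueueLinks({
    urls: ['http://example.com/private/page'],
    requestQueue,
    robotsTxtFile,
    respectRobotsTxtFile: { userAgent: 'MyCrawler' },
});

// Pass `false` to skip robots.txt filtering even though robotsTxtFile is provided.
await enqueueLinks({
    urls: ['http://example.com/private/page'],
    requestQueue,
    robotsTxtFile,
    respectRobotsTxtFile: false,
});
```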
73 changes: 73 additions & 0 deletions test/core/crawlers/basic_crawler.test.ts
@@ -1722,6 +1722,79 @@ describe('BasicCrawler', () => {
         expect(addRequestsBatchedSpy).toHaveBeenCalledOnce();
     });

+    test('enqueueLinks should respect custom user-agent robots.txt rules', async () => {
+        const requestQueue = await RequestQueue.open();
+        const visitedUrls: string[] = [];
+
+        const crawler = new (class MockedRobotsTxtCrawler extends BasicCrawler {
+            override async getRobotsTxtFileForUrl(_: string) {
+                return RobotsTxtFile.from(
+                    'http://example.com/robots.txt',
+                    `User-agent: *
+Disallow: /
+Allow: /yes
+
+User-agent: MyCrawler
+Disallow: /no
+Allow: /my-crawler
+`,
+                );
+            }
+        })({
+            requestQueue,
+            maxConcurrency: 1,
+            respectRobotsTxtFile: { userAgent: 'MyCrawler' },
+            requestHandler: async (context) => {
+                visitedUrls.push(context.request.url);
+
+                if (context.request.label) {
+                    return;
+                }
+
+                await context.enqueueLinks({
+                    urls: [
+                        'http://example.com/yes',
+                        'http://example.com/no',
+                        'http://example.com/no-globally',
+                        'http://example.com/my-crawler/anything',
+                    ],
+                    label: 'child',
+                });
+            },
+        });
+
+        await crawler.run(['http://example.com/start']);
+
+        expect(visitedUrls).toEqual([
+            'http://example.com/start',
+            'http://example.com/yes',
+            'http://example.com/my-crawler/anything',
+        ]);
+    });
+
+    test('enqueueLinks forwards respectRobotsTxtFile.userAgent to the robots.txt check', async () => {
+        const requestQueue = await RequestQueue.open();
+        const isAllowedSpy = vitest.fn(() => true);
+
+        const crawler = new (class MockedRobotsTxtCrawler extends BasicCrawler {
+            override async getRobotsTxtFileForUrl(_: string) {
+                return { isAllowed: isAllowedSpy } as unknown as RobotsTxtFile;
+            }
+        })({
+            requestQueue,
+            maxConcurrency: 1,
+            respectRobotsTxtFile: { userAgent: 'MyCrawler' },
+            requestHandler: async (context) => {
+                if (context.request.label) return;
+                await context.enqueueLinks({ urls: ['http://example.com/child'], label: 'child' });
+            },
+        });
+
+        await crawler.run(['http://example.com/start']);
+
+        expect(isAllowedSpy).toHaveBeenCalledWith('http://example.com/child', 'MyCrawler');
+    });
+
     test('enqueueLinks should respect maxRequestsPerCrawl', async () => {
         const requestQueue = await RequestQueue.open();
         const addRequestsBatchedSpy = vitest.spyOn(requestQueue, 'addRequestsBatched');
61 changes: 60 additions & 1 deletion test/core/enqueue_links/enqueue_links.test.ts
@@ -1,4 +1,5 @@
 import { type AddRequestsBatchedOptions, cheerioCrawlerEnqueueLinks } from '@crawlee/cheerio';
+import { enqueueLinks } from '@crawlee/core';
 import { launchPlaywright } from '@crawlee/playwright';
 import type { RequestQueueOperationOptions, Source } from '@crawlee/puppeteer';
 import {
@@ -8,7 +9,7 @@ import {
     launchPuppeteer,
     RequestQueue,
 } from '@crawlee/puppeteer';
-import { type CheerioRoot } from '@crawlee/utils';
+import { type CheerioRoot, RobotsTxtFile } from '@crawlee/utils';
 import { load } from 'cheerio';
 import type { Browser as PlaywrightBrowser, Page as PlaywrightPage } from 'playwright';
 import type { Browser as PuppeteerBrowser, Page as PuppeteerPage } from 'puppeteer';
@@ -1027,4 +1028,62 @@ describe('enqueueLinks()', () => {
             }
         });
     });
+
+    describe('respectRobotsTxtFile option', () => {
+        const robotsTxtFile = RobotsTxtFile.from(
+            'http://example.com/robots.txt',
+            `User-agent: *
+Disallow: /
+Allow: /yes
+
+User-agent: MyCrawler
+Disallow: /no
+Allow: /my-crawler
+`,
+        );
+
+        const urls = [
+            'http://example.com/yes',
+            'http://example.com/no',
+            'http://example.com/no-globally',
+            'http://example.com/my-crawler/anything',
+        ];
+
+        test('defaults to the catch-all user-agent when not provided', async () => {
+            const { enqueued, requestQueue } = createRequestQueueMock();
+
+            await enqueueLinks({ urls, requestQueue, robotsTxtFile });
+
+            expect(enqueued.map((r) => r.url)).toEqual(['http://example.com/yes']);
+        });
+
+        test('applies rules for the configured user-agent', async () => {
+            const { enqueued, requestQueue } = createRequestQueueMock();
+
+            await enqueueLinks({
+                urls,
+                requestQueue,
+                robotsTxtFile,
+                respectRobotsTxtFile: { userAgent: 'MyCrawler' },
+            });
+
+            expect(enqueued.map((r) => r.url)).toEqual([
+                'http://example.com/yes',
+                'http://example.com/my-crawler/anything',
+            ]);
+        });
+
+        test('skips filtering when set to false even if robotsTxtFile is provided', async () => {
+            const { enqueued, requestQueue } = createRequestQueueMock();
+
+            await enqueueLinks({
+                urls,
+                requestQueue,
+                robotsTxtFile,
+                respectRobotsTxtFile: false,
+            });
+
+            expect(enqueued.map((r) => r.url)).toEqual(urls);
+        });
+    });
 });
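The `createRequestQueueMock()` helper used in these tests is not part of the diff. A minimal hypothetical sketch of what it could look like, given that `enqueueLinks()` only validates that the queue exposes `addRequestsBatched` (see the `ow` shape above):

```ts
import type { RequestOptions } from '@crawlee/core';

// Hypothetical test helper: records every request that enqueueLinks()
// attempts to enqueue via addRequestsBatched().
function createRequestQueueMock() {
    const enqueued: RequestOptions[] = [];

    const requestQueue = {
        async addRequestsBatched(requests: RequestOptions[]) {
            enqueued.push(...requests);
        },
    } as any;

    return { enqueued, requestQueue };
}
```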