Commit 1d2528b

fix: honor custom respectRobotsTxtFile userAgent in enqueueLinks (apify#3578)
1 parent c0b9b50 commit 1d2528b

4 files changed

Lines changed: 146 additions & 3 deletions

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 1 addition & 0 deletions
@@ -1734,6 +1734,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
         return enqueueLinks({
             requestQueue,
             robotsTxtFile: await this.getRobotsTxtFileForUrl(request!.url),
+            respectRobotsTxtFile: this.respectRobotsTxtFile,
             onSkippedRequest,
             limit: this.calculateEnqueuedRequestLimit(options.limit),

packages/core/src/enqueue_links/enqueue_links.ts

Lines changed: 12 additions & 2 deletions
@@ -182,6 +182,13 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
      */
     robotsTxtFile?: Pick<RobotsTxtFile, 'isAllowed'>;
 
+    /**
+     * Mirrors {@apilink BasicCrawlerOptions.respectRobotsTxtFile}: pass `false` to disable filtering or
+     * `{ userAgent }` to evaluate rules for a specific user-agent. Defaults to `*` when
+     * {@apilink EnqueueLinksOptions.robotsTxtFile|`robotsTxtFile`} is provided.
+     */
+    respectRobotsTxtFile?: boolean | { userAgent?: string };
+
     /**
      * When a request is skipped for some reason, you can use this callback to act on it.
      * This is currently fired for requests skipped
@@ -296,6 +303,7 @@ export async function enqueueLinks(
             urls: ow.array.ofType(ow.string),
             requestQueue: ow.object.hasKeys('addRequestsBatched'),
             robotsTxtFile: ow.optional.object.hasKeys('isAllowed'),
+            respectRobotsTxtFile: ow.optional.any(ow.boolean, ow.object.exactShape({ userAgent: ow.optional.string })),
             onSkippedRequest: ow.optional.function,
             forefront: ow.optional.boolean,
             skipNavigation: ow.optional.boolean,
@@ -422,11 +430,13 @@ export async function enqueueLinks(
 
     let requestOptions = createRequestOptions(urls, options);
 
-    if (robotsTxtFile) {
+    if (robotsTxtFile && options.respectRobotsTxtFile !== false) {
+        const robotsUserAgent =
+            typeof options.respectRobotsTxtFile === 'object' ? (options.respectRobotsTxtFile.userAgent ?? '*') : '*';
         const skippedRequests: RequestOptions[] = [];
 
         requestOptions = requestOptions.filter((request) => {
-            if (robotsTxtFile.isAllowed(request.url)) {
+            if (robotsTxtFile.isAllowed(request.url, robotsUserAgent)) {
                 return true;
             }
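
The same option is accepted by the standalone enqueueLinks() exported from @crawlee/core, alongside robotsTxtFile. A short sketch, assuming a request queue is already open; the robots.txt rules and URLs are illustrative and mirror the tests below:

import { enqueueLinks, RequestQueue } from '@crawlee/core';
import { RobotsTxtFile } from '@crawlee/utils';

const requestQueue = await RequestQueue.open();

// Everything is disallowed for '*' except /yes; 'MyCrawler' gets its own group.
const robotsTxtFile = RobotsTxtFile.from(
    'https://example.com/robots.txt',
    `User-agent: *
Disallow: /
Allow: /yes

User-agent: MyCrawler
Disallow: /no
Allow: /my-crawler
`,
);

// Rules are evaluated for 'MyCrawler', so only the first URL is enqueued here.
await enqueueLinks({
    urls: ['https://example.com/my-crawler/page', 'https://example.com/no'],
    requestQueue,
    robotsTxtFile,
    respectRobotsTxtFile: { userAgent: 'MyCrawler' },
});

// Passing `false` disables the filtering entirely, even though robotsTxtFile is provided.
await enqueueLinks({
    urls: ['https://example.com/no'],
    requestQueue,
    robotsTxtFile,
    respectRobotsTxtFile: false,
});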

test/core/crawlers/basic_crawler.test.ts

Lines changed: 73 additions & 0 deletions
@@ -1722,6 +1722,79 @@ describe('BasicCrawler', () => {
         expect(addRequestsBatchedSpy).toHaveBeenCalledOnce();
     });
 
+    test('enqueueLinks should respect custom user-agent robots.txt rules', async () => {
+        const requestQueue = await RequestQueue.open();
+        const visitedUrls: string[] = [];
+
+        const crawler = new (class MockedRobotsTxtCrawler extends BasicCrawler {
+            override async getRobotsTxtFileForUrl(_: string) {
+                return RobotsTxtFile.from(
+                    'http://example.com/robots.txt',
+                    `User-agent: *
+Disallow: /
+Allow: /yes
+
+User-agent: MyCrawler
+Disallow: /no
+Allow: /my-crawler
+`,
+                );
+            }
+        })({
+            requestQueue,
+            maxConcurrency: 1,
+            respectRobotsTxtFile: { userAgent: 'MyCrawler' },
+            requestHandler: async (context) => {
+                visitedUrls.push(context.request.url);
+
+                if (context.request.label) {
+                    return;
+                }
+
+                await context.enqueueLinks({
+                    urls: [
+                        'http://example.com/yes',
+                        'http://example.com/no',
+                        'http://example.com/no-globally',
+                        'http://example.com/my-crawler/anything',
+                    ],
+                    label: 'child',
+                });
+            },
+        });
+
+        await crawler.run(['http://example.com/start']);
+
+        expect(visitedUrls).toEqual([
+            'http://example.com/start',
+            'http://example.com/yes',
+            'http://example.com/my-crawler/anything',
+        ]);
+    });
+
+    test('enqueueLinks forwards respectRobotsTxtFile.userAgent to the robots.txt check', async () => {
+        const requestQueue = await RequestQueue.open();
+        const isAllowedSpy = vitest.fn(() => true);
+
+        const crawler = new (class MockedRobotsTxtCrawler extends BasicCrawler {
+            override async getRobotsTxtFileForUrl(_: string) {
+                return { isAllowed: isAllowedSpy } as unknown as RobotsTxtFile;
+            }
+        })({
+            requestQueue,
+            maxConcurrency: 1,
+            respectRobotsTxtFile: { userAgent: 'MyCrawler' },
+            requestHandler: async (context) => {
+                if (context.request.label) return;
+                await context.enqueueLinks({ urls: ['http://example.com/child'], label: 'child' });
+            },
+        });
+
+        await crawler.run(['http://example.com/start']);
+
+        expect(isAllowedSpy).toHaveBeenCalledWith('http://example.com/child', 'MyCrawler');
+    });
+
     test('enqueueLinks should respect maxRequestsPerCrawl', async () => {
         const requestQueue = await RequestQueue.open();
         const addRequestsBatchedSpy = vitest.spyOn(requestQueue, 'addRequestsBatched');

test/core/enqueue_links/enqueue_links.test.ts

Lines changed: 60 additions & 1 deletion
@@ -1,4 +1,5 @@
 import { type AddRequestsBatchedOptions, cheerioCrawlerEnqueueLinks } from '@crawlee/cheerio';
+import { enqueueLinks } from '@crawlee/core';
 import { launchPlaywright } from '@crawlee/playwright';
 import type { RequestQueueOperationOptions, Source } from '@crawlee/puppeteer';
 import {
@@ -8,7 +9,7 @@ import {
     launchPuppeteer,
     RequestQueue,
 } from '@crawlee/puppeteer';
-import { type CheerioRoot } from '@crawlee/utils';
+import { type CheerioRoot, RobotsTxtFile } from '@crawlee/utils';
 import { load } from 'cheerio';
 import type { Browser as PlaywrightBrowser, Page as PlaywrightPage } from 'playwright';
 import type { Browser as PuppeteerBrowser, Page as PuppeteerPage } from 'puppeteer';
@@ -1027,4 +1028,62 @@ describe('enqueueLinks()', () => {
             }
         });
     });
+
+    describe('respectRobotsTxtFile option', () => {
+        const robotsTxtFile = RobotsTxtFile.from(
+            'http://example.com/robots.txt',
+            `User-agent: *
+Disallow: /
+Allow: /yes
+
+User-agent: MyCrawler
+Disallow: /no
+Allow: /my-crawler
+`,
+        );
+
+        const urls = [
+            'http://example.com/yes',
+            'http://example.com/no',
+            'http://example.com/no-globally',
+            'http://example.com/my-crawler/anything',
+        ];
+
+        test('defaults to the catch-all user-agent when not provided', async () => {
+            const { enqueued, requestQueue } = createRequestQueueMock();
+
+            await enqueueLinks({ urls, requestQueue, robotsTxtFile });
+
+            expect(enqueued.map((r) => r.url)).toEqual(['http://example.com/yes']);
+        });
+
+        test('applies rules for the configured user-agent', async () => {
+            const { enqueued, requestQueue } = createRequestQueueMock();
+
+            await enqueueLinks({
+                urls,
+                requestQueue,
+                robotsTxtFile,
+                respectRobotsTxtFile: { userAgent: 'MyCrawler' },
+            });
+
+            expect(enqueued.map((r) => r.url)).toEqual([
+                'http://example.com/yes',
+                'http://example.com/my-crawler/anything',
+            ]);
+        });
+
+        test('skips filtering when set to false even if robotsTxtFile is provided', async () => {
+            const { enqueued, requestQueue } = createRequestQueueMock();
+
+            await enqueueLinks({
+                urls,
+                requestQueue,
+                robotsTxtFile,
+                respectRobotsTxtFile: false,
+            });
+
+            expect(enqueued.map((r) => r.url)).toEqual(urls);
+        });
+    });
 });
