Skip to content

Commit 65e4ae6

Browse files
authored
fix: rectify transformRequestFunction precedence rules (#3372)
Fixes `transformRequestFunction` precedence in `enqueueLinks` — it now runs **after** pattern filtering instead of before, giving it the highest priority so its modifications can't be silently overwritten by `globs`/`regexps`/`pseudoUrls` options. Previously, pattern-specific options (like `method` or `userData` on a glob) would override changes made by `transformRequestFunction`, which was unintuitive since the transform is the user's explicit escape hatch for customizing requests. **Priority order (lowest to highest):** 1. Global `label`/`userData` 2. Pattern-specific options (`globs`, `regexps`, `pseudoUrls`) 3. `transformRequestFunction` **What changed:** - Reordered the processing pipeline: patterns filter and enrich first, then `transformRequestFunction` runs last - Requests rejected by the transform (returning `null`/`false`) now report a `'transform'` skip reason via `onSkippedRequest` - Updated Playwright and Puppeteer click-element utilities for consistent behavior - Added upgrade guide documentation and comprehensive tests
1 parent a18a5aa commit 65e4ae6

10 files changed

Lines changed: 557 additions & 156 deletions

File tree

docs/upgrading/upgrading_v4.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,3 +114,18 @@ The `KeyValueStore.getPublicUrl` method is now asynchronous and reads the public
114114
## `preNavigationHooks` in `HttpCrawler` no longer accepts `gotOptions` object
115115

116116
The `preNavigationHooks` option in `HttpCrawler` subclasses no longer accepts the `gotOptions` object as a second parameter. Modify the `crawlingContext` fields (e.g. `.request`) directly instead.
117+
118+
## `transformRequestFunction` precedence in `enqueueLinks`
119+
120+
The `transformRequestFunction` callback in `enqueueLinks` now runs **after** URL pattern filtering (`globs`, `regexps`, `pseudoUrls`) instead of before. This means it has the highest priority and can overwrite any request options set by patterns or the global `label` option.
121+
122+
The priority order is now (lowest to highest):
123+
1. Global `label` / `userData` options
124+
2. Pattern-specific options from `globs`, `regexps`, or `pseudoUrls` objects
125+
3. `transformRequestFunction`
126+
127+
The `transformRequestFunction` callback receives a `RequestOptions` object and can return either:
128+
- The modified `RequestOptions` object
129+
- A new `RequestOptions` plain object
130+
- `'unchanged'` to keep the original options as-is
131+
- A falsy value or `'skip'` to exclude the request from the queue

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1723,16 +1723,19 @@ export class BasicCrawler<
17231723
request: Request<Dictionary>,
17241724
requestQueue: RequestProvider,
17251725
): Promise<BatchAddRequestsResult> {
1726-
const transformRequestFunctionWrapper: RequestTransform = (newRequest) => {
1727-
newRequest.crawlDepth = request.crawlDepth + 1;
1728-
1729-
if (this.maxCrawlDepth !== undefined && newRequest.crawlDepth > this.maxCrawlDepth) {
1730-
newRequest.skippedReason = 'depth';
1726+
const transformRequestFunctionWrapper: RequestTransform = (requestOptions) => {
1727+
requestOptions.crawlDepth = request.crawlDepth + 1;
1728+
1729+
if (this.maxCrawlDepth !== undefined && requestOptions.crawlDepth! > this.maxCrawlDepth) {
1730+
// Setting `skippedReason` before returning `false` ensures that `reportSkippedRequests`
1731+
// reports `'depth'` as the reason (via `request.skippedReason ?? reason` fallback),
1732+
// rather than the generic `'transform'` reason.
1733+
requestOptions.skippedReason = 'depth';
17311734
return false;
17321735
}
17331736

17341737
// After injecting the crawlDepth, we call the user-provided transform function, if there is one.
1735-
return options.transformRequestFunction?.(newRequest) ?? newRequest;
1738+
return options.transformRequestFunction?.(requestOptions) ?? requestOptions;
17361739
};
17371740

17381741
return await enqueueLinks({

packages/core/src/enqueue_links/enqueue_links.ts

Lines changed: 50 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ import type { SetRequired } from 'type-fest';
66

77
import log from '@apify/log';
88

9-
import type { Request, RequestOptions } from '../request.js';
9+
import type { RequestOptions } from '../request.js';
10+
import { Request } from '../request.js';
1011
import type {
1112
AddRequestsBatchedOptions,
1213
AddRequestsBatchedResult,
@@ -23,12 +24,12 @@ import type {
2324
UrlPatternObject,
2425
} from './shared.js';
2526
import {
27+
applyRequestTransform,
2628
constructGlobObjectsFromGlobs,
2729
constructRegExpObjectsFromPseudoUrls,
2830
constructRegExpObjectsFromRegExps,
2931
createRequestOptions,
30-
createRequests,
31-
filterRequestsByPatterns,
32+
filterRequestOptionsByPatterns,
3233
} from './shared.js';
3334

3435
export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
@@ -50,8 +51,8 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
5051
/**
5152
* Sets {@apilink Request.label} for newly enqueued requests.
5253
*
53-
* Note that the request options specified in `globs`, `regexps`, or `pseudoUrls` objects
54-
* have priority over this option.
54+
* This option has the lowest priority and can be overwritten by request options
55+
* specified in `globs`, `regexps`, or `pseudoUrls` objects, as well as by `transformRequestFunction`.
5556
*/
5657
label?: string;
5758

@@ -126,12 +127,12 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
126127
pseudoUrls?: readonly PseudoUrlInput[];
127128

128129
/**
129-
* Just before a new {@apilink Request} is constructed and enqueued to the {@apilink RequestQueue}, this function can be used
130-
* to remove it or modify its contents such as `userData`, `payload` or, most importantly `uniqueKey`. This is useful
130+
* After request options are filtered by patterns, this function can be used
131+
* to remove them or modify their contents such as `userData`, `payload` or, most importantly `uniqueKey`. This is useful
131132
* when you need to enqueue multiple `Requests` to the queue that share the same URL, but differ in methods or payloads,
132133
* or to dynamically update or create `userData`.
133134
*
134-
* For example: by adding `keepUrlFragment: true` to the `request` object, URL fragments will not be removed
135+
* For example: by adding `keepUrlFragment: true` to the request options, URL fragments will not be removed
135136
* when `uniqueKey` is computed.
136137
*
137138
* **Example:**
@@ -145,8 +146,13 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
145146
* }
146147
* ```
147148
*
148-
* Note that the request options specified in `globs`, `regexps`, or `pseudoUrls` objects
149-
* have priority over this function. Some request options returned by `transformRequestFunction` may be overwritten by pattern-based options from `globs`, `regexps`, or `pseudoUrls`.
149+
* Note that `transformRequestFunction` has the highest priority and can overwrite request options
150+
* specified in `globs`, `regexps`, or `pseudoUrls` objects, as well as the global `label` option.
151+
*
152+
* The function receives a {@apilink RequestOptions} object and can return either:
153+
* - The modified {@apilink RequestOptions} object
154+
* - `'unchanged'` to keep the original options as-is
155+
* - A falsy value or `'skip'` to exclude the request from the queue
150156
*/
151157
transformRequestFunction?: RequestTransform;
152158

@@ -437,58 +443,59 @@ export async function enqueueLinks(
437443
await reportSkippedRequests(skippedRequests, 'robotsTxt');
438444
}
439445

440-
if (transformRequestFunction) {
441-
const skippedRequests: RequestOptions[] = [];
442-
443-
requestOptions = requestOptions
444-
.map((request) => {
445-
const transformedRequest = transformRequestFunction(request);
446-
if (!transformedRequest) {
447-
skippedRequests.push(request);
448-
}
449-
return transformedRequest;
450-
})
451-
.filter((r) => Boolean(r)) as RequestOptions[];
452-
453-
await reportSkippedRequests(skippedRequests, 'filters');
454-
}
455-
456446
async function createFilteredRequests() {
457447
const skippedRequests: string[] = [];
458448

459-
// No user provided patterns means we can skip an extra filtering step
449+
// Step 1: Filter request options by exclude patterns, user patterns (globs/regexps), and strategy patterns.
450+
// Pattern-level options (label, userData, method, etc.) are merged during this step.
451+
let filteredOptions: RequestOptions[];
460452
if (urlPatternObjects.length === 0) {
461-
return createRequests(
453+
filteredOptions = filterRequestOptionsByPatterns(
454+
requestOptions,
455+
enqueueStrategyPatterns.length > 0 ? enqueueStrategyPatterns : undefined,
456+
urlExcludePatternObjects,
457+
options.strategy,
458+
(url) => skippedRequests.push(url),
459+
);
460+
} else {
461+
// Filter by user patterns first (with exclude)
462+
const afterUserPatterns = filterRequestOptionsByPatterns(
462463
requestOptions,
463-
enqueueStrategyPatterns,
464+
urlPatternObjects,
464465
urlExcludePatternObjects,
465466
options.strategy,
466467
(url) => skippedRequests.push(url),
467468
);
469+
// ...then filter by the enqueue links strategy (making this an AND check)
470+
filteredOptions = filterRequestOptionsByPatterns(
471+
afterUserPatterns,
472+
enqueueStrategyPatterns.length > 0 ? enqueueStrategyPatterns : undefined,
473+
[],
474+
options.strategy,
475+
(url) => skippedRequests.push(url),
476+
);
468477
}
469478

470-
// Generate requests based on the user patterns first
471-
const generatedRequestsFromUserFilters = createRequests(
472-
requestOptions,
473-
urlPatternObjects,
474-
urlExcludePatternObjects,
475-
options.strategy,
476-
(url) => skippedRequests.push(url),
477-
);
478-
// ...then filter them by the enqueue links strategy (making this an AND check)
479-
const filtered = filterRequestsByPatterns(generatedRequestsFromUserFilters, enqueueStrategyPatterns, (url) =>
480-
skippedRequests.push(url),
481-
);
482-
483479
await reportSkippedRequests(
484480
skippedRequests.map((url) => ({ url })),
485481
'filters',
486482
);
487483

488-
return filtered;
484+
// Step 2: Apply transformRequestFunction on request options - it has the highest priority
485+
if (transformRequestFunction) {
486+
const skippedByTransform: RequestOptions[] = [];
487+
filteredOptions = applyRequestTransform(filteredOptions, transformRequestFunction, (r) =>
488+
skippedByTransform.push(r),
489+
);
490+
await reportSkippedRequests(skippedByTransform, 'transform');
491+
}
492+
493+
// Step 3: Create Request instances from the final request options
494+
return filteredOptions.map((opts) => new Request(opts));
489495
}
490496

491497
let requests = await createFilteredRequests();
498+
492499
if (typeof limit === 'number' && limit < requests.length) {
493500
await reportSkippedRequests(requests.slice(limit), 'enqueueLimit');
494501
requests = requests.slice(0, limit);

packages/core/src/enqueue_links/shared.ts

Lines changed: 66 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import { Minimatch } from 'minimatch';
66
import { purlToRegExp } from '@apify/pseudo_url';
77

88
import type { RequestOptions } from '../request.js';
9-
import { Request } from '../request.js';
109
import type { EnqueueLinksOptions } from './enqueue_links.js';
1110

1211
export { tryAbsoluteURL } from '@crawlee/utils';
@@ -47,7 +46,14 @@ export type RegExpObject = { regexp: RegExp } & Pick<
4746

4847
export type RegExpInput = RegExp | RegExpObject;
4948

50-
export type SkippedRequestReason = 'robotsTxt' | 'limit' | 'enqueueLimit' | 'filters' | 'redirect' | 'depth';
49+
export type SkippedRequestReason =
50+
| 'robotsTxt'
51+
| 'limit'
52+
| 'enqueueLimit'
53+
| 'filters'
54+
| 'transform'
55+
| 'redirect'
56+
| 'depth';
5157

5258
export type SkippedRequestCallback = (args: { url: string; reason: SkippedRequestReason }) => Awaitable<void>;
5359

@@ -164,76 +170,46 @@ export function constructRegExpObjectsFromRegExps(regexps: readonly RegExpInput[
164170
}
165171

166172
/**
173+
* Filters request options by URL patterns and merges pattern-level options (label, userData, method, payload, headers)
174+
* from the first matching pattern into each RequestOptions entry.
175+
*
176+
* When `includePatterns` is empty/undefined, all options pass through (only exclude filtering applies).
167177
* @ignore
168178
*/
169-
export function createRequests(
170-
requestOptions: (string | RequestOptions)[],
171-
urlPatternObjects?: UrlPatternObject[],
172-
excludePatternObjects: UrlPatternObject[] = [],
179+
export function filterRequestOptionsByPatterns(
180+
requestOptions: RequestOptions[],
181+
includePatterns: UrlPatternObject[] | undefined,
182+
excludePatterns: UrlPatternObject[] = [],
173183
strategy?: EnqueueLinksOptions['strategy'],
174184
onSkippedUrl?: (url: string) => void,
175-
): Request[] {
176-
const excludePatternObjectMatchers = excludePatternObjects.map(createPatternObjectMatcher);
177-
const urlPatternObjectMatchers = urlPatternObjects?.map(createPatternObjectMatcher);
185+
): RequestOptions[] {
186+
const excludeMatchers = excludePatterns.map(createPatternObjectMatcher);
187+
const includeMatchers = includePatterns?.length ? includePatterns.map(createPatternObjectMatcher) : undefined;
178188

179189
return requestOptions
180-
.map((opts) => ({ url: typeof opts === 'string' ? opts : opts.url, opts }))
181190
.filter(({ url }) => {
182-
const matchesExcludePatterns = excludePatternObjectMatchers.some(({ match }) => match(url));
183-
184-
if (matchesExcludePatterns) {
191+
const matchesExclude = excludeMatchers.some(({ match }) => match(url));
192+
if (matchesExclude) {
185193
onSkippedUrl?.(url);
186194
}
187-
188-
return !matchesExcludePatterns;
195+
return !matchesExclude;
189196
})
190-
.map(({ url, opts }) => {
191-
if (!urlPatternObjectMatchers || !urlPatternObjectMatchers.length) {
192-
return new Request(typeof opts === 'string' ? { url: opts, enqueueStrategy: strategy } : { ...opts });
197+
.map((opts) => {
198+
if (!includeMatchers) {
199+
return { ...opts, enqueueStrategy: strategy };
193200
}
194201

195-
for (const urlPatternObject of urlPatternObjectMatchers) {
196-
const { match, glob, regexp, ...requestRegExpOptions } = urlPatternObject;
197-
if (match(url)) {
198-
const request =
199-
typeof opts === 'string'
200-
? { url: opts, ...requestRegExpOptions, enqueueStrategy: strategy }
201-
: { ...opts, ...requestRegExpOptions, enqueueStrategy: strategy };
202-
203-
return new Request(request);
202+
for (const { match, glob, regexp, ...patternOptions } of includeMatchers) {
203+
if (match(opts.url)) {
204+
return { ...opts, ...patternOptions, enqueueStrategy: strategy };
204205
}
205206
}
206207

207208
// didn't match any positive pattern
208-
onSkippedUrl?.(url);
209+
onSkippedUrl?.(opts.url);
209210
return null;
210211
})
211-
.filter((request) => request) as Request[];
212-
}
213-
214-
export function filterRequestsByPatterns(
215-
requests: Request[],
216-
patterns?: UrlPatternObject[],
217-
onSkippedUrl?: (url: string) => void,
218-
): Request[] {
219-
if (!patterns?.length) {
220-
return requests;
221-
}
222-
223-
const filtered: Request[] = [];
224-
const patternMatchers = patterns?.map(createPatternObjectMatcher);
225-
226-
for (const request of requests) {
227-
const matchingPattern = patternMatchers.find(({ match }) => match(request.url));
228-
229-
if (matchingPattern !== undefined) {
230-
filtered.push(request);
231-
} else {
232-
onSkippedUrl?.(request.url);
233-
}
234-
}
235-
236-
return filtered;
212+
.filter((opts) => opts !== null);
237213
}
238214

239215
/**
@@ -293,13 +269,45 @@ function createPatternObjectMatcher(urlPatternObject: UrlPatternObject) {
293269
}
294270

295271
/**
296-
* Takes an Apify {@apilink RequestOptions} object and changes its attributes in a desired way. This user-function is used
297-
* {@apilink enqueueLinks} to modify requests before enqueuing them.
272+
* Takes a {@apilink RequestOptions} object and changes its attributes in a desired way. This user-function is used
273+
* by {@apilink enqueueLinks} to modify request options before they are converted to {@apilink Request} instances.
298274
*/
299275
export interface RequestTransform {
300276
/**
301277
* @param original Request options to be modified.
302-
* @returns The modified request options to enqueue.
278+
* @returns The modified request options to enqueue, `'unchanged'` to keep the original options as-is,
279+
* or a falsy value / `'skip'` to exclude the request from the queue.
303280
*/
304-
(original: RequestOptions): RequestOptions | false | undefined | null;
281+
(original: RequestOptions): RequestOptions | false | undefined | null | 'skip' | 'unchanged';
282+
}
283+
284+
/**
285+
* Applies a {@apilink RequestTransform} function to a list of request options.
286+
* Options for which the transform returns a falsy value are removed from the list.
287+
* @param onSkipped Called with the original request options when the transform returns a falsy value (i.e. the request is skipped).
288+
* @ignore
289+
* @internal
290+
*/
291+
export function applyRequestTransform(
292+
requestOptions: RequestOptions[],
293+
transformFn: RequestTransform,
294+
onSkipped?: (requestOptions: RequestOptions) => void,
295+
): RequestOptions[] {
296+
return requestOptions
297+
.map((opts) => {
298+
const transformed = transformFn(opts);
299+
if (transformed === 'skip') {
300+
onSkipped?.(opts);
301+
return null;
302+
}
303+
if (transformed === 'unchanged') {
304+
return opts;
305+
}
306+
if (!transformed) {
307+
onSkipped?.(opts);
308+
return null;
309+
}
310+
return transformed;
311+
})
312+
.filter((r): r is RequestOptions => r !== null);
305313
}

0 commit comments

Comments
 (0)