Skip to content

Commit 339dcde

Browse files
committed
feat!: replace globs/regexps/pseudoUrls with unified include/exclude API
Align `EnqueueLinksOptions` with crawlee-python (#3409):

- Replace `globs`, `regexps`, `pseudoUrls` options with `include`/`exclude` accepting `UrlPatternInput[]`
- Strip request options (label, method, payload, userData, headers) from pattern objects — patterns are pure URL matchers
- `transformRequestFunction` is now the only way to customize per-request options, and it runs after all filtering
- Add `'skip'` and `'unchanged'` return values to `RequestTransform` (aligned with Python's `RequestTransformAction`)
- Apply same changes to `enqueueLinksByClickingElements` (Playwright + Puppeteer) and `SitemapRequestList`
- Remove `@apify/pseudo_url` dependency and `PseudoUrl` re-export
- Update all templates from `globs` to `include`

BREAKING CHANGE: `globs`, `regexps`, and `pseudoUrls` options removed. Use `include`/`exclude` instead.
1 parent f6dac27 commit 339dcde

22 files changed

Lines changed: 256 additions & 771 deletions

File tree

packages/core/package.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@
5050
"@apify/consts": "^2.41.0",
5151
"@apify/datastructures": "^2.0.3",
5252
"@apify/log": "^2.5.18",
53-
"@apify/pseudo_url": "^2.0.59",
5453
"@apify/timeout": "^0.3.2",
5554
"@apify/utilities": "^2.15.5",
5655
"@crawlee/memory-storage": "4.0.0",

packages/core/src/crawlers/crawler_commons.ts

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,7 @@ export interface RestrictedCrawlingContext<UserData extends Dictionary = Diction
5454
* This function automatically finds and enqueues links from the current page, adding them to the {@apilink RequestQueue}
5555
* currently used by the crawler.
5656
*
57-
* Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions
58-
* and override settings of the enqueued {@apilink Request} objects.
57+
* Optionally, the function allows you to filter the target links' URLs using an array of glob or regexp patterns.
5958
*
6059
* Check out the [Crawl a website with relative links](https://crawlee.dev/js/docs/examples/crawl-relative-links) example
6160
* for more details regarding its usage.
@@ -65,7 +64,7 @@ export interface RestrictedCrawlingContext<UserData extends Dictionary = Diction
6564
* ```ts
6665
* async requestHandler({ enqueueLinks }) {
6766
* await enqueueLinks({
68-
* globs: [
67+
* include: [
6968
* 'https://www.example.com/handbags/*',
7069
* ],
7170
* });
@@ -112,8 +111,7 @@ export interface CrawlingContext<UserData extends Dictionary = Dictionary> exten
112111
* This function automatically finds and enqueues links from the current page, adding them to the {@apilink RequestQueue}
113112
* currently used by the crawler.
114113
*
115-
* Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions
116-
* and override settings of the enqueued {@apilink Request} objects.
114+
* Optionally, the function allows you to filter the target links' URLs using an array of glob or regexp patterns.
117115
*
118116
* Check out the [Crawl a website with relative links](https://crawlee.dev/js/docs/examples/crawl-relative-links) example
119117
* for more details regarding its usage.
@@ -123,7 +121,7 @@ export interface CrawlingContext<UserData extends Dictionary = Dictionary> exten
123121
* ```ts
124122
* async requestHandler({ enqueueLinks }) {
125123
* await enqueueLinks({
126-
* globs: [
124+
* include: [
127125
* 'https://www.example.com/handbags/*',
128126
* ],
129127
* });

packages/core/src/enqueue_links/enqueue_links.ts

Lines changed: 27 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -6,27 +6,22 @@ import type { SetRequired } from 'type-fest';
66

77
import type { RequestOptions } from '../request.js';
88
import { Request } from '../request.js';
9-
import { serviceLocator } from '../service_locator.js';
109
import type {
1110
AddRequestsBatchedOptions,
1211
AddRequestsBatchedResult,
1312
RequestProvider,
1413
RequestQueueOperationOptions,
1514
} from '../storages/request_provider.js';
1615
import type {
17-
GlobInput,
18-
PseudoUrlInput,
19-
RegExpInput,
2016
RequestTransform,
2117
SkippedRequestCallback,
2218
SkippedRequestReason,
19+
UrlPatternInput,
2320
UrlPatternObject,
2421
} from './shared.js';
2522
import {
2623
applyRequestTransform,
27-
constructGlobObjectsFromGlobs,
28-
constructRegExpObjectsFromPseudoUrls,
29-
constructRegExpObjectsFromRegExps,
24+
constructUrlPatternObjects,
3025
createRequestOptions,
3126
filterRequestOptionsByPatterns,
3227
} from './shared.js';
@@ -50,8 +45,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
5045
/**
5146
* Sets {@apilink Request.label} for newly enqueued requests.
5247
*
53-
* This option has the lowest priority and can be overwritten by request options
54-
* specified in `globs`, `regexps`, or `pseudoUrls` objects, as well as by `transformRequestFunction`.
48+
* Can be overwritten by `transformRequestFunction`.
5549
*/
5650
label?: string;
5751

@@ -68,65 +62,30 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
6862
baseUrl?: string;
6963

7064
/**
71-
* An array of glob pattern strings or plain objects
72-
* containing glob pattern strings matching the URLs to be enqueued.
65+
* An array of URL patterns that URLs must match to be enqueued.
7366
*
74-
* The plain objects must include at least the `glob` property, which holds the glob pattern string.
75-
* All remaining keys will be used as request options for the corresponding enqueued {@apilink Request} objects.
76-
*
77-
* The matching is always case-insensitive.
78-
* If you need case-sensitive matching, use `regexps` property directly.
79-
*
80-
* If `globs` is an empty array or `undefined`, and `regexps` are also not defined, then the function
81-
* enqueues the links with the same subdomain.
82-
*/
83-
globs?: readonly GlobInput[];
84-
85-
/**
86-
* An array of glob pattern strings, regexp patterns or plain objects
87-
* containing patterns matching URLs that will **never** be enqueued.
88-
*
89-
* The plain objects must include either the `glob` property or the `regexp` property.
67+
* Accepts glob pattern strings, `{ glob: string }` objects, `RegExp` instances, or `{ regexp: RegExp }` objects.
9068
*
9169
* Glob matching is always case-insensitive.
92-
* If you need case-sensitive matching, provide a regexp.
93-
*/
94-
exclude?: readonly (GlobInput | RegExpInput)[];
95-
96-
/**
97-
* An array of regular expressions or plain objects
98-
* containing regular expressions matching the URLs to be enqueued.
99-
*
100-
* The plain objects must include at least the `regexp` property, which holds the regular expression.
101-
* All remaining keys will be used as request options for the corresponding enqueued {@apilink Request} objects.
70+
* If you need case-sensitive matching, use a `RegExp`.
10271
*
103-
* If `regexps` is an empty array or `undefined`, and `globs` are also not defined, then the function
72+
* If `include` is an empty array or `undefined`, then the function
10473
* enqueues only the links with the same hostname, unless a different `strategy` is set.
10574
*/
106-
regexps?: readonly RegExpInput[];
75+
include?: readonly UrlPatternInput[];
10776

10877
/**
109-
* *NOTE:* In future versions of SDK the options will be removed.
110-
* Please use `globs` or `regexps` instead.
78+
* An array of URL patterns. Matching URLs will **not** be enqueued.
11179
*
112-
* An array of {@apilink PseudoUrl} strings or plain objects
113-
* containing {@apilink PseudoUrl} strings matching the URLs to be enqueued.
80+
* Accepts glob pattern strings, `{ glob: string }` objects, `RegExp` instances, or `{ regexp: RegExp }` objects.
11481
*
115-
* The plain objects must include at least the `purl` property, which holds the pseudo-URL string.
116-
* All remaining keys will be used as request options for the corresponding enqueued {@apilink Request} objects.
117-
*
118-
* With a pseudo-URL string, the matching is always case-insensitive.
119-
* If you need case-sensitive matching, use `regexps` property directly.
120-
*
121-
* If `pseudoUrls` is an empty array or `undefined`, then the function
122-
* enqueues the links with the same subdomain.
123-
*
124-
* @deprecated prefer using `globs` or `regexps` instead
82+
* Glob matching is always case-insensitive.
83+
* If you need case-sensitive matching, use a `RegExp`.
12584
*/
126-
pseudoUrls?: readonly PseudoUrlInput[];
85+
exclude?: readonly UrlPatternInput[];
12786

12887
/**
129-
* After request options are filtered by patterns, this function can be used
88+
* After request options are filtered by `include`/`exclude` patterns, this function can be used
13089
* to remove them or modify their contents such as `userData`, `payload` or, most importantly, `uniqueKey`. This is useful
13190
* when you need to enqueue multiple `Requests` to the queue that share the same URL, but differ in methods or payloads,
13291
* or to dynamically update or create `userData`.
@@ -145,8 +104,8 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
145104
* }
146105
* ```
147106
*
148-
* Note that `transformRequestFunction` has the highest priority and can overwrite request options
149-
* specified in `globs`, `regexps`, or `pseudoUrls` objects, as well as the global `label` option.
107+
* Note that `transformRequestFunction` has the highest priority and can overwrite
108+
* the global `label` option.
150109
*
151110
* The function receives a {@apilink RequestOptions} object and can return either:
152111
* - The modified {@apilink RequestOptions} object
@@ -256,8 +215,7 @@ export enum EnqueueStrategy {
256215
* This function enqueues the urls provided to the {@apilink RequestQueue} provided. If you want to automatically find and enqueue links,
257216
* you should use the context-aware `enqueueLinks` function provided on the crawler contexts.
258217
*
259-
* Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions
260-
* and override settings of the enqueued {@apilink Request} objects.
218+
* Optionally, the function allows you to filter the target links' URLs using an array of glob or regexp patterns.
261219
*
262220
* **Example usage**
263221
*
@@ -266,7 +224,7 @@ export enum EnqueueStrategy {
266224
* urls: aListOfFoundUrls,
267225
* requestQueue,
268226
* selector: 'a.product-detail',
269-
* globs: [
227+
* include: [
270228
* 'https://www.example.com/handbags/*',
271229
* 'https://www.example.com/purses/*'
272230
* ],
@@ -295,6 +253,8 @@ export async function enqueueLinks(
295253
);
296254
}
297255

256+
const urlPatternValidator = ow.any(ow.string, ow.regExp, ow.object.hasKeys('glob'), ow.object.hasKeys('regexp'));
257+
298258
ow(
299259
options as any,
300260
ow.object.exactShape({
@@ -309,12 +269,8 @@ export async function enqueueLinks(
309269
baseUrl: ow.optional.string,
310270
userData: ow.optional.object,
311271
label: ow.optional.string,
312-
pseudoUrls: ow.optional.array.ofType(ow.any(ow.string, ow.object.hasKeys('purl'))),
313-
globs: ow.optional.array.ofType(ow.any(ow.string, ow.object.hasKeys('glob'))),
314-
exclude: ow.optional.array.ofType(
315-
ow.any(ow.string, ow.regExp, ow.object.hasKeys('glob'), ow.object.hasKeys('regexp')),
316-
),
317-
regexps: ow.optional.array.ofType(ow.any(ow.regExp, ow.object.hasKeys('regexp'))),
272+
include: ow.optional.array.ofType(urlPatternValidator),
273+
exclude: ow.optional.array.ofType(urlPatternValidator),
318274
transformRequestFunction: ow.optional.function,
319275
strategy: ow.optional.string.oneOf(Object.values(EnqueueStrategy)),
320276
waitForAllRequestsToBeAdded: ow.optional.boolean,
@@ -325,42 +281,17 @@ export async function enqueueLinks(
325281
requestQueue,
326282
limit,
327283
urls,
328-
pseudoUrls,
284+
include,
329285
exclude,
330-
globs,
331-
regexps,
332286
transformRequestFunction,
333287
forefront,
334288
waitForAllRequestsToBeAdded,
335289
robotsTxtFile,
336290
onSkippedRequest,
337291
} = options;
338292

339-
const urlExcludePatternObjects: UrlPatternObject[] = [];
340-
const urlPatternObjects: UrlPatternObject[] = [];
341-
342-
if (exclude?.length) {
343-
for (const excl of exclude) {
344-
if (typeof excl === 'string' || 'glob' in excl) {
345-
urlExcludePatternObjects.push(...constructGlobObjectsFromGlobs([excl]));
346-
} else if (excl instanceof RegExp || 'regexp' in excl) {
347-
urlExcludePatternObjects.push(...constructRegExpObjectsFromRegExps([excl]));
348-
}
349-
}
350-
}
351-
352-
if (pseudoUrls?.length) {
353-
serviceLocator.getLogger().deprecated('`pseudoUrls` option is deprecated, use `globs` or `regexps` instead');
354-
urlPatternObjects.push(...constructRegExpObjectsFromPseudoUrls(pseudoUrls));
355-
}
356-
357-
if (globs?.length) {
358-
urlPatternObjects.push(...constructGlobObjectsFromGlobs(globs));
359-
}
360-
361-
if (regexps?.length) {
362-
urlPatternObjects.push(...constructRegExpObjectsFromRegExps(regexps));
363-
}
293+
const urlExcludePatternObjects: UrlPatternObject[] = exclude?.length ? constructUrlPatternObjects(exclude) : [];
294+
const urlPatternObjects: UrlPatternObject[] = include?.length ? constructUrlPatternObjects(include) : [];
364295

365296
if (!urlPatternObjects.length) {
366297
options.strategy ??= EnqueueStrategy.SameHostname;
@@ -445,8 +376,7 @@ export async function enqueueLinks(
445376
async function createFilteredRequests() {
446377
const skippedRequests: string[] = [];
447378

448-
// Step 1: Filter request options by exclude patterns, user patterns (globs/regexps), and strategy patterns.
449-
// Pattern-level options (label, userData, method, etc.) are merged during this step.
379+
// Step 1: Filter request options by exclude patterns, user include patterns, and strategy patterns.
450380
let filteredOptions: RequestOptions[];
451381
if (urlPatternObjects.length === 0) {
452382
filteredOptions = filterRequestOptionsByPatterns(
@@ -565,7 +495,7 @@ export interface ResolveBaseUrl {
565495
}
566496

567497
/**
568-
* Internal function that changes the enqueue globs to match both http and https
498+
* Internal function that changes the enqueue glob patterns to match both http and https
569499
*/
570500
function ignoreHttpSchema(pattern: string): string {
571501
return pattern.replace(/^(https?):\/\//, 'http{s,}://');

0 commit comments

Comments
 (0)