@@ -6,27 +6,22 @@ import type { SetRequired } from 'type-fest';
66
77import type { RequestOptions } from '../request.js' ;
88import { Request } from '../request.js' ;
9- import { serviceLocator } from '../service_locator.js' ;
109import type {
1110 AddRequestsBatchedOptions ,
1211 AddRequestsBatchedResult ,
1312 RequestProvider ,
1413 RequestQueueOperationOptions ,
1514} from '../storages/request_provider.js' ;
1615import type {
17- GlobInput ,
18- PseudoUrlInput ,
19- RegExpInput ,
2016 RequestTransform ,
2117 SkippedRequestCallback ,
2218 SkippedRequestReason ,
19+ UrlPatternInput ,
2320 UrlPatternObject ,
2421} from './shared.js' ;
2522import {
2623 applyRequestTransform ,
27- constructGlobObjectsFromGlobs ,
28- constructRegExpObjectsFromPseudoUrls ,
29- constructRegExpObjectsFromRegExps ,
24+ constructUrlPatternObjects ,
3025 createRequestOptions ,
3126 filterRequestOptionsByPatterns ,
3227} from './shared.js' ;
@@ -50,8 +45,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
5045 /**
5146 * Sets {@apilink Request.label} for newly enqueued requests.
5247 *
53- * This option has the lowest priority and can be overwritten by request options
54- * specified in `globs`, `regexps`, or `pseudoUrls` objects, as well as by `transformRequestFunction`.
48+ * Can be overwritten by `transformRequestFunction`.
5549 */
5650 label ?: string ;
5751
@@ -68,65 +62,30 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
6862 baseUrl ?: string ;
6963
7064 /**
71- * An array of glob pattern strings or plain objects
72- * containing glob pattern strings matching the URLs to be enqueued.
65+ * An array of URL patterns that URLs must match to be enqueued.
7366 *
74- * The plain objects must include at least the `glob` property, which holds the glob pattern string.
75- * All remaining keys will be used as request options for the corresponding enqueued {@apilink Request} objects.
76- *
77- * The matching is always case-insensitive.
78- * If you need case-sensitive matching, use `regexps` property directly.
79- *
80- * If `globs` is an empty array or `undefined`, and `regexps` are also not defined, then the function
81- * enqueues the links with the same subdomain.
82- */
83- globs ?: readonly GlobInput [ ] ;
84-
85- /**
86- * An array of glob pattern strings, regexp patterns or plain objects
87- * containing patterns matching URLs that will **never** be enqueued.
88- *
89- * The plain objects must include either the `glob` property or the `regexp` property.
67+ * Accepts glob pattern strings, `{ glob: string }` objects, `RegExp` instances, or `{ regexp: RegExp }` objects.
9068 *
9169 * Glob matching is always case-insensitive.
92- * If you need case-sensitive matching, provide a regexp.
93- */
94- exclude ?: readonly ( GlobInput | RegExpInput ) [ ] ;
95-
96- /**
97- * An array of regular expressions or plain objects
98- * containing regular expressions matching the URLs to be enqueued.
99- *
100- * The plain objects must include at least the `regexp` property, which holds the regular expression.
101- * All remaining keys will be used as request options for the corresponding enqueued {@apilink Request} objects.
70+ * If you need case-sensitive matching, use a `RegExp`.
10271 *
103- * If `regexps ` is an empty array or `undefined`, and `globs` are also not defined , then the function
72+ * If `include` is an empty array or `undefined`, then the function
10473 * enqueues the links with the same subdomain.
10574 */
106- regexps ?: readonly RegExpInput [ ] ;
75+ include ?: readonly UrlPatternInput [ ] ;
10776
10877 /**
109- * *NOTE:* In future versions of SDK the options will be removed.
110- * Please use `globs` or `regexps` instead.
78+ * An array of URL patterns. Matching URLs will **not** be enqueued.
11179 *
112- * An array of {@apilink PseudoUrl} strings or plain objects
113- * containing {@apilink PseudoUrl} strings matching the URLs to be enqueued.
80+ * Accepts glob pattern strings, `{ glob: string }` objects, `RegExp` instances, or `{ regexp: RegExp }` objects.
11481 *
115- * The plain objects must include at least the `purl` property, which holds the pseudo-URL string.
116- * All remaining keys will be used as request options for the corresponding enqueued {@apilink Request} objects.
117- *
118- * With a pseudo-URL string, the matching is always case-insensitive.
119- * If you need case-sensitive matching, use `regexps` property directly.
120- *
121- * If `pseudoUrls` is an empty array or `undefined`, then the function
122- * enqueues the links with the same subdomain.
123- *
124- * @deprecated prefer using `globs` or `regexps` instead
82+ * Glob matching is always case-insensitive.
83+ * If you need case-sensitive matching, use a `RegExp`.
12584 */
126- pseudoUrls ?: readonly PseudoUrlInput [ ] ;
85+ exclude ?: readonly UrlPatternInput [ ] ;
12786
12887 /**
129- * After request options are filtered by patterns, this function can be used
88+ * After request options are filtered by `include`/`exclude` patterns, this function can be used
13089 * to remove them or modify their contents such as `userData`, `payload` or, most importantly, `uniqueKey`. This is useful
13190 * when you need to enqueue multiple `Requests` to the queue that share the same URL, but differ in methods or payloads,
13291 * or to dynamically update or create `userData`.
@@ -145,8 +104,8 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
145104 * }
146105 * ```
147106 *
148- * Note that `transformRequestFunction` has the highest priority and can overwrite request options
149- * specified in `globs`, `regexps`, or `pseudoUrls` objects, as well as the global `label` option.
107+ * Note that `transformRequestFunction` has the highest priority and can overwrite
108+ * the global `label` option.
150109 *
151110 * The function receives a {@apilink RequestOptions} object and can return either:
152111 * - The modified {@apilink RequestOptions} object
@@ -256,8 +215,7 @@ export enum EnqueueStrategy {
256215 * This function enqueues the provided URLs into the provided {@apilink RequestQueue}. If you want to automatically find and enqueue links,
257216 * you should use the context-aware `enqueueLinks` function provided on the crawler contexts.
258217 *
259- * Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions
260- * and override settings of the enqueued {@apilink Request} objects.
218+ * Optionally, the function allows you to filter the target links' URLs using an array of glob or regexp patterns.
261219 *
262220 * **Example usage**
263221 *
@@ -266,7 +224,7 @@ export enum EnqueueStrategy {
266224 * urls: aListOfFoundUrls,
267225 * requestQueue,
268226 * selector: 'a.product-detail',
269- * globs : [
227+ * include: [
270228 * 'https://www.example.com/handbags/*',
271229 * 'https://www.example.com/purses/*'
272230 * ],
@@ -295,6 +253,8 @@ export async function enqueueLinks(
295253 ) ;
296254 }
297255
256+ const urlPatternValidator = ow . any ( ow . string , ow . regExp , ow . object . hasKeys ( 'glob' ) , ow . object . hasKeys ( 'regexp' ) ) ;
257+
298258 ow (
299259 options as any ,
300260 ow . object . exactShape ( {
@@ -309,12 +269,8 @@ export async function enqueueLinks(
309269 baseUrl : ow . optional . string ,
310270 userData : ow . optional . object ,
311271 label : ow . optional . string ,
312- pseudoUrls : ow . optional . array . ofType ( ow . any ( ow . string , ow . object . hasKeys ( 'purl' ) ) ) ,
313- globs : ow . optional . array . ofType ( ow . any ( ow . string , ow . object . hasKeys ( 'glob' ) ) ) ,
314- exclude : ow . optional . array . ofType (
315- ow . any ( ow . string , ow . regExp , ow . object . hasKeys ( 'glob' ) , ow . object . hasKeys ( 'regexp' ) ) ,
316- ) ,
317- regexps : ow . optional . array . ofType ( ow . any ( ow . regExp , ow . object . hasKeys ( 'regexp' ) ) ) ,
272+ include : ow . optional . array . ofType ( urlPatternValidator ) ,
273+ exclude : ow . optional . array . ofType ( urlPatternValidator ) ,
318274 transformRequestFunction : ow . optional . function ,
319275 strategy : ow . optional . string . oneOf ( Object . values ( EnqueueStrategy ) ) ,
320276 waitForAllRequestsToBeAdded : ow . optional . boolean ,
@@ -325,42 +281,17 @@ export async function enqueueLinks(
325281 requestQueue,
326282 limit,
327283 urls,
328- pseudoUrls ,
284+ include ,
329285 exclude,
330- globs,
331- regexps,
332286 transformRequestFunction,
333287 forefront,
334288 waitForAllRequestsToBeAdded,
335289 robotsTxtFile,
336290 onSkippedRequest,
337291 } = options ;
338292
339- const urlExcludePatternObjects : UrlPatternObject [ ] = [ ] ;
340- const urlPatternObjects : UrlPatternObject [ ] = [ ] ;
341-
342- if ( exclude ?. length ) {
343- for ( const excl of exclude ) {
344- if ( typeof excl === 'string' || 'glob' in excl ) {
345- urlExcludePatternObjects . push ( ...constructGlobObjectsFromGlobs ( [ excl ] ) ) ;
346- } else if ( excl instanceof RegExp || 'regexp' in excl ) {
347- urlExcludePatternObjects . push ( ...constructRegExpObjectsFromRegExps ( [ excl ] ) ) ;
348- }
349- }
350- }
351-
352- if ( pseudoUrls ?. length ) {
353- serviceLocator . getLogger ( ) . deprecated ( '`pseudoUrls` option is deprecated, use `globs` or `regexps` instead' ) ;
354- urlPatternObjects . push ( ...constructRegExpObjectsFromPseudoUrls ( pseudoUrls ) ) ;
355- }
356-
357- if ( globs ?. length ) {
358- urlPatternObjects . push ( ...constructGlobObjectsFromGlobs ( globs ) ) ;
359- }
360-
361- if ( regexps ?. length ) {
362- urlPatternObjects . push ( ...constructRegExpObjectsFromRegExps ( regexps ) ) ;
363- }
293+ const urlExcludePatternObjects : UrlPatternObject [ ] = exclude ?. length ? constructUrlPatternObjects ( exclude ) : [ ] ;
294+ const urlPatternObjects : UrlPatternObject [ ] = include ?. length ? constructUrlPatternObjects ( include ) : [ ] ;
364295
365296 if ( ! urlPatternObjects . length ) {
366297 options . strategy ??= EnqueueStrategy . SameHostname ;
@@ -445,8 +376,7 @@ export async function enqueueLinks(
445376 async function createFilteredRequests ( ) {
446377 const skippedRequests : string [ ] = [ ] ;
447378
448- // Step 1: Filter request options by exclude patterns, user patterns (globs/regexps), and strategy patterns.
449- // Pattern-level options (label, userData, method, etc.) are merged during this step.
379+ // Step 1: Filter request options by exclude patterns, user include patterns, and strategy patterns.
450380 let filteredOptions : RequestOptions [ ] ;
451381 if ( urlPatternObjects . length === 0 ) {
452382 filteredOptions = filterRequestOptionsByPatterns (
@@ -565,7 +495,7 @@ export interface ResolveBaseUrl {
565495}
566496
567497/**
568- * Internal function that changes the enqueue globs to match both http and https
498+ * Internal function that changes the enqueue glob patterns to match both http and https
569499 */
570500function ignoreHttpSchema ( pattern : string ) : string {
571501 return pattern . replace ( / ^ ( h t t p s ? ) : \/ \/ / , 'http{s,}://' ) ;
0 commit comments