diff --git a/docs/deployment/apify_platform_init_exit.ts b/docs/deployment/apify_platform_init_exit.ts index 49a10f100f23..8058d81c457f 100644 --- a/docs/deployment/apify_platform_init_exit.ts +++ b/docs/deployment/apify_platform_init_exit.ts @@ -13,7 +13,7 @@ const crawler = new CheerioCrawler({ // Add URLs that match the provided pattern. await enqueueLinks({ - globs: ['https://www.iana.org/*'], + include: ['https://www.iana.org/*'], }); // Save extracted data to dataset. diff --git a/docs/deployment/apify_platform_main.ts b/docs/deployment/apify_platform_main.ts index a338047e86ea..507c7fe1c6e2 100644 --- a/docs/deployment/apify_platform_main.ts +++ b/docs/deployment/apify_platform_main.ts @@ -12,7 +12,7 @@ await Actor.main(async () => { // Add URLs that match the provided pattern. await enqueueLinks({ - globs: ['https://www.iana.org/*'], + include: ['https://www.iana.org/*'], }); // Save extracted data to dataset. diff --git a/docs/examples/crawl_some_links.mdx b/docs/examples/crawl_some_links.mdx index fb9cde71600e..b9b7cf85f949 100644 --- a/docs/examples/crawl_some_links.mdx +++ b/docs/examples/crawl_some_links.mdx @@ -7,7 +7,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import ApiLink from '@site/src/components/ApiLink'; import CrawlSource from '!!raw-loader!roa-loader!./crawl_some_links.ts'; -This `CheerioCrawler` example uses the `globs` property in the `enqueueLinks()` method to only add links to the `RequestQueue` queue if they match the specified pattern. +This `CheerioCrawler` example uses the `include` property in the `enqueueLinks()` method to only add links to the `RequestQueue` queue if they match the specified pattern. 
{CrawlSource} diff --git a/docs/examples/crawl_some_links.ts b/docs/examples/crawl_some_links.ts index 51912bb10f67..111f89165c0f 100644 --- a/docs/examples/crawl_some_links.ts +++ b/docs/examples/crawl_some_links.ts @@ -9,7 +9,7 @@ const crawler = new CheerioCrawler({ log.info(request.url); // Add some links from page to the crawler's RequestQueue await enqueueLinks({ - globs: ['http?(s)://crawlee.dev/*/*'], + include: ['http?(s)://crawlee.dev/*/*'], }); }, }); diff --git a/docs/examples/puppeteer_recursive_crawl.ts b/docs/examples/puppeteer_recursive_crawl.ts index ad48b324796b..dde92685ea5f 100644 --- a/docs/examples/puppeteer_recursive_crawl.ts +++ b/docs/examples/puppeteer_recursive_crawl.ts @@ -6,7 +6,7 @@ const crawler = new PuppeteerCrawler({ log.info(`Title of ${request.url}: ${title}`); await enqueueLinks({ - globs: ['http?(s)://www.iana.org/**'], + include: ['http?(s)://www.iana.org/**'], }); }, maxRequestsPerCrawl: 10, diff --git a/docs/introduction/03-adding-urls.mdx b/docs/introduction/03-adding-urls.mdx index 387b86fb9450..520dcc62ffd6 100644 --- a/docs/introduction/03-adding-urls.mdx +++ b/docs/introduction/03-adding-urls.mdx @@ -130,7 +130,7 @@ await enqueueLinks({ ### Filter URLs with patterns -For even more control, you can use `globs`, `regexps` and `pseudoUrls` to filter the URLs. Each of those arguments is always an `Array`, but the contents can take on many forms. See the reference for more information about them as well as other options. +For even more control, you can use `include` and `exclude` to filter the URLs. Each accepts an `Array` of glob pattern strings, `{ glob: string }` objects, `RegExp` instances, or `{ regexp: RegExp }` objects. See the reference for more information about them as well as other options. 
:::caution Defaults override @@ -140,17 +140,17 @@ If you provide one of those options, the default `same-hostname` strategy will * ```ts await enqueueLinks({ - globs: ['http?(s)://apify.com/*/*'], + include: ['http?(s)://apify.com/*/*'], }); ``` ### Transform requests -To have absolute control, we have the `transformRequestFunction`. Just before a new `Request` is constructed and enqueued to the `RequestQueue`, this function can be used to skip it or modify its contents such as `userData`, `payload` or, most importantly, `uniqueKey`. This is useful when you need to enqueue multiple requests to the queue, and these requests share the same URL, but differ in methods or payloads. Another use case is to dynamically update or create the `userData`. +To have absolute control, we have the `transformRequestFunction`. After request options are filtered by `include`/`exclude` patterns, this function can be used to skip them or modify their contents such as `userData`, `payload` or, most importantly, `uniqueKey`. This is useful when you need to enqueue multiple requests to the queue, and these requests share the same URL, but differ in methods or payloads. Another use case is to dynamically update or create the `userData`. ```ts await enqueueLinks({ - globs: ['http?(s)://apify.com/*/*'], + include: ['http?(s)://apify.com/*/*'], transformRequestFunction(req) { // ignore all links ending with `.pdf` if (req.url.endsWith('.pdf')) return false; diff --git a/docs/upgrading/upgrading_v3.md b/docs/upgrading/upgrading_v3.md index 39b6091c9249..21e6d1483a1e 100644 --- a/docs/upgrading/upgrading_v3.md +++ b/docs/upgrading/upgrading_v3.md @@ -188,14 +188,13 @@ One common helper that received more attention is the `enqueueLinks`. As mention This means we can even call `enqueueLinks()` without any parameters. By default, it will go through all the links found on current page and filter only those targeting the same subdomain. 
-Moreover, we can specify patterns the URL should match via globs: +Moreover, we can specify patterns the URL should match via `include`: ```ts const crawler = new PlaywrightCrawler({ async requestHandler({ enqueueLinks }) { await enqueueLinks({ - globs: ['https://crawlee.dev/*/*'], - // we can also use `regexps` and `pseudoUrls` keys here + include: ['https://crawlee.dev/*/*'], }); }, }); @@ -231,7 +230,7 @@ Labeling requests used to work via the `Request.userData` object. With Crawlee, async requestHandler({ request, enqueueLinks }) { if (request.label !== 'DETAIL') { await enqueueLinks({ - globs: ['...'], + include: ['...'], label: 'DETAIL', }); } diff --git a/packages/core/package.json b/packages/core/package.json index 8e6778b7644c..e1a4cedb9e90 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -50,7 +50,6 @@ "@apify/consts": "^2.41.0", "@apify/datastructures": "^2.0.3", "@apify/log": "^2.5.18", - "@apify/pseudo_url": "^2.0.59", "@apify/timeout": "^0.3.2", "@apify/utilities": "^2.15.5", "@crawlee/memory-storage": "workspace:*", diff --git a/packages/core/src/crawlers/crawler_commons.ts b/packages/core/src/crawlers/crawler_commons.ts index 25de68e03961..975ba7223f29 100644 --- a/packages/core/src/crawlers/crawler_commons.ts +++ b/packages/core/src/crawlers/crawler_commons.ts @@ -58,8 +58,7 @@ export interface RestrictedCrawlingContext exten * This function automatically finds and enqueues links from the current page, adding them to the {@apilink RequestQueue} * currently used by the crawler. * - * Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions - * and override settings of the enqueued {@apilink Request} objects. + * Optionally, the function allows you to filter the target links' URLs using an array of glob or regexp patterns. 
* * Check out the [Crawl a website with relative links](https://crawlee.dev/js/docs/examples/crawl-relative-links) example * for more details regarding its usage. @@ -127,7 +125,7 @@ export interface CrawlingContext exten * ```ts * async requestHandler({ enqueueLinks }) { * await enqueueLinks({ - * globs: [ + * include: [ * 'https://www.example.com/handbags/*', * ], * }); diff --git a/packages/core/src/enqueue_links/enqueue_links.ts b/packages/core/src/enqueue_links/enqueue_links.ts index 3387e5155d97..37da24c26d22 100644 --- a/packages/core/src/enqueue_links/enqueue_links.ts +++ b/packages/core/src/enqueue_links/enqueue_links.ts @@ -6,7 +6,6 @@ import type { SetRequired } from 'type-fest'; import type { RequestOptions } from '../request.js'; import { Request } from '../request.js'; -import { serviceLocator } from '../service_locator.js'; import type { AddRequestsBatchedOptions, AddRequestsBatchedResult, @@ -14,19 +13,15 @@ import type { RequestQueueOperationOptions, } from '../storages/request_provider.js'; import type { - GlobInput, - PseudoUrlInput, - RegExpInput, RequestTransform, SkippedRequestCallback, SkippedRequestReason, + UrlPatternInput, UrlPatternObject, } from './shared.js'; import { applyRequestTransform, - constructGlobObjectsFromGlobs, - constructRegExpObjectsFromPseudoUrls, - constructRegExpObjectsFromRegExps, + constructUrlPatternObjects, createRequestOptions, filterRequestOptionsByPatterns, } from './shared.js'; @@ -50,8 +45,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions { /** * Sets {@apilink Request.label} for newly enqueued requests. * - * This option has the lowest priority and can be overwritten by request options - * specified in `globs`, `regexps`, or `pseudoUrls` objects, as well as by `transformRequestFunction`. + * Can be overwritten by `transformRequestFunction`. 
*/ label?: string; @@ -71,65 +65,30 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions { baseUrl?: string; /** - * An array of glob pattern strings or plain objects - * containing glob pattern strings matching the URLs to be enqueued. + * An array of URL patterns that URLs must match to be enqueued. * - * The plain objects must include at least the `glob` property, which holds the glob pattern string. - * All remaining keys will be used as request options for the corresponding enqueued {@apilink Request} objects. - * - * The matching is always case-insensitive. - * If you need case-sensitive matching, use `regexps` property directly. - * - * If `globs` is an empty array or `undefined`, and `regexps` are also not defined, then the function - * enqueues the links with the same subdomain. - */ - globs?: readonly GlobInput[]; - - /** - * An array of glob pattern strings, regexp patterns or plain objects - * containing patterns matching URLs that will **never** be enqueued. - * - * The plain objects must include either the `glob` property or the `regexp` property. + * Accepts glob pattern strings, `{ glob: string }` objects, `RegExp` instances, or `{ regexp: RegExp }` objects. * * Glob matching is always case-insensitive. - * If you need case-sensitive matching, provide a regexp. - */ - exclude?: readonly (GlobInput | RegExpInput)[]; - - /** - * An array of regular expressions or plain objects - * containing regular expressions matching the URLs to be enqueued. - * - * The plain objects must include at least the `regexp` property, which holds the regular expression. - * All remaining keys will be used as request options for the corresponding enqueued {@apilink Request} objects. + * If you need case-sensitive matching, use a `RegExp`. 
* - * If `regexps` is an empty array or `undefined`, and `globs` are also not defined, then the function + * If `include` is an empty array or `undefined`, then the function * enqueues the links with the same subdomain. */ - regexps?: readonly RegExpInput[]; + include?: readonly UrlPatternInput[]; /** - * *NOTE:* In future versions of SDK the options will be removed. - * Please use `globs` or `regexps` instead. + * An array of URL patterns. Matching URLs will **not** be enqueued. * - * An array of {@apilink PseudoUrl} strings or plain objects - * containing {@apilink PseudoUrl} strings matching the URLs to be enqueued. + * Accepts glob pattern strings, `{ glob: string }` objects, `RegExp` instances, or `{ regexp: RegExp }` objects. * - * The plain objects must include at least the `purl` property, which holds the pseudo-URL string. - * All remaining keys will be used as request options for the corresponding enqueued {@apilink Request} objects. - * - * With a pseudo-URL string, the matching is always case-insensitive. - * If you need case-sensitive matching, use `regexps` property directly. - * - * If `pseudoUrls` is an empty array or `undefined`, then the function - * enqueues the links with the same subdomain. - * - * @deprecated prefer using `globs` or `regexps` instead + * Glob matching is always case-insensitive. + * If you need case-sensitive matching, use a `RegExp`. */ - pseudoUrls?: readonly PseudoUrlInput[]; + exclude?: readonly UrlPatternInput[]; /** - * After request options are filtered by patterns, this function can be used + * After request options are filtered by `include`/`exclude` patterns, this function can be used * to remove them or modify their contents such as `userData`, `payload` or, most importantly `uniqueKey`. This is useful * when you need to enqueue multiple `Requests` to the queue that share the same URL, but differ in methods or payloads, * or to dynamically update or create `userData`. 
@@ -148,8 +107,8 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions { * } * ``` * - * Note that `transformRequestFunction` has the highest priority and can overwrite request options - * specified in `globs`, `regexps`, or `pseudoUrls` objects, as well as the global `label` option. + * Note that `transformRequestFunction` has the highest priority and can overwrite + * the global `label` option. * * The function receives a {@apilink RequestOptions} object and can return either: * - The modified {@apilink RequestOptions} object @@ -259,8 +218,7 @@ export enum EnqueueStrategy { * This function enqueues the urls provided to the {@apilink RequestQueue} provided. If you want to automatically find and enqueue links, * you should use the context-aware `enqueueLinks` function provided on the crawler contexts. * - * Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions - * and override settings of the enqueued {@apilink Request} objects. + * Optionally, the function allows you to filter the target links' URLs using an array of glob or regexp patterns. 
* * **Example usage** * @@ -269,7 +227,7 @@ export enum EnqueueStrategy { * urls: aListOfFoundUrls, * requestQueue, * selector: 'a.product-detail', - * globs: [ + * include: [ * 'https://www.example.com/handbags/*', * 'https://www.example.com/purses/*' * ], @@ -298,6 +256,8 @@ export async function enqueueLinks( ); } + const urlPatternValidator = ow.any(ow.string, ow.regExp, ow.object.hasKeys('glob'), ow.object.hasKeys('regexp')); + ow( options as any, ow.object.exactShape({ @@ -313,12 +273,8 @@ export async function enqueueLinks( baseUrl: ow.optional.string, userData: ow.optional.object, label: ow.optional.string, - pseudoUrls: ow.optional.array.ofType(ow.any(ow.string, ow.object.hasKeys('purl'))), - globs: ow.optional.array.ofType(ow.any(ow.string, ow.object.hasKeys('glob'))), - exclude: ow.optional.array.ofType( - ow.any(ow.string, ow.regExp, ow.object.hasKeys('glob'), ow.object.hasKeys('regexp')), - ), - regexps: ow.optional.array.ofType(ow.any(ow.regExp, ow.object.hasKeys('regexp'))), + include: ow.optional.array.ofType(urlPatternValidator), + exclude: ow.optional.array.ofType(urlPatternValidator), transformRequestFunction: ow.optional.function, strategy: ow.optional.string.oneOf(Object.values(EnqueueStrategy)), waitForAllRequestsToBeAdded: ow.optional.boolean, @@ -329,11 +285,8 @@ export async function enqueueLinks( requestQueue, limit, urls, - // oxlint-disable-next-line typescript/no-deprecated -- still accepted for backwards compat - pseudoUrls, + include, exclude, - globs, - regexps, transformRequestFunction, forefront, waitForAllRequestsToBeAdded, @@ -341,31 +294,8 @@ export async function enqueueLinks( onSkippedRequest, } = options; - const urlExcludePatternObjects: UrlPatternObject[] = []; - const urlPatternObjects: UrlPatternObject[] = []; - - if (exclude?.length) { - for (const excl of exclude) { - if (typeof excl === 'string' || 'glob' in excl) { - urlExcludePatternObjects.push(...constructGlobObjectsFromGlobs([excl])); - } else if (excl instanceof 
RegExp || 'regexp' in excl) { - urlExcludePatternObjects.push(...constructRegExpObjectsFromRegExps([excl])); - } - } - } - - if (pseudoUrls?.length) { - serviceLocator.getLogger().deprecated('`pseudoUrls` option is deprecated, use `globs` or `regexps` instead'); - urlPatternObjects.push(...constructRegExpObjectsFromPseudoUrls(pseudoUrls)); - } - - if (globs?.length) { - urlPatternObjects.push(...constructGlobObjectsFromGlobs(globs)); - } - - if (regexps?.length) { - urlPatternObjects.push(...constructRegExpObjectsFromRegExps(regexps)); - } + const urlExcludePatternObjects: UrlPatternObject[] = exclude?.length ? constructUrlPatternObjects(exclude) : []; + const urlPatternObjects: UrlPatternObject[] = include?.length ? constructUrlPatternObjects(include) : []; if (!urlPatternObjects.length) { options.strategy ??= EnqueueStrategy.SameHostname; @@ -450,8 +380,7 @@ export async function enqueueLinks( async function createFilteredRequests() { const skippedRequests: string[] = []; - // Step 1: Filter request options by exclude patterns, user patterns (globs/regexps), and strategy patterns. - // Pattern-level options (label, userData, method, etc.) are merged during this step. + // Step 1: Filter request options by exclude patterns, user include patterns, and strategy patterns. 
let filteredOptions: RequestOptions[]; if (urlPatternObjects.length === 0) { filteredOptions = filterRequestOptionsByPatterns( @@ -570,7 +499,7 @@ export interface ResolveBaseUrl { } /** - * Internal function that changes the enqueue globs to match both http and https + * Internal function that changes the enqueue glob patterns to match both http and https */ function ignoreHttpSchema(pattern: string): string { return pattern.replace(/^(https?):\/\//, 'http{s,}://'); diff --git a/packages/core/src/enqueue_links/shared.ts b/packages/core/src/enqueue_links/shared.ts index e47b73db6d4c..b3ceff024514 100644 --- a/packages/core/src/enqueue_links/shared.ts +++ b/packages/core/src/enqueue_links/shared.ts @@ -3,8 +3,6 @@ import { URL } from 'node:url'; import type { Awaitable } from '@crawlee/types'; import { Minimatch } from 'minimatch'; -import { purlToRegExp } from '@apify/pseudo_url'; - import type { RequestOptions } from '../request.js'; import type { EnqueueLinksOptions } from './enqueue_links.js'; @@ -13,39 +11,33 @@ export { tryAbsoluteURL } from '@crawlee/utils'; const MAX_ENQUEUE_LINKS_CACHE_SIZE = 1000; /** - * To enable direct use of the Actor UI `globs`/`regexps`/`pseudoUrls` output while keeping high performance, + * To enable direct use of the Actor UI `include`/`exclude` output while keeping high performance, * all the regexps from the output are only constructed once and kept in a cache * by the `enqueueLinks()` function. 
* @ignore */ const enqueueLinksPatternCache = new Map(); -export type UrlPatternObject = { +export interface UrlPatternObject { glob?: string; regexp?: RegExp; -} & Pick; - -export type PseudoUrlObject = { purl: string } & Pick< - RequestOptions, - 'method' | 'payload' | 'label' | 'userData' | 'headers' ->; - -export type PseudoUrlInput = string | PseudoUrlObject; +} -export type GlobObject = { glob: string } & Pick< - RequestOptions, - 'method' | 'payload' | 'label' | 'userData' | 'headers' ->; +export interface GlobObject { + glob: string; +} export type GlobInput = string | GlobObject; -export type RegExpObject = { regexp: RegExp } & Pick< - RequestOptions, - 'method' | 'payload' | 'label' | 'userData' | 'headers' ->; +export interface RegExpObject { + regexp: RegExp; +} export type RegExpInput = RegExp | RegExpObject; +/** Unified URL pattern input — accepts glob strings, glob objects, RegExp instances, or regexp objects. */ +export type UrlPatternInput = GlobInput | RegExpInput; + export type SkippedRequestReason = | 'robotsTxt' | 'limit' @@ -61,7 +53,7 @@ export type SkippedRequestCallback = (args: { url: string; reason: SkippedReques * @ignore */ export function updateEnqueueLinksPatternCache( - item: GlobInput | RegExpInput | PseudoUrlInput, + item: GlobInput | RegExpInput, pattern: RegExpObject | GlobObject, ): void { enqueueLinksPatternCache.set(item, pattern); @@ -71,30 +63,6 @@ export function updateEnqueueLinksPatternCache( } } -/** - * Helper factory used in the `enqueueLinks()` and enqueueLinksByClickingElements() function - * to construct RegExps from PseudoUrl strings. - * @ignore - */ -export function constructRegExpObjectsFromPseudoUrls(pseudoUrls: readonly PseudoUrlInput[]): RegExpObject[] { - return pseudoUrls.map((item) => { - // Get pseudoUrl object from cache. 
- let regexpObject = enqueueLinksPatternCache.get(item); - if (regexpObject) return regexpObject; - - if (typeof item === 'string') { - regexpObject = { regexp: purlToRegExp(item) }; - } else { - const { purl, ...requestOptions } = item; - regexpObject = { regexp: purlToRegExp(purl), ...requestOptions }; - } - - updateEnqueueLinksPatternCache(item, regexpObject); - - return regexpObject; - }); -} - /** * Helper factory used in the `enqueueLinks()` and enqueueLinksByClickingElements() function * to construct Glob objects from Glob pattern strings. @@ -126,8 +94,7 @@ export function constructGlobObjectsFromGlobs(globs: readonly GlobInput[]): Glob if (typeof item === 'string') { globObject = { glob: validateGlobPattern(item) }; } else { - const { glob, ...requestOptions } = item; - globObject = { glob: validateGlobPattern(glob), ...requestOptions }; + globObject = { glob: validateGlobPattern(item.glob) }; } updateEnqueueLinksPatternCache(item, globObject); @@ -160,7 +127,7 @@ export function constructRegExpObjectsFromRegExps(regexps: readonly RegExpInput[ if (item instanceof RegExp) { regexpObject = { regexp: item }; } else { - regexpObject = item; + regexpObject = { regexp: item.regexp }; } updateEnqueueLinksPatternCache(item, regexpObject); @@ -170,8 +137,26 @@ export function constructRegExpObjectsFromRegExps(regexps: readonly RegExpInput[ } /** - * Filters request options by URL patterns and merges pattern-level options (label, userData, method, payload, headers) - * from the first matching pattern into each RequestOptions entry. + * Helper factory used by `enqueueLinks()` and `SitemapRequestList` to construct UrlPatternObjects + * from a mixed array of glob strings, glob objects, RegExp instances, and regexp objects. 
+ * @ignore + */ +export function constructUrlPatternObjects(patterns: readonly UrlPatternInput[]): UrlPatternObject[] { + const result: UrlPatternObject[] = []; + + for (const item of patterns) { + if (typeof item === 'string' || 'glob' in item) { + result.push(...constructGlobObjectsFromGlobs([item])); + } else if (item instanceof RegExp || 'regexp' in item) { + result.push(...constructRegExpObjectsFromRegExps([item])); + } + } + + return result; +} + +/** + * Filters request options by URL patterns. * * When `includePatterns` is empty/undefined, all options pass through (only exclude filtering applies). * @ignore @@ -199,9 +184,9 @@ export function filterRequestOptionsByPatterns( return { ...opts, enqueueStrategy: strategy }; } - for (const { match, glob, regexp, ...patternOptions } of includeMatchers) { + for (const { match } of includeMatchers) { if (match(opts.url)) { - return { ...opts, ...patternOptions, enqueueStrategy: strategy }; + return { ...opts, enqueueStrategy: strategy }; } } @@ -263,7 +248,7 @@ export function createRequestOptions( */ function createPatternObjectMatcher(urlPatternObject: UrlPatternObject) { const { regexp, glob } = urlPatternObject; - let match; + let match: (url: string) => boolean; if (regexp) { match = (url: string) => regexp.test(url); } else if (glob) { @@ -272,7 +257,7 @@ function createPatternObjectMatcher(urlPatternObject: UrlPatternObject) { } else { match = () => false; } - return { ...urlPatternObject, match }; + return { match }; } /** diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 9e64ff774e18..cd104f9fc837 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -15,5 +15,4 @@ export * from './storages/index.js'; export * from './validators.js'; export * from './cookie_utils.js'; export * from './recoverable_state.js'; -export { PseudoUrl } from '@apify/pseudo_url'; export type { Dictionary, Awaitable, Constructor, StorageClient, Cookie, QueueOperationInfo } from 
'@crawlee/types'; diff --git a/packages/core/src/storages/sitemap_request_list.ts b/packages/core/src/storages/sitemap_request_list.ts index b8cc78265742..44fdb2494f50 100644 --- a/packages/core/src/storages/sitemap_request_list.ts +++ b/packages/core/src/storages/sitemap_request_list.ts @@ -6,8 +6,8 @@ import { minimatch } from 'minimatch'; import ow from 'ow'; import type { RequiredDeep } from 'type-fest'; -import type { GlobInput, RegExpInput, UrlPatternObject } from '../enqueue_links/shared.js'; -import { constructGlobObjectsFromGlobs, constructRegExpObjectsFromRegExps } from '../enqueue_links/shared.js'; +import type { UrlPatternInput, UrlPatternObject } from '../enqueue_links/shared.js'; +import { constructUrlPatternObjects } from '../enqueue_links/shared.js'; import { type EventManager, EventType } from '../events/event_manager.js'; import type { CrawleeLogger } from '../log.js'; import { Request } from '../request.js'; @@ -21,40 +21,27 @@ const STATE_PERSISTENCE_KEY = 'SITEMAP_REQUEST_LIST_STATE'; interface UrlConstraints { /** - * An array of glob pattern strings or plain objects - * containing glob pattern strings matching the URLs to be enqueued. + * An array of URL patterns that URLs must match to be included. * - * The plain objects must include at least the `glob` property, which holds the glob pattern string. + * Accepts glob pattern strings, `{ glob: string }` objects, `RegExp` instances, or `{ regexp: RegExp }` objects. * - * The matching is always case-insensitive. - * If you need case-sensitive matching, use `regexps` property directly. + * Glob matching is always case-insensitive. + * If you need case-sensitive matching, use a `RegExp`. * - * If `globs` is an empty array or `undefined`, and `regexps` are also not defined, then the `SitemapRequestList` + * If `include` is an empty array or `undefined`, then the `SitemapRequestList` * includes all the URLs from the sitemap. 
*/ - globs?: readonly GlobInput[]; + include?: readonly UrlPatternInput[]; /** - * An array of glob pattern strings, regexp patterns or plain objects - * containing patterns matching URLs that will **never** be included. + * An array of URL patterns. Matching URLs will **not** be included. * - * The plain objects must include either the `glob` property or the `regexp` property. + * Accepts glob pattern strings, `{ glob: string }` objects, `RegExp` instances, or `{ regexp: RegExp }` objects. * * Glob matching is always case-insensitive. - * If you need case-sensitive matching, provide a regexp. - */ - exclude?: readonly (GlobInput | RegExp)[]; - - /** - * An array of regular expressions or plain objects - * containing regular expressions matching the URLs to be enqueued. - * - * The plain objects must include at least the `regexp` property, which holds the regular expression. - * - * If `regexps` is an empty array or `undefined`, and `globs` are also not defined, then the `SitemapRequestList` - * includes all the URLs from the sitemap. + * If you need case-sensitive matching, use a `RegExp`. 
*/ - regexps?: readonly RegExpInput[]; + exclude?: readonly UrlPatternInput[]; } export interface SitemapRequestListOptions extends UrlConstraints { @@ -207,6 +194,13 @@ export class SitemapRequestList implements IRequestList { /** @internal */ private constructor(options: SitemapRequestListOptions) { + const urlPatternValidator = ow.any( + ow.string, + ow.regExp, + ow.object.hasKeys('glob'), + ow.object.hasKeys('regexp'), + ); + ow( options, ow.object.exactShape({ @@ -217,36 +211,24 @@ export class SitemapRequestList implements IRequestList { timeoutMillis: ow.optional.number, maxBufferSize: ow.optional.number, parseSitemapOptions: ow.optional.object, - globs: ow.optional.array.ofType(ow.any(ow.string, ow.object.hasKeys('glob'))), - exclude: ow.optional.array.ofType( - ow.any(ow.string, ow.regExp, ow.object.hasKeys('glob'), ow.object.hasKeys('regexp')), - ), - regexps: ow.optional.array.ofType(ow.any(ow.regExp, ow.object.hasKeys('regexp'))), + include: ow.optional.array.ofType(urlPatternValidator), + exclude: ow.optional.array.ofType(urlPatternValidator), config: ow.optional.object, persistenceOptions: ow.optional.object, + httpClient: ow.optional.object, }), ); - const { globs, exclude, regexps } = options; + const { include, exclude } = options; this.log = serviceLocator.getLogger().child({ prefix: 'SitemapRequestList' }); if (exclude?.length) { - for (const excl of exclude) { - if (typeof excl === 'string' || 'glob' in excl) { - this.urlExcludePatternObjects.push(...constructGlobObjectsFromGlobs([excl])); - } else if (excl instanceof RegExp || 'regexp' in excl) { - this.urlExcludePatternObjects.push(...constructRegExpObjectsFromRegExps([excl])); - } - } - } - - if (globs?.length) { - this.urlPatternObjects.push(...constructGlobObjectsFromGlobs(globs)); + this.urlExcludePatternObjects.push(...constructUrlPatternObjects(exclude)); } - if (regexps?.length) { - this.urlPatternObjects.push(...constructRegExpObjectsFromRegExps(regexps)); + if (include?.length) { + 
this.urlPatternObjects.push(...constructUrlPatternObjects(include)); } this.persistStateKey = options.persistStateKey; @@ -291,7 +273,7 @@ export class SitemapRequestList implements IRequestList { } /** - * Checks whether the URL matches the `globs` / `regexps` / `exclude` provided in the `options`. + * Checks whether the URL matches the `include` / `exclude` patterns provided in the `options`. * @param url URL to be checked. * @returns `true` if the URL matches the patterns, `false` otherwise. */ diff --git a/packages/core/test/enqueue_links/user-provided-patterns-with-enqueue-strategy.test.ts b/packages/core/test/enqueue_links/user-provided-patterns-with-enqueue-strategy.test.ts index bac01890ff5a..300d682ca22c 100644 --- a/packages/core/test/enqueue_links/user-provided-patterns-with-enqueue-strategy.test.ts +++ b/packages/core/test/enqueue_links/user-provided-patterns-with-enqueue-strategy.test.ts @@ -61,15 +61,15 @@ describe('enqueueLinks() - combining user patterns with enqueue strategies', () $ = load(HTML); }); - test('works with globs and same domain strategy', async () => { + test('works with include and same domain strategy', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); - const globs = ['**/first']; + const include = ['**/first']; await cheerioCrawlerEnqueueLinks({ options: { selector: '.click', - globs, + include, strategy: EnqueueStrategy.SameDomain, }, $, @@ -82,15 +82,15 @@ describe('enqueueLinks() - combining user patterns with enqueue strategies', () expect(enqueued[0].url).toBe('https://example.com/a/b/first'); }); - test('works with globs and all domains strategy', async () => { + test('works with include and all domains strategy', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); - const globs = ['**/first']; + const include = ['**/first']; await cheerioCrawlerEnqueueLinks({ options: { selector: '.click', - globs, + include, strategy: EnqueueStrategy.All, }, $, @@ -122,16 +122,16 @@ 
describe('enqueueLinks() - combining user patterns with enqueue strategies', () expect(enqueued[1].url).toBe('https://example.com/a/b/third'); }); - test('works with globs and exclude', async () => { + test('works with include and exclude', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); - const globs = ['**/first']; + const include = ['**/first']; const exclude = ['**/first']; await cheerioCrawlerEnqueueLinks({ options: { selector: '.click', - globs, + include, exclude, }, $, diff --git a/packages/playwright-crawler/src/internals/enqueue-links/click-elements.ts b/packages/playwright-crawler/src/internals/enqueue-links/click-elements.ts index 6a766b37d329..eb9b8e6c1839 100644 --- a/packages/playwright-crawler/src/internals/enqueue-links/click-elements.ts +++ b/packages/playwright-crawler/src/internals/enqueue-links/click-elements.ts @@ -1,20 +1,16 @@ import { URL } from 'node:url'; import type { - GlobInput, - PseudoUrlInput, - RegExpInput, RequestOptions, RequestProvider, RequestTransform, SkippedRequestCallback, + UrlPatternInput, UrlPatternObject, } from '@crawlee/browser'; import { applyRequestTransform, - constructGlobObjectsFromGlobs, - constructRegExpObjectsFromPseudoUrls, - constructRegExpObjectsFromRegExps, + constructUrlPatternObjects, createRequestOptions, filterRequestOptionsByPatterns, Request as CrawleeRequest, @@ -58,68 +54,31 @@ export interface EnqueueLinksByClickingElementsOptions { clickOptions?: ClickOptions; /** - * An array of glob pattern strings or plain objects - * containing glob pattern strings matching the URLs to be enqueued. + * An array of URL patterns that URLs must match to be enqueued. * - * The plain objects must include at least the `glob` property, which holds the glob pattern string. - * All remaining keys will be used as request options for the corresponding enqueued {@apilink Request} objects. - * - * The matching is always case-insensitive. 
- * If you need case-sensitive matching, use `regexps` property directly. - * - * If `globs` is an empty array or `undefined`, then the function - * enqueues all the intercepted navigation requests produced by the page - * after clicking on elements matching the provided CSS selector. - */ - globs?: GlobInput[]; - - /** - * An array of glob pattern strings, regexp patterns or plain objects - * containing patterns matching URLs that will **never** be enqueued. - * - * The plain objects must include either the `glob` property or the `regexp` property. + * Accepts glob pattern strings, `{ glob: string }` objects, `RegExp` instances, or `{ regexp: RegExp }` objects. * * Glob matching is always case-insensitive. - * If you need case-sensitive matching, provide a regexp. - */ - exclude?: readonly (GlobInput | RegExpInput)[]; - - /** - * An array of regular expressions or plain objects - * containing regular expressions matching the URLs to be enqueued. - * - * The plain objects must include at least the `regexp` property, which holds the regular expression. - * All remaining keys will be used as request options for the corresponding enqueued {@apilink Request} objects. + * If you need case-sensitive matching, use a `RegExp`. * - * If `regexps` is an empty array or `undefined`, then the function + * If `include` is an empty array or `undefined`, then the function * enqueues all the intercepted navigation requests produced by the page * after clicking on elements matching the provided CSS selector. */ - regexps?: RegExpInput[]; + include?: UrlPatternInput[]; /** - * *NOTE:* In future versions of SDK the options will be removed. - * Please use `globs` or `regexps` instead. + * An array of URL patterns. Matching URLs will **not** be enqueued. * - * An array of {@apilink PseudoUrl} strings or plain objects - * containing {@apilink PseudoUrl} strings matching the URLs to be enqueued. 
+ * Accepts glob pattern strings, `{ glob: string }` objects, `RegExp` instances, or `{ regexp: RegExp }` objects. * - * The plain objects must include at least the `purl` property, which holds the pseudo-URL pattern string. - * All remaining keys will be used as request options for the corresponding enqueued {@apilink Request} objects. - * - * With a pseudo-URL string, the matching is always case-insensitive. - * If you need case-sensitive matching, use `regexps` property directly. - * - * If `pseudoUrls` is an empty array or `undefined`, then the function - * enqueues all the intercepted navigation requests produced by the page - * after clicking on elements matching the provided CSS selector. - * - * @deprecated prefer using `globs` or `regexps` instead + * Glob matching is always case-insensitive. + * If you need case-sensitive matching, use a `RegExp`. */ - pseudoUrls?: PseudoUrlInput[]; + exclude?: readonly UrlPatternInput[]; /** - * After {@apilink Request} objects are constructed and filtered by URL patterns (`globs`, `regexps`, `pseudoUrls`), + * After request options are filtered by `include`/`exclude` patterns, * this function can be used to remove them or modify their contents such as `userData`, `payload` or, most importantly * `uniqueKey`. This is useful when you need to enqueue multiple `Requests` to the queue that share the same URL, * but differ in methods or payloads, or to dynamically update or create `userData`. @@ -134,8 +93,8 @@ export interface EnqueueLinksByClickingElementsOptions { * } * ``` * - * Note that `transformRequestFunction` has the highest priority and can overwrite request options - * specified in `globs`, `regexps`, or `pseudoUrls` objects, as well as the global `label` option. + * Note that `transformRequestFunction` has the highest priority and can overwrite + * the global `label` option. 
 * * The function receives a {@apilink RequestOptions} object and can return either: * - The modified {@apilink RequestOptions} object @@ -202,8 +161,7 @@ export interface EnqueueLinksByClickingElementsOptions { * in `href` elements, but rather navigations are triggered in click handlers. * If you're looking to find URLs in `href` attributes of the page, see {@apilink enqueueLinks}. * - * Optionally, the function allows you to filter the target links' URLs using an array of {@apilink PseudoUrl} objects - * and override settings of the enqueued {@apilink Request} objects. + * Optionally, the function allows you to filter the target links' URLs using an array of glob or regexp patterns. * * **IMPORTANT**: To be able to do this, this function uses various mutations on the page, * such as changing the Z-index of elements being clicked and their visibility. Therefore, @@ -225,9 +183,9 @@ export interface EnqueueLinksByClickingElementsOptions { * page, * requestQueue, * selector: 'a.product-detail', - * pseudoUrls: [ - * 'https://www.example.com/handbags/[.*]' - * 'https://www.example.com/purses/[.*]' + * include: [ + * 'https://www.example.com/handbags/**', + * 'https://www.example.com/purses/**', * ], * }); * ``` @@ -237,6 +195,8 @@ export interface EnqueueLinksByClickingElementsOptions { export async function enqueueLinksByClickingElements( options: EnqueueLinksByClickingElementsOptions, ): Promise { + const urlPatternValidator = ow.any(ow.string, ow.regExp, ow.object.hasKeys('glob'), ow.object.hasKeys('regexp')); + ow( options, ow.object.exactShape({ @@ -245,12 +205,8 @@ export async function enqueueLinksByClickingElements( selector: ow.string, userData: ow.optional.object, clickOptions: ow.optional.object.hasKeys('clickCount', 'delay'), - pseudoUrls: ow.optional.array.ofType(ow.any(ow.string, ow.object.hasKeys('purl'))), - globs: ow.optional.array.ofType(ow.any(ow.string, ow.object.hasKeys('glob'))), - regexps: ow.optional.array.ofType(ow.any(ow.regExp, 
ow.object.hasKeys('regexp'))), - exclude: ow.optional.array.ofType( - ow.any(ow.string, ow.regExp, ow.object.hasKeys('glob'), ow.object.hasKeys('regexp')), - ), + include: ow.optional.array.ofType(urlPatternValidator), + exclude: ow.optional.array.ofType(urlPatternValidator), transformRequestFunction: ow.optional.function, waitForPageIdleSecs: ow.optional.number, maxWaitForPageIdleSecs: ow.optional.number, @@ -266,46 +222,20 @@ export async function enqueueLinksByClickingElements( requestQueue, selector, clickOptions, - // oxlint-disable-next-line typescript/no-deprecated -- still accepted for backwards compat - pseudoUrls, - globs, - regexps, + include, + exclude, transformRequestFunction, waitForPageIdleSecs = 1, maxWaitForPageIdleSecs = 5, forefront, - exclude, onSkippedRequest, } = options; const waitForPageIdleMillis = waitForPageIdleSecs * 1000; const maxWaitForPageIdleMillis = maxWaitForPageIdleSecs * 1000; - const urlExcludePatternObjects: UrlPatternObject[] = []; - const urlPatternObjects: UrlPatternObject[] = []; - - if (exclude?.length) { - for (const excl of exclude) { - if (typeof excl === 'string' || 'glob' in excl) { - urlExcludePatternObjects.push(...constructGlobObjectsFromGlobs([excl])); - } else if (excl instanceof RegExp || 'regexp' in excl) { - urlExcludePatternObjects.push(...constructRegExpObjectsFromRegExps([excl])); - } - } - } - - if (pseudoUrls?.length) { - serviceLocator.getLogger().deprecated('`pseudoUrls` option is deprecated, use `globs` or `regexps` instead'); - urlPatternObjects.push(...constructRegExpObjectsFromPseudoUrls(pseudoUrls)); - } - - if (globs?.length) { - urlPatternObjects.push(...constructGlobObjectsFromGlobs(globs)); - } - - if (regexps?.length) { - urlPatternObjects.push(...constructRegExpObjectsFromRegExps(regexps)); - } + const urlExcludePatternObjects: UrlPatternObject[] = exclude?.length ? constructUrlPatternObjects(exclude) : []; + const urlPatternObjects: UrlPatternObject[] = include?.length ? 
constructUrlPatternObjects(include) : []; const interceptedRequests = await clickElementsAndInterceptNavigationRequests({ page, diff --git a/packages/playwright-crawler/src/internals/utils/playwright-utils.ts b/packages/playwright-crawler/src/internals/utils/playwright-utils.ts index 4ab17072fb96..0008cfc23294 100644 --- a/packages/playwright-crawler/src/internals/utils/playwright-utils.ts +++ b/packages/playwright-crawler/src/internals/utils/playwright-utils.ts @@ -958,8 +958,7 @@ export interface PlaywrightContextUtils { * in `href` elements, but rather navigations are triggered in click handlers. * If you're looking to find URLs in `href` attributes of the page, see {@apilink enqueueLinks}. * - * Optionally, the function allows you to filter the target links' URLs using an array of {@apilink PseudoUrl} objects - * and override settings of the enqueued {@apilink Request} objects. + * Optionally, the function allows you to filter the target links' URLs using an array of glob or regexp patterns. * * **IMPORTANT**: To be able to do this, this function uses various mutations on the page, * such as changing the Z-index of elements being clicked and their visibility. 
Therefore, @@ -980,9 +979,9 @@ export interface PlaywrightContextUtils { * async requestHandler({ enqueueLinksByClickingElements }) { * await enqueueLinksByClickingElements({ * selector: 'a.product-detail', - * globs: [ - * 'https://www.example.com/handbags/**' - * 'https://www.example.com/purses/**' + * include: [ + * 'https://www.example.com/handbags/**', + * 'https://www.example.com/purses/**', * ], * }); * }); diff --git a/packages/puppeteer-crawler/src/internals/enqueue-links/click-elements.ts b/packages/puppeteer-crawler/src/internals/enqueue-links/click-elements.ts index 9d579d83c289..114c6c3599be 100644 --- a/packages/puppeteer-crawler/src/internals/enqueue-links/click-elements.ts +++ b/packages/puppeteer-crawler/src/internals/enqueue-links/click-elements.ts @@ -1,20 +1,16 @@ import { URL } from 'node:url'; import type { - GlobInput, - PseudoUrlInput, - RegExpInput, RequestOptions, RequestProvider, RequestTransform, SkippedRequestCallback, + UrlPatternInput, UrlPatternObject, } from '@crawlee/browser'; import { applyRequestTransform, - constructGlobObjectsFromGlobs, - constructRegExpObjectsFromPseudoUrls, - constructRegExpObjectsFromRegExps, + constructUrlPatternObjects, createRequestOptions, filterRequestOptionsByPatterns, Request, @@ -58,68 +54,31 @@ export interface EnqueueLinksByClickingElementsOptions { clickOptions?: ClickOptions; /** - * An array of glob pattern strings or plain objects - * containing glob pattern strings matching the URLs to be enqueued. + * An array of URL patterns that URLs must match to be enqueued. * - * The plain objects must include at least the `glob` property, which holds the glob pattern string. - * All remaining keys will be used as request options for the corresponding enqueued {@apilink Request} objects. - * - * The matching is always case-insensitive. - * If you need case-sensitive matching, use `regexps` property directly. 
- * - * If `globs` is an empty array or `undefined`, then the function - * enqueues all the intercepted navigation requests produced by the page - * after clicking on elements matching the provided CSS selector. - */ - globs?: GlobInput[]; - - /** - * An array of glob pattern strings, regexp patterns or plain objects - * containing patterns matching URLs that will **never** be enqueued. - * - * The plain objects must include either the `glob` property or the `regexp` property. + * Accepts glob pattern strings, `{ glob: string }` objects, `RegExp` instances, or `{ regexp: RegExp }` objects. * * Glob matching is always case-insensitive. - * If you need case-sensitive matching, provide a regexp. - */ - exclude?: readonly (GlobInput | RegExpInput)[]; - - /** - * An array of regular expressions or plain objects - * containing regular expressions matching the URLs to be enqueued. - * - * The plain objects must include at least the `regexp` property, which holds the regular expression. - * All remaining keys will be used as request options for the corresponding enqueued {@apilink Request} objects. + * If you need case-sensitive matching, use a `RegExp`. * - * If `regexps` is an empty array or `undefined`, then the function + * If `include` is an empty array or `undefined`, then the function * enqueues all the intercepted navigation requests produced by the page * after clicking on elements matching the provided CSS selector. */ - regexps?: RegExpInput[]; + include?: UrlPatternInput[]; /** - * *NOTE:* In future versions of SDK the options will be removed. - * Please use `globs` or `regexps` instead. + * An array of URL patterns. Matching URLs will **not** be enqueued. * - * An array of {@apilink PseudoUrl} strings or plain objects - * containing {@apilink PseudoUrl} strings matching the URLs to be enqueued. + * Accepts glob pattern strings, `{ glob: string }` objects, `RegExp` instances, or `{ regexp: RegExp }` objects. 
* - * The plain objects must include at least the `purl` property, which holds the pseudo-URL pattern string. - * All remaining keys will be used as request options for the corresponding enqueued {@apilink Request} objects. - * - * With a pseudo-URL string, the matching is always case-insensitive. - * If you need case-sensitive matching, use `regexps` property directly. - * - * If `pseudoUrls` is an empty array or `undefined`, then the function - * enqueues all the intercepted navigation requests produced by the page - * after clicking on elements matching the provided CSS selector. - * - * @deprecated prefer using `globs` or `regexps` instead + * Glob matching is always case-insensitive. + * If you need case-sensitive matching, use a `RegExp`. */ - pseudoUrls?: PseudoUrlInput[]; + exclude?: readonly UrlPatternInput[]; /** - * After {@apilink Request} objects are constructed and filtered by URL patterns (`globs`, `regexps`, `pseudoUrls`), + * After request options are filtered by `include`/`exclude` patterns, * this function can be used to remove them or modify their contents such as `userData`, `payload` or, most importantly * `uniqueKey`. This is useful when you need to enqueue multiple `Requests` to the queue that share the same URL, * but differ in methods or payloads, or to dynamically update or create `userData`. @@ -134,8 +93,8 @@ export interface EnqueueLinksByClickingElementsOptions { * } * ``` * - * Note that `transformRequestFunction` has the highest priority and can overwrite request options - * specified in `globs`, `regexps`, or `pseudoUrls` objects, as well as the global `label` option. + * Note that `transformRequestFunction` has the highest priority and can overwrite + * the global `label` option. 
 * * The function receives a {@apilink RequestOptions} object and can return either: * - The modified {@apilink RequestOptions} object @@ -202,8 +161,7 @@ export interface EnqueueLinksByClickingElementsOptions { * in `href` elements, but rather navigations are triggered in click handlers. * If you're looking to find URLs in `href` attributes of the page, see {@apilink enqueueLinks}. * - * Optionally, the function allows you to filter the target links' URLs using an array of {@apilink PseudoUrl} objects - * and override settings of the enqueued {@apilink Request} objects. + * Optionally, the function allows you to filter the target links' URLs using an array of glob or regexp patterns. * * **IMPORTANT**: To be able to do this, this function uses various mutations on the page, * such as changing the Z-index of elements being clicked and their visibility. Therefore, @@ -225,9 +183,9 @@ export interface EnqueueLinksByClickingElementsOptions { * page, * requestQueue, * selector: 'a.product-detail', - * pseudoUrls: [ - * 'https://www.example.com/handbags/[.*]' - * 'https://www.example.com/purses/[.*]' + * include: [ + * 'https://www.example.com/handbags/**', + * 'https://www.example.com/purses/**', * ], * }); * ``` @@ -237,6 +195,8 @@ export interface EnqueueLinksByClickingElementsOptions { export async function enqueueLinksByClickingElements( options: EnqueueLinksByClickingElementsOptions, ): Promise { + const urlPatternValidator = ow.any(ow.string, ow.regExp, ow.object.hasKeys('glob'), ow.object.hasKeys('regexp')); + ow( options, ow.object.exactShape({ @@ -245,12 +205,8 @@ export async function enqueueLinksByClickingElements( selector: ow.string, userData: ow.optional.object, clickOptions: ow.optional.object.hasKeys('clickCount', 'delay'), - pseudoUrls: ow.optional.array.ofType(ow.any(ow.string, ow.object.hasKeys('purl'))), - globs: ow.optional.array.ofType(ow.any(ow.string, ow.object.hasKeys('glob'))), - regexps: ow.optional.array.ofType(ow.any(ow.regExp, 
ow.object.hasKeys('regexp'))), - exclude: ow.optional.array.ofType( - ow.any(ow.string, ow.regExp, ow.object.hasKeys('glob'), ow.object.hasKeys('regexp')), - ), + include: ow.optional.array.ofType(urlPatternValidator), + exclude: ow.optional.array.ofType(urlPatternValidator), transformRequestFunction: ow.optional.function, waitForPageIdleSecs: ow.optional.number, maxWaitForPageIdleSecs: ow.optional.number, @@ -266,46 +222,20 @@ export async function enqueueLinksByClickingElements( requestQueue, selector, clickOptions, - // oxlint-disable-next-line typescript/no-deprecated -- still accepted for backwards compat - pseudoUrls, - globs, - regexps, + include, + exclude, transformRequestFunction, waitForPageIdleSecs = 1, maxWaitForPageIdleSecs = 5, forefront, - exclude, onSkippedRequest, } = options; const waitForPageIdleMillis = waitForPageIdleSecs * 1000; const maxWaitForPageIdleMillis = maxWaitForPageIdleSecs * 1000; - const urlExcludePatternObjects: UrlPatternObject[] = []; - const urlPatternObjects: UrlPatternObject[] = []; - - if (exclude?.length) { - for (const excl of exclude) { - if (typeof excl === 'string' || 'glob' in excl) { - urlExcludePatternObjects.push(...constructGlobObjectsFromGlobs([excl])); - } else if (excl instanceof RegExp || 'regexp' in excl) { - urlExcludePatternObjects.push(...constructRegExpObjectsFromRegExps([excl])); - } - } - } - - if (pseudoUrls?.length) { - getLog().deprecated('`pseudoUrls` option is deprecated, use `globs` or `regexps` instead'); - urlPatternObjects.push(...constructRegExpObjectsFromPseudoUrls(pseudoUrls)); - } - - if (globs?.length) { - urlPatternObjects.push(...constructGlobObjectsFromGlobs(globs)); - } - - if (regexps?.length) { - urlPatternObjects.push(...constructRegExpObjectsFromRegExps(regexps)); - } + const urlExcludePatternObjects: UrlPatternObject[] = exclude?.length ? constructUrlPatternObjects(exclude) : []; + const urlPatternObjects: UrlPatternObject[] = include?.length ? 
constructUrlPatternObjects(include) : []; const interceptedRequests = await clickElementsAndInterceptNavigationRequests({ page, diff --git a/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts b/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts index 2d25ce3bbda8..993223bd815b 100644 --- a/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts +++ b/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts @@ -891,8 +891,7 @@ export interface PuppeteerContextUtils { * in `href` elements, but rather navigations are triggered in click handlers. * If you're looking to find URLs in `href` attributes of the page, see {@apilink enqueueLinks}. * - * Optionally, the function allows you to filter the target links' URLs using an array of {@apilink PseudoUrl} objects - * and override settings of the enqueued {@apilink Request} objects. + * Optionally, the function allows you to filter the target links' URLs using an array of glob or regexp patterns. * * **IMPORTANT**: To be able to do this, this function uses various mutations on the page, * such as changing the Z-index of elements being clicked and their visibility. 
Therefore, @@ -913,9 +912,9 @@ export interface PuppeteerContextUtils { * async requestHandler({ enqueueLinksByClickingElements }) { * await enqueueLinksByClickingElements({ * selector: 'a.product-detail', - * globs: [ - * 'https://www.example.com/handbags/**' - * 'https://www.example.com/purses/**' + * include: [ + * 'https://www.example.com/handbags/**', + * 'https://www.example.com/purses/**', * ], * }); * }); diff --git a/packages/stagehand-crawler/src/internals/stagehand-crawler.ts b/packages/stagehand-crawler/src/internals/stagehand-crawler.ts index 0bab236d164d..dac67c31acd0 100644 --- a/packages/stagehand-crawler/src/internals/stagehand-crawler.ts +++ b/packages/stagehand-crawler/src/internals/stagehand-crawler.ts @@ -472,7 +472,7 @@ export class StagehandCrawler< * }); * * router.addDefaultHandler(async ({ page, enqueueLinks }) => { - * await enqueueLinks({ globs: ['https://example.com/products/*'] }); + * await enqueueLinks({ include: ['https://example.com/products/*'] }); * }); * * const crawler = new StagehandCrawler({ diff --git a/packages/templates/templates/camoufox-ts/src/routes.ts b/packages/templates/templates/camoufox-ts/src/routes.ts index e2bea3cd67e2..0952e90ab43d 100644 --- a/packages/templates/templates/camoufox-ts/src/routes.ts +++ b/packages/templates/templates/camoufox-ts/src/routes.ts @@ -5,7 +5,7 @@ export const router = createPlaywrightRouter(); router.addDefaultHandler(async ({ enqueueLinks, log }) => { log.info(`enqueueing new URLs`); await enqueueLinks({ - globs: ['https://crawlee.dev/**'], + include: ['https://crawlee.dev/**'], label: 'detail', }); }); diff --git a/packages/templates/templates/cheerio-js/src/routes.js b/packages/templates/templates/cheerio-js/src/routes.js index a7cbbf142de5..b5eb002d56e3 100644 --- a/packages/templates/templates/cheerio-js/src/routes.js +++ b/packages/templates/templates/cheerio-js/src/routes.js @@ -5,7 +5,7 @@ export const router = createCheerioRouter(); router.addDefaultHandler(async ({ 
enqueueLinks, log }) => { log.info(`enqueueing new URLs`); await enqueueLinks({ - globs: ['https://crawlee.dev/**'], + include: ['https://crawlee.dev/**'], label: 'detail', }); }); diff --git a/packages/templates/templates/cheerio-ts/src/routes.ts b/packages/templates/templates/cheerio-ts/src/routes.ts index a7cbbf142de5..b5eb002d56e3 100644 --- a/packages/templates/templates/cheerio-ts/src/routes.ts +++ b/packages/templates/templates/cheerio-ts/src/routes.ts @@ -5,7 +5,7 @@ export const router = createCheerioRouter(); router.addDefaultHandler(async ({ enqueueLinks, log }) => { log.info(`enqueueing new URLs`); await enqueueLinks({ - globs: ['https://crawlee.dev/**'], + include: ['https://crawlee.dev/**'], label: 'detail', }); }); diff --git a/packages/templates/templates/playwright-js/src/routes.js b/packages/templates/templates/playwright-js/src/routes.js index e2bea3cd67e2..0952e90ab43d 100644 --- a/packages/templates/templates/playwright-js/src/routes.js +++ b/packages/templates/templates/playwright-js/src/routes.js @@ -5,7 +5,7 @@ export const router = createPlaywrightRouter(); router.addDefaultHandler(async ({ enqueueLinks, log }) => { log.info(`enqueueing new URLs`); await enqueueLinks({ - globs: ['https://crawlee.dev/**'], + include: ['https://crawlee.dev/**'], label: 'detail', }); }); diff --git a/packages/templates/templates/playwright-ts/src/routes.ts b/packages/templates/templates/playwright-ts/src/routes.ts index e2bea3cd67e2..0952e90ab43d 100644 --- a/packages/templates/templates/playwright-ts/src/routes.ts +++ b/packages/templates/templates/playwright-ts/src/routes.ts @@ -5,7 +5,7 @@ export const router = createPlaywrightRouter(); router.addDefaultHandler(async ({ enqueueLinks, log }) => { log.info(`enqueueing new URLs`); await enqueueLinks({ - globs: ['https://crawlee.dev/**'], + include: ['https://crawlee.dev/**'], label: 'detail', }); }); diff --git a/packages/templates/templates/puppeteer-js/src/routes.js 
b/packages/templates/templates/puppeteer-js/src/routes.js index 8896a306dc54..f4b142e7434e 100644 --- a/packages/templates/templates/puppeteer-js/src/routes.js +++ b/packages/templates/templates/puppeteer-js/src/routes.js @@ -5,7 +5,7 @@ export const router = createPuppeteerRouter(); router.addDefaultHandler(async ({ enqueueLinks, log }) => { log.info(`enqueueing new URLs`); await enqueueLinks({ - globs: ['https://crawlee.dev/**'], + include: ['https://crawlee.dev/**'], label: 'detail', }); }); diff --git a/packages/templates/templates/puppeteer-ts/src/routes.ts b/packages/templates/templates/puppeteer-ts/src/routes.ts index 8896a306dc54..f4b142e7434e 100644 --- a/packages/templates/templates/puppeteer-ts/src/routes.ts +++ b/packages/templates/templates/puppeteer-ts/src/routes.ts @@ -5,7 +5,7 @@ export const router = createPuppeteerRouter(); router.addDefaultHandler(async ({ enqueueLinks, log }) => { log.info(`enqueueing new URLs`); await enqueueLinks({ - globs: ['https://crawlee.dev/**'], + include: ['https://crawlee.dev/**'], label: 'detail', }); }); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 88760b90b417..2629cd16e65e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -418,9 +418,6 @@ importers: '@apify/log': specifier: ^2.5.18 version: 2.5.35 - '@apify/pseudo_url': - specifier: ^2.0.59 - version: 2.0.76 '@apify/timeout': specifier: ^0.3.2 version: 0.3.3 diff --git a/test/core/enqueue_links/enqueue_links.test.ts b/test/core/enqueue_links/enqueue_links.test.ts index fc5d85d1482e..32650774d662 100644 --- a/test/core/enqueue_links/enqueue_links.test.ts +++ b/test/core/enqueue_links/enqueue_links.test.ts @@ -112,15 +112,15 @@ describe('enqueueLinks()', () => { expect(enqueued[3]).toBe(undefined); }); - test('works with globs', async () => { + test('works with include (globs)', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); - const globs = ['https://example.com/**/*', { glob: '?(http|https)://cool.com/', method: 'POST' as 
const }]; + const include = ['https://example.com/**/*', '?(http|https)://cool.com/']; await browserCrawlerEnqueueLinks({ options: { selector: '.click', label: 'COOL', - globs, + include, transformRequestFunction: (request) => { if (/example\.com\/a\/b\/third/.exec(request.url)) { request.method = 'OPTIONS'; @@ -144,24 +144,24 @@ describe('enqueueLinks()', () => { expect(enqueued[1].userData).toEqual({ label: 'COOL' }); expect(enqueued[2].url).toBe('http://cool.com/'); - expect(enqueued[2].method).toBe('POST'); + expect(enqueued[2].method).toBe('GET'); expect(enqueued[2].userData).toEqual({ label: 'COOL' }); }); - test('does not throw with empty globs', async () => { + test('does not throw with empty include patterns', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); - const globs = [ + const include = [ 'https://example.com/**/*', '', { glob: ' ' }, // Empty string used to throw an error (https://console.apify.com/actors/aYG0l9s7dbB7j3gbS/issues/Wd0Ahfk9Vd2OPk4Uf) { glob: '' }, - { glob: '?(http|https)://cool.com/', method: 'POST' as const }, + '?(http|https)://cool.com/', ]; await expect( browserCrawlerEnqueueLinks({ - options: { selector: '.click', globs }, + options: { selector: '.click', include }, page, requestQueue, originalRequestUrl: 'https://example.com', @@ -171,17 +171,14 @@ describe('enqueueLinks()', () => { expect(enqueued).toHaveLength(3); }); - test('works with regexps', async () => { + test('works with include (regexps)', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); - const regexps = [ - /^https:\/\/example\.com\/(\w|\/)+/, - { regexp: /^(http|https):\/\/cool\.com\//, method: 'POST' as const, userData: { label: 'COOL' } }, - ]; + const include = [/^https:\/\/example\.com\/(\w|\/)+/, /^(http|https):\/\/cool\.com\//]; await browserCrawlerEnqueueLinks({ options: { selector: '.click', - regexps, + include, transformRequestFunction: (request) => { if 
(/example\.com\/a\/b\/third/.exec(request.url)) { request.method = 'OPTIONS'; @@ -205,8 +202,8 @@ describe('enqueueLinks()', () => { expect(enqueued[1].userData).toEqual({}); expect(enqueued[2].url).toBe('http://cool.com/'); - expect(enqueued[2].method).toBe('POST'); - expect(enqueued[2].userData).toEqual({ label: 'COOL' }); + expect(enqueued[2].method).toBe('GET'); + expect(enqueued[2].userData).toEqual({}); }); test('works with skipNavigation', async () => { @@ -231,7 +228,7 @@ describe('enqueueLinks()', () => { test('works with exclude glob', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); - const globs = ['https://example.com/**/*', { glob: '?(http|https)://cool.com/', method: 'POST' as const }]; + const include = ['https://example.com/**/*', '?(http|https)://cool.com/']; const exclude = ['**/first']; @@ -239,7 +236,7 @@ describe('enqueueLinks()', () => { options: { selector: '.click', label: 'COOL', - globs, + include, exclude, transformRequestFunction: (request) => { if (/example\.com\/a\/b\/third/.exec(request.url)) { @@ -263,13 +260,13 @@ describe('enqueueLinks()', () => { expect(enqueued[0].userData).toEqual({ label: 'COOL' }); expect(enqueued[1].url).toBe('http://cool.com/'); - expect(enqueued[1].method).toBe('POST'); + expect(enqueued[1].method).toBe('GET'); expect(enqueued[1].userData).toEqual({ label: 'COOL' }); }); test('works with exclude regexp', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); - const globs = ['https://example.com/**/*', { glob: '?(http|https)://cool.com/', method: 'POST' as const }]; + const include = ['https://example.com/**/*', '?(http|https)://cool.com/']; const exclude = [/first/]; @@ -277,7 +274,7 @@ describe('enqueueLinks()', () => { options: { selector: '.click', label: 'COOL', - globs, + include, exclude, transformRequestFunction: (request) => { if (/example\.com\/a\/b\/third/.exec(request.url)) { @@ -301,65 +298,11 @@ describe('enqueueLinks()', () => { 
expect(enqueued[0].userData).toEqual({ label: 'COOL' }); expect(enqueued[1].url).toBe('http://cool.com/'); - expect(enqueued[1].method).toBe('POST'); + expect(enqueued[1].method).toBe('GET'); expect(enqueued[1].userData).toEqual({ label: 'COOL' }); }); - test('works with pseudoUrls', async () => { - const { enqueued, requestQueue } = await createRequestQueueMock(); - const pseudoUrls = [ - 'https://example.com/[(\\w|-|/)*]', - { purl: '[http|https]://cool.com/', method: 'POST' as const, userData: { label: 'COOL' } }, - ]; - - await browserCrawlerEnqueueLinks({ - options: { - selector: '.click', - pseudoUrls, - transformRequestFunction: (request) => { - if (/example\.com\/a\/b\/third/.exec(request.url)) { - request.method = 'OPTIONS'; - } - return request; - }, - }, - page, - requestQueue, - originalRequestUrl: 'https://example.com', - }); - - expect(enqueued).toHaveLength(3); - - expect(enqueued[0].url).toBe('https://example.com/a/b/first'); - expect(enqueued[0].method).toBe('GET'); - expect(enqueued[0].userData).toEqual({}); - - expect(enqueued[1].url).toBe('https://example.com/a/b/third'); - expect(enqueued[1].method).toBe('OPTIONS'); - expect(enqueued[1].userData).toEqual({}); - - expect(enqueued[2].url).toBe('http://cool.com/'); - expect(enqueued[2].method).toBe('POST'); - expect(enqueued[2].userData).toEqual({ label: 'COOL' }); - }); - - test('throws with RegExp pseudoUrls', async () => { - const { enqueued, requestQueue } = await createRequestQueueMock(); - - const pseudoUrls = [/https:\/\/example\.com\/(\w|-|\/)*/, /(http|https):\/\/cool\.com\//]; - - await expect( - browserCrawlerEnqueueLinks({ - // @ts-expect-error Type 'RegExp[]' is not assignable to type 'PseudoUrlInput[]' - options: { selector: '.click', pseudoUrls }, - page, - requestQueue, - originalRequestUrl: 'https://example.com', - }), - ).rejects.toThrow(/to be of type `string` but received type `RegExp`/); - }); - - test('works with undefined pseudoUrls[]', async () => { + test('works with no 
include/exclude filters (enqueues all matching strategy)', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); await browserCrawlerEnqueueLinks({ @@ -388,63 +331,6 @@ describe('enqueueLinks()', () => { expect(enqueued[3].userData).toEqual({}); }); - test('throws with null pseudoUrls[]', async () => { - const { enqueued, requestQueue } = await createRequestQueueMock(); - await expect( - browserCrawlerEnqueueLinks({ - // @ts-expect-error invalid input - options: { selector: '.click', pseudoUrls: null }, - page, - requestQueue, - originalRequestUrl: 'https://example.com', - }), - ).rejects.toThrow(/Expected property `pseudoUrls` to be of type `array` but received type `null`/); - }); - - test('works with empty pseudoUrls[]', async () => { - const { enqueued, requestQueue } = await createRequestQueueMock(); - await browserCrawlerEnqueueLinks({ - options: { selector: '.click', pseudoUrls: [], strategy: EnqueueStrategy.All }, - page, - requestQueue, - originalRequestUrl: 'https://example.com', - }); - - expect(enqueued).toHaveLength(4); - - expect(enqueued[0].url).toBe('https://example.com/a/b/first'); - expect(enqueued[0].method).toBe('GET'); - expect(enqueued[0].userData).toEqual({}); - - expect(enqueued[1].url).toBe('https://example.com/a/b/third'); - expect(enqueued[1].method).toBe('GET'); - expect(enqueued[1].userData).toEqual({}); - - expect(enqueued[2].url).toBe('https://another.com/a/fifth'); - expect(enqueued[2].method).toBe('GET'); - expect(enqueued[2].userData).toEqual({}); - - expect(enqueued[3].url).toBe('http://cool.com/'); - expect(enqueued[3].method).toBe('GET'); - expect(enqueued[3].userData).toEqual({}); - }); - - test('throws with sparse pseudoUrls[]', async () => { - const { enqueued, requestQueue } = await createRequestQueueMock(); - const pseudoUrls = ['https://example.com/[(\\w|-|/)*]', null, '[http|https]://cool.com/']; - - await expect( - browserCrawlerEnqueueLinks({ - // @ts-expect-error invalid input - options: { 
selector: '.click', pseudoUrls }, - page, - requestQueue, - originalRequestUrl: 'https://example.com', - }), - ).rejects.toThrow(/\(array `pseudoUrls`\) Any predicate failed with the following errors/); - expect(enqueued).toHaveLength(0); - }); - test('correctly resolves relative URLs with default strategy of same-hostname', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); await browserCrawlerEnqueueLinks({ @@ -536,12 +422,12 @@ describe('enqueueLinks()', () => { test('correctly works with transformRequestFunction', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); - const pseudoUrls = ['https://example.com/[(\\w|-|/)*]', '[http|https]://cool.com/']; + const include = ['https://example.com/**/*', '?(http|https)://cool.com/']; await browserCrawlerEnqueueLinks({ options: { selector: '.click', - pseudoUrls, + include, transformRequestFunction: (request) => { if (request.url.includes('example.com')) { request.method = 'POST'; @@ -583,17 +469,14 @@ describe('enqueueLinks()', () => { $ = null!; }); - test('works with globs', async () => { + test('works with include (globs)', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); - const globs = [ - 'https://example.com/**/*', - { glob: '?(http|https)://cool.com/', method: 'POST' as const, userData: { label: 'COOL' } }, - ]; + const include = ['https://example.com/**/*', '?(http|https)://cool.com/']; await cheerioCrawlerEnqueueLinks({ options: { selector: '.click', - globs, + include, transformRequestFunction: (request) => { if (/example\.com\/a\/b\/third/.exec(request.url)) { request.method = 'OPTIONS'; @@ -617,22 +500,17 @@ describe('enqueueLinks()', () => { expect(enqueued[1].userData).toEqual({}); expect(enqueued[2].url).toBe('http://cool.com/'); - expect(enqueued[2].method).toBe('POST'); - expect(enqueued[2].userData).toEqual({ label: 'COOL' }); + expect(enqueued[2].method).toBe('GET'); + expect(enqueued[2].userData).toEqual({}); 
}); - test('does not throw with empty globs', async () => { + test('does not throw with empty include patterns', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); - const globs = [ - 'https://example.com/**/*', - { glob: '?(http|https)://cool.com/', method: 'POST' as const, userData: { label: 'COOL' } }, - '', - { glob: ' ' }, - ]; + const include = ['https://example.com/**/*', '?(http|https)://cool.com/', '', { glob: ' ' }]; await expect( cheerioCrawlerEnqueueLinks({ - options: { selector: '.click', globs }, + options: { selector: '.click', include }, $, requestQueue, originalRequestUrl: 'https://example.com', @@ -642,17 +520,14 @@ describe('enqueueLinks()', () => { expect(enqueued).toHaveLength(3); }); - test('works with RegExps', async () => { + test('works with include (regexps)', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); - const regexps = [ - /^https:\/\/example\.com\/(\w|\/)+/, - { regexp: /^(http|https):\/\/cool\.com\//, method: 'POST' as const, userData: { label: 'COOL' } }, - ]; + const include = [/^https:\/\/example\.com\/(\w|\/)+/, /^(http|https):\/\/cool\.com\//]; await cheerioCrawlerEnqueueLinks({ options: { selector: '.click', - regexps, + include, transformRequestFunction: (request) => { if (/example\.com\/a\/b\/third/.exec(request.url)) { request.method = 'OPTIONS'; @@ -676,28 +551,18 @@ describe('enqueueLinks()', () => { expect(enqueued[1].userData).toEqual({}); expect(enqueued[2].url).toBe('http://cool.com/'); - expect(enqueued[2].method).toBe('POST'); - expect(enqueued[2].userData).toEqual({ label: 'COOL' }); + expect(enqueued[2].method).toBe('GET'); + expect(enqueued[2].userData).toEqual({}); }); - test('works with string pseudoUrls', async () => { + test('works with include (mixed globs and regexps)', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); - const pseudoUrls = [ - 'https://example.com/[(\\w|-|/)*]', - { purl: 
'[http|https]://cool.com/', method: 'POST' as const, userData: { label: 'COOL' } }, - ]; + const include = ['https://example.com/**/*', /^(http|https):\/\/cool\.com\//]; await cheerioCrawlerEnqueueLinks({ options: { selector: '.click', - userData: { label: 'DEFAULT' }, - pseudoUrls, - transformRequestFunction: (request) => { - if (/example\.com\/a\/b\/third/.exec(request.url)) { - request.method = 'OPTIONS'; - } - return request; - }, + include, }, $, requestQueue, @@ -707,34 +572,11 @@ describe('enqueueLinks()', () => { expect(enqueued).toHaveLength(3); expect(enqueued[0].url).toBe('https://example.com/a/b/first'); - expect(enqueued[0].method).toBe('GET'); - expect(enqueued[0].userData).toEqual({ label: 'DEFAULT' }); - expect(enqueued[1].url).toBe('https://example.com/a/b/third'); - expect(enqueued[1].method).toBe('OPTIONS'); - expect(enqueued[1].userData).toEqual({ label: 'DEFAULT' }); - expect(enqueued[2].url).toBe('http://cool.com/'); - expect(enqueued[2].method).toBe('POST'); - expect(enqueued[2].userData).toEqual({ label: 'COOL' }); }); - test('throws with RegExp pseudoUrls', async () => { - const { enqueued, requestQueue } = await createRequestQueueMock(); - const pseudoUrls = [/https:\/\/example\.com\/(\w|-|\/)*/, /(http|https):\/\/cool\.com\//]; - - await expect( - cheerioCrawlerEnqueueLinks({ - // @ts-expect-error Type 'RegExp[]' is not assignable to type 'PseudoUrlInput[]' - options: { selector: '.click', pseudoUrls }, - $, - requestQueue, - originalRequestUrl: 'https://example.com', - }), - ).rejects.toThrow(/to be of type `string` but received type `RegExp`/); - }); - - test('works with undefined pseudoUrls[]', async () => { + test('works with no include/exclude filters (enqueues all matching strategy)', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); await cheerioCrawlerEnqueueLinks({ options: { selector: '.click', strategy: EnqueueStrategy.All }, @@ -762,63 +604,6 @@ describe('enqueueLinks()', () => { 
expect(enqueued[3].userData).toEqual({}); }); - test('throws with null pseudoUrls[]', async () => { - const { enqueued, requestQueue } = await createRequestQueueMock(); - await expect( - cheerioCrawlerEnqueueLinks({ - // @ts-expect-error invalid input - options: { selector: '.click', pseudoUrls: null }, - $, - requestQueue, - originalRequestUrl: 'https://example.com', - }), - ).rejects.toThrow(/Expected property `pseudoUrls` to be of type `array` but received type `null`/); - }); - - test('works with empty pseudoUrls[]', async () => { - const { enqueued, requestQueue } = await createRequestQueueMock(); - await cheerioCrawlerEnqueueLinks({ - options: { selector: '.click', pseudoUrls: [], strategy: EnqueueStrategy.All }, - $, - requestQueue, - originalRequestUrl: 'https://example.com', - }); - - expect(enqueued).toHaveLength(4); - - expect(enqueued[0].url).toBe('https://example.com/a/b/first'); - expect(enqueued[0].method).toBe('GET'); - expect(enqueued[0].userData).toEqual({}); - - expect(enqueued[1].url).toBe('https://example.com/a/b/third'); - expect(enqueued[1].method).toBe('GET'); - expect(enqueued[1].userData).toEqual({}); - - expect(enqueued[2].url).toBe('https://another.com/a/fifth'); - expect(enqueued[2].method).toBe('GET'); - expect(enqueued[2].userData).toEqual({}); - - expect(enqueued[3].url).toBe('http://cool.com/'); - expect(enqueued[3].method).toBe('GET'); - expect(enqueued[3].userData).toEqual({}); - }); - - test('throws with sparse pseudoUrls[]', async () => { - const { enqueued, requestQueue } = await createRequestQueueMock(); - const pseudoUrls = ['https://example.com/[(\\w|-|/)*]', null, '[http|https]://cool.com/']; - - await expect( - cheerioCrawlerEnqueueLinks({ - // @ts-expect-error invalid input - options: { selector: '.click', pseudoUrls }, - $, - requestQueue, - originalRequestUrl: 'https://example.com', - }), - ).rejects.toThrow(/\(array `pseudoUrls`\) Any predicate failed with the following errors/); - expect(enqueued).toHaveLength(0); - 
}); - test('correctly resolves relative URLs with the strategy of all', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); await cheerioCrawlerEnqueueLinks({ @@ -932,12 +717,12 @@ describe('enqueueLinks()', () => { test('correctly works with transformRequestFunction', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); - const pseudoUrls = ['https://example.com/[(\\w|-|/)*]', '[http|https]://cool.com/']; + const include = ['https://example.com/**/*', '?(http|https)://cool.com/']; await cheerioCrawlerEnqueueLinks({ options: { selector: '.click', - pseudoUrls, + include, transformRequestFunction: (request) => { if (request.url.includes('example.com')) { request.method = 'POST'; @@ -1033,7 +818,7 @@ describe('enqueueLinks()', () => { options: { selector: '.click', label: 'global-label', - globs: ['https://example.com/**/*'], + include: ['https://example.com/**/*'], }, $, requestQueue, @@ -1041,45 +826,19 @@ describe('enqueueLinks()', () => { }); expect(enqueued).toHaveLength(2); - // Global label should be applied when no pattern-specific label is set + // Global label should be applied to all matched requests expect(enqueued[0].userData).toEqual({ label: 'global-label' }); expect(enqueued[1].userData).toEqual({ label: 'global-label' }); }); - test('pattern label overrides global label', async () => { - const { enqueued, requestQueue } = await createRequestQueueMock(); - - await cheerioCrawlerEnqueueLinks({ - options: { - selector: '.click', - label: 'global-label', - regexps: [ - { regexp: /example\.com\/a\/b\/first/, label: 'pattern-label' }, - /example\.com\/a\/b\/third/, // No label, should use global - ], - }, - $, - requestQueue, - originalRequestUrl: 'https://example.com', - }); - - expect(enqueued).toHaveLength(2); - // Pattern-specific label should override global label - expect(enqueued[0].url).toBe('https://example.com/a/b/first'); - expect(enqueued[0].userData).toEqual({ label: 'pattern-label' }); 
- // URL matching pattern without label should use global label - expect(enqueued[1].url).toBe('https://example.com/a/b/third'); - expect(enqueued[1].userData).toEqual({ label: 'global-label' }); - }); - - test('transformRequestFunction has highest priority and overrides pattern label', async () => { + test('transformRequestFunction overrides global label', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); await cheerioCrawlerEnqueueLinks({ options: { selector: '.click', label: 'global-label', - regexps: [{ regexp: /example\.com/, label: 'pattern-label' }], + include: [/example\.com/], transformRequestFunction: (request) => { if (request.url.includes('/a/b/first')) { request.label = 'transformed-label'; @@ -1093,26 +852,22 @@ describe('enqueueLinks()', () => { }); expect(enqueued).toHaveLength(2); - // transformRequestFunction should override pattern label + // transformRequestFunction should override global label expect(enqueued[0].url).toBe('https://example.com/a/b/first'); expect(enqueued[0].userData).toEqual({ label: 'transformed-label' }); - // URL not modified by transformRequestFunction should keep pattern label + // URL not modified by transformRequestFunction should keep global label expect(enqueued[1].url).toBe('https://example.com/a/b/third'); - expect(enqueued[1].userData).toEqual({ label: 'pattern-label' }); + expect(enqueued[1].userData).toEqual({ label: 'global-label' }); }); - test('transformRequestFunction can override all label sources', async () => { + test('transformRequestFunction can override global label for all requests', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); await cheerioCrawlerEnqueueLinks({ options: { selector: '.click', label: 'global-label', - globs: [ - { glob: 'https://example.com/a/b/first', label: 'glob-label' }, - { glob: 'https://example.com/a/b/third', label: 'glob-label' }, - { glob: 'http://cool.com/', label: 'cool-label' }, - ], + include: 
['https://example.com/a/b/first', 'https://example.com/a/b/third', 'http://cool.com/'], transformRequestFunction: (request) => { // Override all labels request.label = 'final-label'; @@ -1131,17 +886,16 @@ describe('enqueueLinks()', () => { } }); - test('transformRequestFunction can modify other request properties after patterns are applied', async () => { + test('transformRequestFunction can modify request properties', async () => { const { enqueued, requestQueue } = await createRequestQueueMock(); await cheerioCrawlerEnqueueLinks({ options: { selector: '.click', - regexps: [{ regexp: /example\.com/, method: 'POST' as const, userData: { source: 'pattern' } }], + include: [/example\.com/], transformRequestFunction: (request) => { - // Change method set by pattern + // Set method and userData via transformRequestFunction request.method = 'PUT'; - // Add to userData without removing pattern's data request.userData = { ...request.userData, transformed: true }; return request; }, @@ -1152,12 +906,12 @@ describe('enqueueLinks()', () => { }); expect(enqueued).toHaveLength(2); - // transformRequestFunction should override method from pattern + // transformRequestFunction should set method expect(enqueued[0].method).toBe('PUT'); expect(enqueued[1].method).toBe('PUT'); - // userData should contain both pattern and transformed data - expect(enqueued[0].userData).toEqual({ source: 'pattern', transformed: true }); - expect(enqueued[1].userData).toEqual({ source: 'pattern', transformed: true }); + // userData should contain transformed data + expect(enqueued[0].userData).toEqual({ transformed: true }); + expect(enqueued[1].userData).toEqual({ transformed: true }); }); test('transformRequestFunction can return a new plain object instead of modifying in place', async () => { @@ -1183,7 +937,7 @@ describe('enqueueLinks()', () => { await cheerioCrawlerEnqueueLinks({ options: { selector: '.click', - globs: ['https://example.com/**/*'], + include: ['https://example.com/**/*'], 
transformRequestFunction: (request) => { // Return a new plain object instead of modifying in place return { @@ -1216,7 +970,7 @@ describe('enqueueLinks()', () => { options: { selector: '.click', label: 'global-label', - globs: ['https://example.com/**/*'], + include: ['https://example.com/**/*'], transformRequestFunction: (request) => { if (request.url.includes('/a/b/first')) { return 'skip'; @@ -1258,7 +1012,7 @@ describe('enqueueLinks()', () => { await cheerioCrawlerEnqueueLinks({ options: { selector: '.click', - globs: ['https://example.com/**/*'], + include: ['https://example.com/**/*'], transformRequestFunction: (request) => { // Skip the first URL, keep the second if (request.url.includes('/a/b/first')) { @@ -1276,7 +1030,7 @@ describe('enqueueLinks()', () => { expect(enqueued).toHaveLength(1); expect(enqueued[0].url).toBe('https://example.com/a/b/third'); - // onSkippedRequest fires for URLs filtered out by globs (another.com, cool.com) + // onSkippedRequest fires for URLs filtered out by include (another.com, cool.com) // AND for the URL explicitly skipped by transformRequestFunction const skippedCalls = onSkippedRequest.mock.calls.map( (call: unknown[]) => call[0] as { url: string; reason: string }, diff --git a/test/core/enqueue_links/shared.test.ts b/test/core/enqueue_links/shared.test.ts index fbb1cafd0b29..462bec8bca85 100644 --- a/test/core/enqueue_links/shared.test.ts +++ b/test/core/enqueue_links/shared.test.ts @@ -2,8 +2,8 @@ import type { RequestOptions } from '@crawlee/core'; import { applyRequestTransform, constructGlobObjectsFromGlobs, - constructRegExpObjectsFromPseudoUrls, constructRegExpObjectsFromRegExps, + constructUrlPatternObjects, createRequestOptions, filterRequestOptionsByPatterns, validateGlobPattern, @@ -12,57 +12,43 @@ import { describe('Enqueue links shared functions', () => { describe('constructGlobObjectsFromGlobs()', () => { test('should work', () => { - const globs = [ - 'https://example.com/**/*', - { glob: 
'?(http|https)://cool.com/', userData: { foo: 'bar' }, label: 'foobar' }, - ]; + const globs = ['https://example.com/**/*', { glob: '?(http|https)://cool.com/' }]; const globObjects = constructGlobObjectsFromGlobs(globs); expect(globObjects).toHaveLength(2); expect(globObjects[0].glob).toEqual('https://example.com/**/*'); - expect(globObjects[0].userData).toBe(undefined); expect(globObjects[1].glob).toEqual('?(http|https)://cool.com/'); - expect(globObjects[1].userData).toStrictEqual({ foo: 'bar' }); - expect(globObjects[1].label).toBe('foobar'); }); }); describe('constructRegExpObjectsFromRegExps()', () => { test('should work', () => { - const regexps = [ - /^https:\/\/example\.com\/(\w|\/)+/, - { regexp: /^(http|https):\/\/cool\.com\//, userData: { foo: 'bar' } }, - ]; + const regexps = [/^https:\/\/example\.com\/(\w|\/)+/, { regexp: /^(http|https):\/\/cool\.com\// }]; const regexpObjects = constructRegExpObjectsFromRegExps(regexps); expect(regexpObjects).toHaveLength(2); expect(regexpObjects[0].regexp.test('https://example.com/')).toBe(false); - expect(regexpObjects[0].userData).toBe(undefined); expect(regexpObjects[1].regexp.test('https://cool.com/')).toBe(true); - expect(regexpObjects[1].userData).toStrictEqual({ foo: 'bar' }); }); }); - describe('constructRegExpObjectsFromPseudoUrls()', () => { - test('should work', () => { - const pseudoUrls = [ - 'http[s?]://example.com/', - { purl: 'http[s?]://example.com[.*]', userData: { foo: 'bar' } }, + describe('constructUrlPatternObjects()', () => { + test('should handle mixed glob and regexp patterns', () => { + const patterns = [ + 'https://example.com/**/*', + { glob: 'https://cool.com/**' }, + /^https:\/\/foo\.com/, + { regexp: /bar\.com/ }, ]; - const urlPatternObjects = constructRegExpObjectsFromPseudoUrls(pseudoUrls); - expect(urlPatternObjects).toHaveLength(2); - urlPatternObjects.forEach((urlPatternObject) => { - expect(urlPatternObject.regexp.test('https://example.com/')).toBe(true); - }); - 
expect(urlPatternObjects[0].regexp.test('https://example.com/foo')).toBe(false); - expect(urlPatternObjects[0].userData).toBe(undefined); - expect(urlPatternObjects[1].regexp.test('https://example.com/foo')).toBe(true); - expect(urlPatternObjects[1].userData).toStrictEqual({ foo: 'bar' }); + const objects = constructUrlPatternObjects(patterns); + expect(objects).toHaveLength(4); + expect(objects[0]).toHaveProperty('glob', 'https://example.com/**/*'); + expect(objects[1]).toHaveProperty('glob', 'https://cool.com/**'); + expect(objects[2]).toHaveProperty('regexp'); + expect(objects[3]).toHaveProperty('regexp'); }); + }); + describe('caching', () => { test('should cache items', () => { - const pseudoUrls0 = constructRegExpObjectsFromPseudoUrls(['http[s?]://example.com/[.*]']); - const pseudoUrls1 = constructRegExpObjectsFromPseudoUrls(['http[s?]://example.com/[.*]']); - expect(pseudoUrls0[0]).toEqual(pseudoUrls1[0]); - const globs0 = constructGlobObjectsFromGlobs(['https://example.com/**/*']); const globs1 = constructGlobObjectsFromGlobs(['https://example.com/**/*']); expect(globs0[0]).toEqual(globs1[0]); @@ -80,8 +66,7 @@ describe('Enqueue links shared functions', () => { { url: 'https://example.com/bar', method: 'POST' as const, label: 'POST-REQUEST' }, 'https://apify.com', ]; - const pseudoUrls = [{ purl: 'http[s?]://example.com/[.*]', userData: { one: 1 } }]; - const urlPatternObjects = constructRegExpObjectsFromPseudoUrls(pseudoUrls); + const urlPatternObjects = constructUrlPatternObjects([/^https?:\/\/example\.com\/.*/]); const transformRequestFunction = (request: RequestOptions) => { request.userData = { ...request.userData, foo: 'bar' }; @@ -95,13 +80,10 @@ describe('Enqueue links shared functions', () => { expect(transformed).toHaveLength(2); transformed.forEach((r) => { expect(r.url).toMatch(/^https?:\/\/example\.com\//); - expect(r.userData).toMatchObject({ foo: 'bar', one: 1 }); + expect(r.userData).toMatchObject({ foo: 'bar' }); }); 
expect(transformed[0].method).toBeUndefined(); // defaults to GET when Request is constructed expect(transformed[1].method).toBe('POST'); - // Pattern-level userData { one: 1 } overwrites the source's userData { label: 'POST-REQUEST' }, - // then the transform adds { foo: 'bar' } - expect(transformed[1].userData).toEqual({ foo: 'bar', one: 1 }); }); }); diff --git a/test/core/sitemap_request_list.test.ts b/test/core/sitemap_request_list.test.ts index f60d08594fb1..c3a783548e3d 100644 --- a/test/core/sitemap_request_list.test.ts +++ b/test/core/sitemap_request_list.test.ts @@ -278,10 +278,10 @@ describe('SitemapRequestList', () => { await expect(list.fetchNextRequest()).resolves.toBe(null); }); - test('globs filtering works', async () => { + test('include with globs filtering works', async () => { const list = await SitemapRequestList.open({ sitemapUrls: [`${url}/sitemap.xml`], - globs: ['http://not-exists.com/catalog**'], + include: ['http://not-exists.com/catalog**'], }); for await (const request of list) { @@ -291,10 +291,10 @@ describe('SitemapRequestList', () => { expect(list.handledCount()).toBe(4); }); - test('regexps filtering works', async () => { + test('include with regexps filtering works', async () => { const list = await SitemapRequestList.open({ sitemapUrls: [`${url}/sitemap.xml`], - regexps: [/desc=vacation_new.+/], + include: [/desc=vacation_new.+/], }); for await (const request of list) { diff --git a/test/e2e/adaptive-playwright-default/actor/main.js b/test/e2e/adaptive-playwright-default/actor/main.js index ecc894b08e80..30871858307f 100644 --- a/test/e2e/adaptive-playwright-default/actor/main.js +++ b/test/e2e/adaptive-playwright-default/actor/main.js @@ -40,7 +40,7 @@ const crawler = new AdaptivePlaywrightCrawler({ await context.pushData({ url, heading, requestHandlerMode }); await context.enqueueLinks({ - globs: ['**/next/examples/*'], + include: ['**/next/examples/*'], }); }, }); diff --git 
a/test/e2e/adaptive-playwright-robots-file/actor/main.js b/test/e2e/adaptive-playwright-robots-file/actor/main.js index 96a87f86834b..369b8126fd8b 100644 --- a/test/e2e/adaptive-playwright-robots-file/actor/main.js +++ b/test/e2e/adaptive-playwright-robots-file/actor/main.js @@ -18,7 +18,7 @@ crawler.router.addDefaultHandler(async ({ log, request, enqueueLinks, pushData } log.info(`Processing ${request.loadedUrl}`); await enqueueLinks({ // '/cart' is disallowed by robots.txt - globs: ['**/cart', '**/collections/*'], + include: ['**/cart', '**/collections/*'], }); await pushData({ url: request.url, loadedUrl: request.loadedUrl }); }); diff --git a/test/e2e/cheerio-default-ts/actor/main.ts b/test/e2e/cheerio-default-ts/actor/main.ts index 93c1b93322ba..c122e09a3547 100644 --- a/test/e2e/cheerio-default-ts/actor/main.ts +++ b/test/e2e/cheerio-default-ts/actor/main.ts @@ -13,7 +13,7 @@ const crawler = new CheerioCrawler(); crawler.router.addDefaultHandler(async ({ $, enqueueLinks, request, log }) => { const { url } = request; await enqueueLinks({ - globs: ['https://crawlee.dev/js/docs/**'], + include: ['https://crawlee.dev/js/docs/**'], }); const pageTitle = $('title').first().text(); diff --git a/test/e2e/cheerio-default/actor/main.js b/test/e2e/cheerio-default/actor/main.js index f881af85aeeb..756edba14ced 100644 --- a/test/e2e/cheerio-default/actor/main.js +++ b/test/e2e/cheerio-default/actor/main.js @@ -20,7 +20,7 @@ await Actor.main(async () => { async requestHandler({ $, enqueueLinks, request, log }) { const { url } = request; await enqueueLinks({ - globs: ['https://crawlee.dev/js/docs/**'], + include: ['https://crawlee.dev/js/docs/**'], }); const pageTitle = $('title').first().text(); diff --git a/test/e2e/cheerio-enqueue-links-base/actor/main.js b/test/e2e/cheerio-enqueue-links-base/actor/main.js index ac2ad0848a44..1e533b5851e5 100644 --- a/test/e2e/cheerio-enqueue-links-base/actor/main.js +++ b/test/e2e/cheerio-enqueue-links-base/actor/main.js @@ -21,7 +21,7 
@@ await Actor.main(async () => { await Dataset.pushData({ url, loadedUrl, pageTitle }); await enqueueLinks({ - globs: [ + include: [ 'https://www.jamesallen.com/about-us/**', 'https://www.jamesallen.com/terms-of-use/**', 'https://www.jamesallen.com/guarantee/**', diff --git a/test/e2e/cheerio-ignore-ssl-errors/actor/main.js b/test/e2e/cheerio-ignore-ssl-errors/actor/main.js index 0686f7044124..f38f23df4f81 100644 --- a/test/e2e/cheerio-ignore-ssl-errors/actor/main.js +++ b/test/e2e/cheerio-ignore-ssl-errors/actor/main.js @@ -21,7 +21,8 @@ await Actor.main(async () => { if (label === 'START') { log.info('Bad ssl page opened!'); await enqueueLinks({ - globs: [{ glob: 'https://*.badssl.com/', userData: { label: 'DETAIL' } }], + include: ['https://*.badssl.com/'], + label: 'DETAIL', selector: '.group a.bad', }); } else if (label === 'DETAIL') { diff --git a/test/e2e/cheerio-page-info/actor/main.js b/test/e2e/cheerio-page-info/actor/main.js index ccb3a1762d55..3d31fb1adb68 100644 --- a/test/e2e/cheerio-page-info/actor/main.js +++ b/test/e2e/cheerio-page-info/actor/main.js @@ -14,7 +14,7 @@ const router = createCheerioRouter(); router.addHandler('START', async ({ enqueueLinks }) => { await enqueueLinks({ label: 'DETAIL', - globs: ['**/examples/accept-user-input'], + include: ['**/examples/accept-user-input'], }); }); diff --git a/test/e2e/cheerio-request-queue-v2/actor/main.js b/test/e2e/cheerio-request-queue-v2/actor/main.js index 60e9c81b55c7..a78598390470 100644 --- a/test/e2e/cheerio-request-queue-v2/actor/main.js +++ b/test/e2e/cheerio-request-queue-v2/actor/main.js @@ -14,7 +14,7 @@ await Actor.main(async () => { async requestHandler({ $, enqueueLinks, request, log }) { const { url } = request; await enqueueLinks({ - globs: ['https://crawlee.dev/js/docs/**'], + include: ['https://crawlee.dev/js/docs/**'], }); const pageTitle = $('title').first().text(); diff --git a/test/e2e/cheerio-robots-file/actor/main.js b/test/e2e/cheerio-robots-file/actor/main.js index 
db191d417c90..bc5da9a6e278 100644 --- a/test/e2e/cheerio-robots-file/actor/main.js +++ b/test/e2e/cheerio-robots-file/actor/main.js @@ -17,7 +17,7 @@ crawler.router.addDefaultHandler(async ({ log, request, enqueueLinks, pushData } log.info(`Processing ${request.loadedUrl}`); await enqueueLinks({ // '/cart' is disallowed by robots.txt - globs: ['**/cart', '**/collections/*'], + include: ['**/cart', '**/collections/*'], }); await pushData({ url: request.url, loadedUrl: request.loadedUrl }); }); diff --git a/test/e2e/cheerio-stop-resume-ts/actor/main.ts b/test/e2e/cheerio-stop-resume-ts/actor/main.ts index 887f0f088ee3..737ea8cb649c 100644 --- a/test/e2e/cheerio-stop-resume-ts/actor/main.ts +++ b/test/e2e/cheerio-stop-resume-ts/actor/main.ts @@ -14,7 +14,7 @@ const crawler = new CheerioCrawler(); crawler.router.addDefaultHandler(async ({ $, enqueueLinks, request, log }) => { const { url } = request; await enqueueLinks({ - globs: ['https://crawlee.dev/js/docs/**'], + include: ['https://crawlee.dev/js/docs/**'], }); const pageTitle = $('title').first().text(); diff --git a/test/e2e/cheerio-throw-on-ssl-errors/actor/main.js b/test/e2e/cheerio-throw-on-ssl-errors/actor/main.js index 9cb577437158..5897199d810c 100644 --- a/test/e2e/cheerio-throw-on-ssl-errors/actor/main.js +++ b/test/e2e/cheerio-throw-on-ssl-errors/actor/main.js @@ -20,7 +20,8 @@ await Actor.main(async () => { if (label === 'START') { log.info('Bad ssl page opened!'); await enqueueLinks({ - globs: [{ glob: 'https://*.badssl.com/', userData: { label: 'DETAIL' } }], + include: ['https://*.badssl.com/'], + label: 'DETAIL', selector: '.group a.bad', }); } else if (label === 'DETAIL') { diff --git a/test/e2e/jsdom-default-ts/actor/main.ts b/test/e2e/jsdom-default-ts/actor/main.ts index 6e16608c8dd0..384a76682353 100644 --- a/test/e2e/jsdom-default-ts/actor/main.ts +++ b/test/e2e/jsdom-default-ts/actor/main.ts @@ -15,7 +15,7 @@ const crawler = new JSDOMCrawler(); crawler.router.addDefaultHandler(async ({ window, 
document, enqueueLinks, request, log }) => { const { url } = request; await enqueueLinks({ - globs: ['https://crawlee.dev/js/docs/**'], + include: ['https://crawlee.dev/js/docs/**'], }); const pageTitle = window.document.title; diff --git a/test/e2e/linkedom-default-ts/actor/main.ts b/test/e2e/linkedom-default-ts/actor/main.ts index de5cd4b13fca..ac5b878217e9 100644 --- a/test/e2e/linkedom-default-ts/actor/main.ts +++ b/test/e2e/linkedom-default-ts/actor/main.ts @@ -15,7 +15,7 @@ const crawler = new LinkeDOMCrawler(); crawler.router.addDefaultHandler(async ({ document, enqueueLinks, request, log }) => { const { url } = request; await enqueueLinks({ - globs: ['https://crawlee.dev/js/docs/**'], + include: ['https://crawlee.dev/js/docs/**'], }); const pageTitle = document.querySelector('title')?.textContent ?? ''; diff --git a/test/e2e/playwright-default/actor/main.js b/test/e2e/playwright-default/actor/main.js index fdda97cadd12..369ffa6a4ac1 100644 --- a/test/e2e/playwright-default/actor/main.js +++ b/test/e2e/playwright-default/actor/main.js @@ -21,7 +21,7 @@ await Actor.main(async () => { const pageTitle = await page.title(); await Dataset.pushData({ url, pageTitle }); await enqueueLinks({ - globs: ['**/3.12/examples/*'], + include: ['**/3.12/examples/*'], }); }, }); diff --git a/test/e2e/playwright-enqueue-links-base/actor/main.js b/test/e2e/playwright-enqueue-links-base/actor/main.js index 7364d2bb9e5d..684255c6c5f3 100644 --- a/test/e2e/playwright-enqueue-links-base/actor/main.js +++ b/test/e2e/playwright-enqueue-links-base/actor/main.js @@ -22,7 +22,7 @@ await Actor.main(async () => { await Dataset.pushData({ url, loadedUrl, pageTitle }); await enqueueLinks({ - globs: [ + include: [ 'https://www.jamesallen.com/about-us/**', 'https://www.jamesallen.com/terms-of-use/**', 'https://www.jamesallen.com/guarantee/**', diff --git a/test/e2e/playwright-multi-run/actor/main.js b/test/e2e/playwright-multi-run/actor/main.js index e47305cecc40..21191bdbba1f 100644 --- 
a/test/e2e/playwright-multi-run/actor/main.js +++ b/test/e2e/playwright-multi-run/actor/main.js @@ -15,7 +15,7 @@ const crawler = new PlaywrightCrawler({ const pageTitle = await page.title(); await Dataset.pushData({ url, pageTitle }); await enqueueLinks({ - globs: ['**/3.12/examples/*'], + include: ['**/3.12/examples/*'], }); }, }); diff --git a/test/e2e/playwright-robots-file/actor/main.js b/test/e2e/playwright-robots-file/actor/main.js index f38eb3e495c0..2cf516d94591 100644 --- a/test/e2e/playwright-robots-file/actor/main.js +++ b/test/e2e/playwright-robots-file/actor/main.js @@ -17,7 +17,7 @@ crawler.router.addDefaultHandler(async ({ log, request, enqueueLinks, pushData } log.info(`Processing ${request.loadedUrl}`); await enqueueLinks({ // '/cart' is disallowed by robots.txt - globs: ['**/cart', '**/collections/*'], + include: ['**/cart', '**/collections/*'], }); await pushData({ url: request.url, loadedUrl: request.loadedUrl }); }); diff --git a/test/e2e/puppeteer-default/actor/main.js b/test/e2e/puppeteer-default/actor/main.js index fd8a6fb8f891..30c121812a35 100644 --- a/test/e2e/puppeteer-default/actor/main.js +++ b/test/e2e/puppeteer-default/actor/main.js @@ -22,7 +22,7 @@ await Actor.main(async () => { const pageTitle = await page.title(); await Dataset.pushData({ url, pageTitle }); await enqueueLinks({ - globs: ['**/3.12/examples/*'], + include: ['**/3.12/examples/*'], }); }, }); diff --git a/test/e2e/puppeteer-ignore-ssl-errors/actor/main.js b/test/e2e/puppeteer-ignore-ssl-errors/actor/main.js index 0a279d7d96b3..49ed52231bb6 100644 --- a/test/e2e/puppeteer-ignore-ssl-errors/actor/main.js +++ b/test/e2e/puppeteer-ignore-ssl-errors/actor/main.js @@ -26,7 +26,8 @@ await Actor.main(async () => { if (label === 'START') { log.info('Bad ssl page opened!'); await enqueueLinks({ - globs: [{ glob: 'https://*.badssl.com/', userData: { label: 'DETAIL' } }], + include: ['https://*.badssl.com/'], + label: 'DETAIL', selector: '.group a.bad', }); } else if (label === 
'DETAIL') { diff --git a/test/e2e/puppeteer-page-info/actor/main.js b/test/e2e/puppeteer-page-info/actor/main.js index 495d361605de..a7837ee550cf 100644 --- a/test/e2e/puppeteer-page-info/actor/main.js +++ b/test/e2e/puppeteer-page-info/actor/main.js @@ -23,7 +23,7 @@ await Actor.main(async () => { if (label === 'START') { await enqueueLinks({ - globs: ['**/examples/accept-user-input'], + include: ['**/examples/accept-user-input'], userData: { label: 'DETAIL' }, }); } diff --git a/test/e2e/puppeteer-store-pagination-jquery/actor/main.js b/test/e2e/puppeteer-store-pagination-jquery/actor/main.js index 635d6f9113af..8a34a084ad58 100644 --- a/test/e2e/puppeteer-store-pagination-jquery/actor/main.js +++ b/test/e2e/puppeteer-store-pagination-jquery/actor/main.js @@ -37,7 +37,7 @@ await Actor.main(async () => { await enqueueLinks({ selector: 'a.product-item__image-wrapper', label: 'DETAIL', - globs: ['https://warehouse-theme-metal.myshopify.com/*/*'], + include: ['https://warehouse-theme-metal.myshopify.com/*/*'], }); log.info(`Enqueued actors for page ${pageNo}`); log.info('Loading the next page'); diff --git a/test/e2e/puppeteer-store-pagination/actor/main.js b/test/e2e/puppeteer-store-pagination/actor/main.js index 85e4a355e70d..d14feb63d657 100644 --- a/test/e2e/puppeteer-store-pagination/actor/main.js +++ b/test/e2e/puppeteer-store-pagination/actor/main.js @@ -31,7 +31,7 @@ crawler.router.addHandler('START', async ({ log, enqueueLinks, page }) => { await enqueueLinks({ selector: 'a.product-item__image-wrapper', label: 'DETAIL', - globs: ['https://warehouse-theme-metal.myshopify.com/*/*'], + include: ['https://warehouse-theme-metal.myshopify.com/*/*'], }); log.info(`Enqueued actors for page ${pageNo}`); log.info('Loading the next page'); diff --git a/test/e2e/puppeteer-throw-on-ssl-errors/actor/main.js b/test/e2e/puppeteer-throw-on-ssl-errors/actor/main.js index 03b223e3cfa0..33ed50ead9f0 100644 --- a/test/e2e/puppeteer-throw-on-ssl-errors/actor/main.js +++ 
b/test/e2e/puppeteer-throw-on-ssl-errors/actor/main.js @@ -26,7 +26,8 @@ await Actor.main(async () => { if (label === 'START') { log.info('Bad ssl page opened!'); await enqueueLinks({ - globs: [{ glob: 'https://*.badssl.com/', userData: { label: 'DETAIL' } }], + include: ['https://*.badssl.com/'], + label: 'DETAIL', selector: '.group a.bad', }); } else if (label === 'DETAIL') {