Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
f4e5397
feat: allow navigation hooks to override crawling context members
barjin May 25, 2026
70defed
feat(playwright): reload page in handleCloudflareChallenge and expose…
barjin May 25, 2026
d208655
refactor(playwright): drop blockedStatusCodes manipulation from handl…
barjin May 25, 2026
a3a56fc
fix: address PR review feedback
barjin May 25, 2026
3966c80
refactor: move navigation hook execution into the context pipeline
barjin May 29, 2026
4ecfd8b
refactor(browser-crawler): rename slot helper to readContextField
barjin May 29, 2026
50acbb6
refactor(http-crawler): drop trivial wrapHook helper
barjin May 29, 2026
d2aec38
refactor!: drop gotoOptions second arg from BrowserHook, expose on co…
barjin May 29, 2026
db146f5
refactor: thread GoToOptions through BrowserCrawlingContext generic
barjin May 29, 2026
a41c82a
docs: drop misleading pageOptions note from preNavigationHooks JSDoc
barjin May 29, 2026
9603ca7
refactor(browser-crawler): use try/catch for response read in finaliz…
barjin May 29, 2026
aeae20b
refactor(adaptive-playwright): inline hook casts at call sites
barjin May 29, 2026
4186ded
refactor(adaptive-playwright): hoist hook array defaults into destruc…
barjin May 29, 2026
564524a
Merge branch 'v4' into feat/nav-hook-context-overrides
barjin Jun 1, 2026
e5c7010
refactor: address nav-hook override review feedback
barjin Jun 1, 2026
d91a14d
chore: fix typo
barjin Jun 4, 2026
bd26152
chore: separate adaptive hooks with and without `LoadedRequest`
barjin Jun 4, 2026
c49297f
fix: tighten type safety around navigation hooks
barjin Jun 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/guides/avoid_blocking.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ On the contrary, sometimes we want to entirely disable the usage of browser fing

## Camoufox

For some protections, using our integrated solutions is not enough, one example could be the Cloudflare challenge. For such pages, you can try [Camoufox](https://camoufox.com/), a custom stealthy build of Firefox for web scraping. It might not get you through the challenge automatically, but with our `handleCloudflareChallenge` helper, it should be able to successfully mimic the required user action and get you through it.
Some protections cannot be bypassed by our integrated solutions alone; the Cloudflare challenge is one example. For such pages, you can try [Camoufox](https://camoufox.com/), a custom stealthy build of Firefox for web scraping. It might not get you through the challenge automatically, but with our `handleCloudflareChallengeHook` post-navigation hook, it should be able to successfully mimic the required user action and get you through it. The hook also reloads the page after the challenge clears and propagates the fresh response back into the crawling context.

<CodeBlock language="ts">
{PlaywrightCamoufox}
Expand Down
8 changes: 2 additions & 6 deletions docs/guides/avoid_blocking_camoufox.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,9 @@
import { PlaywrightCrawler } from 'crawlee';
import { PlaywrightCrawler, handleCloudflareChallengeHook } from 'crawlee';
import { launchOptions } from 'camoufox-js';
import { firefox } from 'playwright';

const crawler = new PlaywrightCrawler({
postNavigationHooks: [
async ({ handleCloudflareChallenge }) => {
await handleCloudflareChallenge();
},
],
postNavigationHooks: [handleCloudflareChallengeHook()],
browserPoolOptions: {
// Disable the default fingerprint spoofing to avoid conflicts with Camoufox.
useFingerprints: false,
Expand Down
11 changes: 0 additions & 11 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2220,17 +2220,6 @@ export class BasicCrawler<
}
}

/**
 * Runs the given hooks one after another, awaiting each hook before starting the next.
 * Every hook is invoked with the same `args`. Nothing happens when `hooks` is not an
 * array or is empty.
 */
protected async _executeHooks<HookLike extends (...args: any[]) => Awaitable<void>>(
    hooks: HookLike[],
    ...args: Parameters<HookLike>
) {
    // Guard clause: bail out early when there is nothing to run.
    if (!Array.isArray(hooks) || hooks.length === 0) {
        return;
    }

    for (const runHook of hooks) {
        await runHook(...args);
    }
}

/**
* Stops the crawler immediately.
*
Expand Down
120 changes: 81 additions & 39 deletions packages/browser-crawler/src/internals/browser-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import type {
Awaitable,
BasicCrawlerOptions,
BasicCrawlingContext,
ContextMiddleware,
CrawlingContext,
Dictionary,
EnqueueLinksOptions,
Expand Down Expand Up @@ -58,6 +59,7 @@ export interface BrowserCrawlingContext<
Response extends BaseResponse = BaseResponse,
ProvidedController = BrowserController,
UserData extends Dictionary = Dictionary,
GoToOptions extends Dictionary = Dictionary,
> extends CrawlingContext<UserData> {
/**
* An instance of the {@apilink BrowserController} that manages the browser instance and provides access to its API.
Expand All @@ -79,16 +81,25 @@ export interface BrowserCrawlingContext<
*/
response: Response;

/**
* Options object passed to the underlying `page.goto()` call. `preNavigationHooks` can mutate this
* object (or return `{ gotoOptions: ... }`) to influence the navigation.
*/
gotoOptions: GoToOptions;

/**
* Helper function for extracting URLs from the current page and adding them to the request queue.
*/
enqueueLinks: (options?: EnqueueLinksOptions) => Promise<BatchAddRequestsResult>;
}

export type BrowserHook<Context = BrowserCrawlingContext, GoToOptions extends Dictionary | undefined = Dictionary> = (
export type BrowserHook<Context = BrowserCrawlingContext> = (
crawlingContext: Context,
gotoOptions: GoToOptions,
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

db146f5 drops the second gotoOptions param for navigation hooks and passes this through the CrawlingContext instead.

I suppose we had it like this because of the lack of a unified ContextPipeline in v3, but I'd like to validate this hunch with someone else.

Copy link
Copy Markdown
Contributor

@janbuchar janbuchar Jun 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't think of a reason to keep this a separate parameter, so your hunch is probably right.

) => Awaitable<void>;
) => Awaitable<void | Partial<Context>>;

const COOKIES_BEFORE_HOOKS = Symbol('cookiesBeforeHooks');

/**
 * Reads the value stored under a symbol key on a context object.
 *
 * The result is cast to the caller-supplied type `T`; no runtime validation is performed,
 * so a missing key yields `undefined` regardless of `T`.
 */
function readContextField<T>(ctx: object, key: symbol): T {
    const symbolKeyed = ctx as Record<symbol, unknown>;
    const value = symbolKeyed[key];
    return value as T;
}

export interface BrowserCrawlerOptions<
Page extends CommonPage = CommonPage,
Expand Down Expand Up @@ -174,31 +185,33 @@ export interface BrowserCrawlerOptions<

/**
* Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
* or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `gotoOptions`,
* which are passed to the `page.goto()` function the crawler calls to navigate.
* or browser properties before navigation. The function receives the `crawlingContext`; the options object
* forwarded to `page.goto()` is available as `crawlingContext.gotoOptions` and can be mutated in place.
*
* **Example:**
*
* ```js
* preNavigationHooks: [
* async (crawlingContext, gotoOptions) => {
* const { page } = crawlingContext;
* async ({ page, gotoOptions }) => {
* await page.evaluate((attr) => { window.foo = attr; }, 'bar');
* gotoOptions.timeout = 60_000;
* gotoOptions.waitUntil = 'domcontentloaded';
* },
* ]
* ```
*
* Modifying `pageOptions` is supported only in Playwright incognito.
* See {@apilink PrePageCreateHook}
* A hook may optionally return a partial object whose properties are merged into the crawling context,
* allowing the hook to override context members for subsequent hooks and pipeline stages.
*/
preNavigationHooks?: BrowserHook<Context>[];

/**
* Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
* The function accepts `crawlingContext` as the only parameter.
*
* A hook may optionally return a partial object whose properties are merged into the crawling context.
* This is useful for overriding context members (e.g. `response`) after solving a challenge.
*
* **Example:**
*
* ```js
Expand All @@ -209,6 +222,11 @@ export interface BrowserCrawlerOptions<
* await solveCaptcha(page);
* }
* },
* async (crawlingContext) => {
* if (await needsRevalidation(crawlingContext)) {
* return { response: await crawlingContext.page.reload() };
* }
* },
* ]
* ```
*/
Expand Down Expand Up @@ -358,13 +376,32 @@ export abstract class BrowserCrawler<
...basicCrawlerOptions
} = options;

// Wraps a pipeline action so that it is bypassed for requests with `skipNavigation` set.
// In that case the middleware contributes an empty object; a nullish result from the
// wrapped action is likewise normalized to an empty object.
const skipGuard = <Ctx extends Context>(
    action: (ctx: Ctx) => Awaitable<void | Partial<Ctx>>,
): ContextMiddleware<Ctx, Partial<Ctx>> => ({
    action: async (ctx) => {
        if (ctx.request.skipNavigation) {
            return {};
        }

        const overrides = await action(ctx);
        return overrides ?? {};
    },
});

super({
...basicCrawlerOptions,
contextPipelineBuilder: () =>
contextPipelineBuilder()
.compose({ action: this.performNavigation.bind(this) })
contextPipelineBuilder: () => {
let pipeline = contextPipelineBuilder().compose({ action: this.prepareNavigation.bind(this) });

for (const hook of this.preNavigationHooks) {
pipeline = pipeline.compose(skipGuard(hook));
}

pipeline = pipeline.compose(skipGuard(this.navigate.bind(this)));

for (const hook of this.postNavigationHooks) {
pipeline = pipeline.compose(skipGuard(hook));
}

return pipeline
.compose(skipGuard(this.finalizeNavigation.bind(this)))
.compose({ action: this.handleBlockedRequestByContent.bind(this) })
.compose({ action: this.restoreRequestState.bind(this) }),
.compose({ action: this.restoreRequestState.bind(this) });
},
extendContext: extendContext as (context: Context) => Awaitable<ContextExtension>,
});

Expand Down Expand Up @@ -490,6 +527,9 @@ export abstract class BrowserCrawler<
"The `response` property is not available. This might mean that you're trying to access it before navigation or that navigation resulted in `null` (this should only happen with `about:` URLs)",
);
},
get gotoOptions(): Dictionary {
throw new Error('The `gotoOptions` property is not available until `prepareNavigation` runs.');
},
browserController: browserControllerInstance,
enqueueLinks: async (enqueueOptions: EnqueueLinksOptions = {}) => {
return (await browserCrawlerEnqueueLinks({
Expand All @@ -506,10 +546,7 @@ export abstract class BrowserCrawler<
};
}

private async performNavigation(crawlingContext: Context): Promise<{
request: LoadedRequest<Request>;
response?: Response;
}> {
private async prepareNavigation(crawlingContext: Context): Promise<Partial<Context>> {
if (crawlingContext.request.skipNavigation) {
return {
request: new Proxy(crawlingContext.request, {
Expand All @@ -527,42 +564,56 @@ export abstract class BrowserCrawler<
'The `response` property is not available - `skipNavigation` was used',
);
},
};
} as Partial<Context>;
}

const gotoOptions = { timeout: this.navigationTimeoutMillis } as unknown as GoToOptions;
crawlingContext.request.state = RequestState.BEFORE_NAV;

const preNavigationHooksCookies = this._getCookieHeaderFromRequest(crawlingContext.request);
return {
gotoOptions: { timeout: this.navigationTimeoutMillis } as unknown as GoToOptions,
[COOKIES_BEFORE_HOOKS]: this._getCookieHeaderFromRequest(crawlingContext.request),
} as unknown as Partial<Context>;
}

crawlingContext.request.state = RequestState.BEFORE_NAV;
await this._executeHooks(this.preNavigationHooks, crawlingContext, gotoOptions);
private async navigate(crawlingContext: Context): Promise<Partial<Context>> {
tryCancel();

const postNavigationHooksCookies = this._getCookieHeaderFromRequest(crawlingContext.request);
const gotoOptions = crawlingContext.gotoOptions as GoToOptions;
const cookiesBeforeHooks = readContextField<string>(crawlingContext, COOKIES_BEFORE_HOOKS);
const cookiesAfterHooks = this._getCookieHeaderFromRequest(crawlingContext.request);

await this._applyCookies(crawlingContext, preNavigationHooksCookies, postNavigationHooksCookies);
await this._applyCookies(crawlingContext, cookiesBeforeHooks, cookiesAfterHooks);

let response: Response | undefined;

try {
response = (await this._navigationHandler(crawlingContext, gotoOptions)) ?? undefined;
} catch (error) {
await this._handleNavigationTimeout(crawlingContext, error as Error);

crawlingContext.request.state = RequestState.ERROR;

this._throwIfProxyError(error as Error);
throw error;
}
tryCancel();

crawlingContext.request.state = RequestState.AFTER_NAV;
await this._executeHooks(this.postNavigationHooks, crawlingContext, gotoOptions);

return { response } as Partial<Context>;
}

private async finalizeNavigation(crawlingContext: Context): Promise<Partial<Context>> {
tryCancel();

let response: Response | undefined;
try {
response = crawlingContext.response;
} catch {
// `preparePage` installs a throwing getter for `response`; reaching this branch means
// navigation produced no response and no hook overrode it. Treat as undefined.
}

await this.processResponse(response, crawlingContext);
tryCancel();

// save cookies
// TODO: Should we save the cookies also after/only the handle page?
if (this.saveResponseCookies && crawlingContext.session) {
const cookies = await crawlingContext.browserController.getCookies(crawlingContext.page);
Expand All @@ -579,16 +630,7 @@ export abstract class BrowserCrawler<
}
}

if (response !== undefined) {
return {
request: crawlingContext.request as LoadedRequest<Request>,
response,
};
}

return {
request: crawlingContext.request as LoadedRequest<Request>,
};
return { request: crawlingContext.request as LoadedRequest<Request> } as Partial<Context>;
}

private async handleBlockedRequestByContent(
Expand Down
Loading
Loading