Skip to content

Commit f2016c3

Browse files
authored
refactor: use BasicCrawler session in browser launch context (#3489)
Before, `BrowserCrawler` instances could generate a new `Session`, overwriting the one created in `BasicCrawler`'s pipeline. The changes from this PR enforce the use of the `BasicCrawler` session, aligning the `BrowserCrawler` behaviour with `HttpCrawler` and `AdaptivePlaywrightCrawler`. Prerequisite: #3478
1 parent 697dcf2 commit f2016c3

3 files changed

Lines changed: 70 additions & 75 deletions

File tree

packages/browser-crawler/src/internals/browser-crawler.ts

Lines changed: 29 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ import type {
3737
LaunchContext,
3838
} from '@crawlee/browser-pool';
3939
import { BROWSER_CONTROLLER_EVENTS, BrowserPool } from '@crawlee/browser-pool';
40-
import type { BatchAddRequestsResult, Cookie as CookieObject, ProxyInfo } from '@crawlee/types';
40+
import type { BatchAddRequestsResult, Cookie as CookieObject } from '@crawlee/types';
4141
import type { RobotsTxtFile } from '@crawlee/utils';
4242
import { CLOUDFLARE_RETRY_CSS_SELECTORS, RETRY_CSS_SELECTORS, sleep } from '@crawlee/utils';
4343
import ow from 'ow';
@@ -391,12 +391,8 @@ export abstract class BrowserCrawler<
391391
browserPoolOptions.useFingerprints = false;
392392
}
393393

394-
const { preLaunchHooks = [], postLaunchHooks = [], ...rest } = browserPoolOptions;
395-
396394
this.browserPool = new BrowserPool<InternalBrowserPoolOptions>({
397-
...(rest as any),
398-
preLaunchHooks: [this._extendLaunchContext.bind(this), ...preLaunchHooks],
399-
postLaunchHooks: [this._maybeAddSessionRetiredListener.bind(this), ...postLaunchHooks],
395+
...(browserPoolOptions as any),
400396
});
401397
}
402398

@@ -469,25 +465,12 @@ export abstract class BrowserCrawler<
469465
id: crawlingContext.id,
470466
};
471467

472-
const useIncognitoPages = this.launchContext?.useIncognitoPages;
473-
474468
if (crawlingContext.session?.proxyInfo) {
475469
const proxyInfo = crawlingContext.session.proxyInfo;
476-
crawlingContext.proxyInfo = proxyInfo;
477470

478471
newPageOptions.proxyUrl = proxyInfo?.url;
479472
newPageOptions.proxyTier = proxyInfo?.proxyTier;
480-
481-
if (proxyInfo?.ignoreTlsErrors) {
482-
/**
483-
* @see https://playwright.dev/docs/api/class-browser/#browser-new-context
484-
* @see https://github.com/puppeteer/puppeteer/blob/main/docs/api.md
485-
*/
486-
newPageOptions.pageOptions = {
487-
ignoreHTTPSErrors: true,
488-
acceptInsecureCerts: true,
489-
};
490-
}
473+
newPageOptions.ignoreTlsErrors = proxyInfo?.ignoreTlsErrors;
491474
}
492475

493476
const page = (await this.browserPool.newPage(newPageOptions)) as Page;
@@ -497,11 +480,9 @@ export abstract class BrowserCrawler<
497480
page as any,
498481
) as ProvidedController;
499482

500-
const contextEnqueueLinks = crawlingContext.enqueueLinks;
483+
this.addSessionRetiredListener(crawlingContext.session, browserControllerInstance);
501484

502-
const session = useIncognitoPages
503-
? crawlingContext.session
504-
: (browserControllerInstance.launchContext.session as Session);
485+
const contextEnqueueLinks = crawlingContext.enqueueLinks;
505486

506487
return {
507488
page,
@@ -511,8 +492,6 @@ export abstract class BrowserCrawler<
511492
);
512493
},
513494
browserController: browserControllerInstance,
514-
session,
515-
proxyInfo: session?.proxyInfo,
516495
enqueueLinks: async (enqueueOptions: EnqueueLinksOptions = {}) => {
517496
return (await browserCrawlerEnqueueLinks({
518497
options: { ...enqueueOptions, limit: this.calculateEnqueuedRequestLimit(enqueueOptions?.limit) },
@@ -685,57 +664,37 @@ export abstract class BrowserCrawler<
685664
request.loadedUrl = await page.url();
686665
}
687666

688-
protected async _extendLaunchContext(_pageId: string, launchContext: LaunchContext): Promise<void> {
689-
const launchContextExtends: { session?: Session; proxyInfo?: ProxyInfo } = {};
690-
691-
// Hacky access from `AdaptivePlaywrightCrawler` calls this without calling `.init()`.
692-
// This is the only case where this.sessionPool is accessed without being initialized.
693-
if (this.sessionPool) {
694-
launchContextExtends.session = await this.sessionPool.newSession({
695-
proxyInfo: await this.proxyConfiguration?.newProxyInfo({
696-
// cannot pass a request here, since session is created on browser launch
697-
}),
698-
});
699-
}
667+
private browserSessionIds = new WeakMap<Context['browserController'], Set<string>>();
700668

701-
if (!launchContext.proxyUrl && launchContextExtends.session?.proxyInfo) {
702-
const proxyInfo = launchContextExtends.session.proxyInfo;
669+
private addSessionRetiredListener(session: Session, browserController: Context['browserController']): void {
670+
if (!this.sessionPool) {
671+
return;
672+
}
703673

704-
launchContext.proxyUrl = proxyInfo?.url;
705-
launchContextExtends.proxyInfo = proxyInfo;
674+
let sessionIds = this.browserSessionIds.get(browserController);
706675

707-
// Disable SSL verification for MITM proxies
708-
if (proxyInfo?.ignoreTlsErrors) {
709-
/**
710-
* @see https://playwright.dev/docs/api/class-browser/#browser-new-context
711-
* @see https://github.com/puppeteer/puppeteer/blob/main/docs/api.md
712-
*/
713-
(launchContext.launchOptions as Dictionary).ignoreHTTPSErrors = true;
714-
(launchContext.launchOptions as Dictionary).acceptInsecureCerts = true;
715-
}
676+
if (sessionIds) {
677+
sessionIds.add(session.id);
678+
return;
716679
}
717680

718-
launchContext.extend(launchContextExtends);
719-
}
681+
sessionIds = new Set([session.id]);
682+
this.browserSessionIds.set(browserController, sessionIds);
720683

721-
protected _maybeAddSessionRetiredListener(_pageId: string, browserController: Context['browserController']): void {
722-
if (this.sessionPool) {
723-
const listener = (session: Session) => {
724-
const { launchContext } = browserController;
725-
if (session.id === (launchContext.session as Session).id) {
726-
this.browserPool.retireBrowserController(
727-
browserController as Parameters<
728-
BrowserPool<InternalBrowserPoolOptions>['retireBrowserController']
729-
>[0],
730-
);
731-
}
732-
};
684+
const listener = (retired: Session) => {
685+
if (this.browserSessionIds.get(browserController)?.has(retired.id)) {
686+
this.browserPool.retireBrowserController(
687+
browserController as Parameters<
688+
BrowserPool<InternalBrowserPoolOptions>['retireBrowserController']
689+
>[0],
690+
);
691+
}
692+
};
733693

734-
this.sessionPool.on(EVENT_SESSION_RETIRED, listener);
735-
browserController.on(BROWSER_CONTROLLER_EVENTS.BROWSER_CLOSED, () => {
736-
return this.sessionPool!.removeListener(EVENT_SESSION_RETIRED, listener);
737-
});
738-
}
694+
this.sessionPool.on(EVENT_SESSION_RETIRED, listener);
695+
browserController.on(BROWSER_CONTROLLER_EVENTS.BROWSER_CLOSED, () => {
696+
return this.sessionPool!.removeListener(EVENT_SESSION_RETIRED, listener);
697+
});
739698
}
740699

741700
/**

packages/browser-pool/src/browser-pool.ts

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -436,7 +436,14 @@ export class BrowserPool<
436436
* or their page limits have been exceeded.
437437
*/
438438
async newPage(options: BrowserPoolNewPageOptions<PageOptions, BrowserPlugins[number]> = {}): Promise<PageReturn> {
439-
const { id = nanoid(), pageOptions, browserPlugin = this._pickBrowserPlugin(), proxyUrl, proxyTier } = options;
439+
const {
440+
id = nanoid(),
441+
pageOptions,
442+
browserPlugin = this._pickBrowserPlugin(),
443+
proxyUrl,
444+
proxyTier,
445+
ignoreTlsErrors,
446+
} = options;
440447

441448
if (this.pages.has(id)) {
442449
throw new Error(`Page with ID: ${id} already exists.`);
@@ -451,10 +458,15 @@ export class BrowserPool<
451458
let browserController = this._pickBrowserWithFreeCapacity(browserPlugin, { proxyTier, proxyUrl });
452459

453460
if (!browserController)
454-
browserController = await this._launchBrowser(id, { browserPlugin, proxyTier, proxyUrl });
461+
browserController = await this._launchBrowser(id, {
462+
browserPlugin,
463+
proxyTier,
464+
proxyUrl,
465+
ignoreTlsErrors,
466+
});
455467
tryCancel();
456468

457-
return await this._createPageForBrowser(id, browserController, pageOptions, proxyUrl);
469+
return await this._createPageForBrowser(id, browserController, pageOptions, proxyUrl, ignoreTlsErrors);
458470
});
459471
}
460472

@@ -552,6 +564,7 @@ export class BrowserPool<
552564
browserController: BrowserControllerReturn,
553565
pageOptions: PageOptions = {} as PageOptions,
554566
proxyUrl?: string,
567+
ignoreTlsErrors?: boolean,
555568
) {
556569
// This is needed for concurrent newPage calls to wait for the browser launch.
557570
// It's not ideal though, we need to come up with a better API.
@@ -563,6 +576,13 @@ export class BrowserPool<
563576

564577
if (finalPageOptions) {
565578
Object.assign(finalPageOptions, browserController.normalizeProxyOptions(proxyUrl, pageOptions));
579+
580+
if (ignoreTlsErrors) {
581+
Object.assign(finalPageOptions, {
582+
ignoreHTTPSErrors: true,
583+
acceptInsecureCerts: true,
584+
});
585+
}
566586
}
567587

568588
await this._executeHooks(this.prePageCreateHooks, pageId, browserController, finalPageOptions);
@@ -684,7 +704,7 @@ export class BrowserPool<
684704
}
685705

686706
private async _launchBrowser(pageId: string, options: InternalLaunchBrowserOptions<BrowserPlugins[number]>) {
687-
const { browserPlugin, launchOptions, proxyTier, proxyUrl } = options;
707+
const { browserPlugin, launchOptions, proxyTier, proxyUrl, ignoreTlsErrors } = options;
688708

689709
const browserController = browserPlugin.createController() as BrowserControllerReturn;
690710
this.startingBrowserControllers.add(browserController);
@@ -696,6 +716,16 @@ export class BrowserPool<
696716
proxyUrl,
697717
});
698718

719+
// Disable SSL verification for MITM proxies
720+
if (ignoreTlsErrors) {
721+
/**
722+
* @see https://playwright.dev/docs/api/class-browser/#browser-new-context
723+
* @see https://github.com/puppeteer/puppeteer/blob/main/docs/api.md
724+
*/
725+
(launchContext.launchOptions as Record<string, unknown>).ignoreHTTPSErrors = true;
726+
(launchContext.launchOptions as Record<string, unknown>).acceptInsecureCerts = true;
727+
}
728+
699729
try {
700730
// If the hooks or the launch fails, we need to delete the controller,
701731
// because otherwise it would be stuck in limbo without a browser.
@@ -882,6 +912,11 @@ export interface BrowserPoolNewPageOptions<PageOptions, BP extends BrowserPlugin
882912
* Proxy tier.
883913
*/
884914
proxyTier?: number;
915+
/**
916+
* Disable TLS certificate verification for MITM proxies.
917+
* Applied both when launching a new browser and when creating a page in an existing one.
918+
*/
919+
ignoreTlsErrors?: boolean;
885920
}
886921

887922
export interface BrowserPoolNewPageInNewBrowserOptions<PageOptions, BP extends BrowserPlugin> {
@@ -919,4 +954,5 @@ interface InternalLaunchBrowserOptions<BP extends BrowserPlugin> {
919954
launchOptions?: BP['launchOptions'];
920955
proxyTier?: number;
921956
proxyUrl?: string;
957+
ignoreTlsErrors?: boolean;
922958
}

test/core/crawlers/browser_crawler.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -821,7 +821,7 @@ describe('BrowserCrawler', () => {
821821
{ url: `${serverAddress}/?q=6` },
822822
]);
823823

824-
expect(sessionUsageHistory).toEqual([0, 1, 2, 3, 4, 5]);
824+
expect(sessionUsageHistory).toEqual([0, 0, 0, 0, 0, 0]);
825825
} finally {
826826
await localStorageEmulator.destroy();
827827
}

0 commit comments

Comments
 (0)