Skip to content

Commit 651f677

Browse files
authored
refactor: use ContextPipeline to initialize BasicCrawler's context idiomatically (#3388)
Extracts all `CrawlingContext` initialization to `ContextPipeline` steps to tighten the control over the `CrawlingContext` contents. Blocks #3380
1 parent c467d18 commit 651f677

12 files changed

Lines changed: 453 additions & 201 deletions

File tree

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 233 additions & 134 deletions
Large diffs are not rendered by default.

packages/browser-crawler/src/internals/browser-crawler.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,7 @@ export abstract class BrowserCrawler<
411411
});
412412
}
413413

414-
protected buildContextPipeline(): ContextPipeline<
414+
protected override buildContextPipeline(): ContextPipeline<
415415
CrawlingContext,
416416
BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary>
417417
> {

packages/cheerio-crawler/src/internals/cheerio-crawler.ts

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -176,17 +176,23 @@ export class CheerioCrawler<
176176
* All `CheerioCrawler` parameters are passed via an options object.
177177
*/
178178
constructor(options?: CheerioCrawlerOptions<ContextExtension, ExtendedContext>) {
179+
const { contextPipelineBuilder, ...rest } = options ?? {};
180+
179181
super({
180-
...options,
181-
contextPipelineBuilder: () =>
182-
this.buildContextPipeline()
183-
.compose({
184-
action: async (context) => await this.parseContent(context),
185-
})
186-
.compose({ action: async (context) => await this.addHelpers(context) }),
182+
...rest,
183+
contextPipelineBuilder: contextPipelineBuilder ?? (() => this.buildContextPipeline()),
187184
});
188185
}
189186

187+
protected override buildContextPipeline() {
188+
return super
189+
.buildContextPipeline()
190+
.compose({
191+
action: async (context) => await this.parseContent(context),
192+
})
193+
.compose({ action: async (context) => await this.addHelpers(context) });
194+
}
195+
190196
private async parseContent(crawlingContext: InternalHttpCrawlingContext) {
191197
const isXml = crawlingContext.contentType.type.includes('xml');
192198
const body = Buffer.isBuffer(crawlingContext.body)

packages/core/src/crawlers/context_pipeline.ts

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import type { Awaitable } from '@crawlee/types';
22

3+
import log from '@apify/log';
4+
35
import {
46
ContextPipelineCleanupError,
57
ContextPipelineInitializationError,
@@ -56,6 +58,19 @@ export abstract class ContextPipeline<TContextBase, TCrawlingContext extends TCo
5658
middleware: ContextMiddleware<TCrawlingContext, TCrawlingContextExtension>,
5759
): ContextPipeline<TContextBase, TCrawlingContext & TCrawlingContextExtension>;
5860

61+
/**
62+
* Chains another pipeline onto this one. The other pipeline's base context must match
63+
* this pipeline's output context. Returns a new pipeline that runs this pipeline's
64+
* middlewares first, then the other pipeline's middlewares.
65+
*
66+
* @template TFinalContext - The final context type after the chained pipeline's transformations
67+
* @param other - The pipeline to append after this one
68+
* @returns A new ContextPipeline combining both pipelines' middlewares
69+
*/
70+
abstract chain<TFinalContext extends TCrawlingContext>(
71+
other: ContextPipeline<TCrawlingContext, TFinalContext>,
72+
): ContextPipeline<TContextBase, TFinalContext>;
73+
5974
/**
6075
* Executes the middleware pipeline and passes the final context to a consumer function.
6176
*
@@ -105,6 +120,21 @@ class ContextPipelineImpl<TContextBase, TCrawlingContext extends TContextBase> e
105120
);
106121
}
107122

123+
chain<TFinalContext extends TCrawlingContext>(
124+
other: ContextPipeline<TCrawlingContext, TFinalContext>,
125+
): ContextPipeline<TContextBase, TFinalContext> {
126+
const otherMiddlewares = Array.from(
127+
(other as any).middlewareChain() as Iterable<ContextMiddleware<any, any>>,
128+
).reverse();
129+
130+
let result: ContextPipeline<TContextBase, any> = this as any;
131+
for (const middleware of otherMiddlewares) {
132+
result = result.compose(middleware as any);
133+
}
134+
135+
return result as ContextPipeline<TContextBase, TFinalContext>;
136+
}
137+
108138
private *middlewareChain() {
109139
let step: ContextPipelineImpl<TContextBase, TContextBase> | undefined = this as any;
110140

@@ -129,7 +159,25 @@ class ContextPipelineImpl<TContextBase, TCrawlingContext extends TContextBase> e
129159
for (const { action, cleanup } of middlewares) {
130160
try {
131161
const contextExtension = await action(crawlingContext);
132-
Object.defineProperties(crawlingContext, Object.getOwnPropertyDescriptors(contextExtension));
162+
163+
const extensionNames = [
164+
...Object.getOwnPropertyNames(contextExtension),
165+
...Object.getOwnPropertySymbols(contextExtension),
166+
];
167+
168+
for (const key of extensionNames) {
169+
try {
170+
if (Object.getOwnPropertyDescriptor(crawlingContext, key)?.configurable !== false) {
171+
Object.defineProperty(
172+
crawlingContext,
173+
key,
174+
Object.getOwnPropertyDescriptor(contextExtension, key)!,
175+
);
176+
}
177+
} catch (error: any) {
178+
log.debug(`Context pipeline failed to define property ${key.toString()}:`, error);
179+
}
180+
}
133181

134182
if (cleanup) {
135183
cleanupStack.push(cleanup);

packages/http-crawler/src/internals/file-download.ts

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import { Transform } from 'node:stream';
22

33
import type { BasicCrawlerOptions } from '@crawlee/basic';
4-
import { BasicCrawler, ContextPipeline } from '@crawlee/basic';
4+
import { BasicCrawler } from '@crawlee/basic';
55
import type { CrawlingContext, LoadedRequest, Request } from '@crawlee/core';
66
import { ResponseWithUrl } from '@crawlee/http-client';
77
import type { Dictionary } from '@crawlee/types';
@@ -162,19 +162,22 @@ export class FileDownload extends BasicCrawler<FileDownloadCrawlingContext> {
162162
constructor(options: BasicCrawlerOptions<FileDownloadCrawlingContext> = {}) {
163163
super({
164164
...options,
165-
contextPipelineBuilder: () =>
166-
ContextPipeline.create<CrawlingContext>().compose({
167-
action: async (context) => this.initiateDownload(context),
168-
cleanup: async (context) => {
169-
if (!context.response.bodyUsed) {
170-
// Nobody consumed the body — cancel it so the
171-
// underlying connection can be released.
172-
await context.response.body?.cancel();
173-
}
165+
contextPipelineBuilder: () => this.buildContextPipeline(),
166+
});
167+
}
168+
169+
protected override buildContextPipeline() {
170+
return super.buildContextPipeline().compose({
171+
action: async (context) => this.initiateDownload(context),
172+
cleanup: async (context) => {
173+
if (!context.response.bodyUsed) {
174+
// Nobody consumed the body — cancel it so the
175+
// underlying connection can be released.
176+
await context.response.body?.cancel();
177+
}
174178

175-
await (context as { [kBodyDrained]: Promise<void> })[kBodyDrained];
176-
},
177-
}),
179+
await (context as { [kBodyDrained]: Promise<void> })[kBodyDrained];
180+
},
178181
});
179182
}
180183

packages/http-crawler/src/internals/http-crawler.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ export class HttpCrawler<
410410
}
411411
}
412412

413-
protected buildContextPipeline(): ContextPipeline<CrawlingContext, InternalHttpCrawlingContext> {
413+
protected override buildContextPipeline(): ContextPipeline<CrawlingContext, InternalHttpCrawlingContext> {
414414
return ContextPipeline.create<CrawlingContext>()
415415
.compose({
416416
action: this.makeHttpRequest.bind(this),

packages/jsdom-crawler/src/internals/jsdom-crawler.ts

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -192,26 +192,30 @@ export class JSDOMCrawler<
192192
protected virtualConsole: VirtualConsole | null = null;
193193

194194
constructor(options: JSDOMCrawlerOptions<ContextExtension, ExtendedContext> = {}) {
195-
const { runScripts = false, hideInternalConsole = false, ...httpOptions } = options;
195+
const { runScripts = false, hideInternalConsole = false, contextPipelineBuilder, ...httpOptions } = options;
196196

197197
super({
198198
...httpOptions,
199-
contextPipelineBuilder: () =>
200-
this.buildContextPipeline()
201-
.compose({
202-
action: async (context) => await this.parseContent(context),
203-
cleanup: async (context) => {
204-
this.getVirtualConsole().off('jsdomError', this.jsdomErrorHandler);
205-
context.window?.close();
206-
},
207-
})
208-
.compose({ action: async (context) => await this.addHelpers(context) }),
199+
contextPipelineBuilder: contextPipelineBuilder ?? (() => this.buildContextPipeline()),
209200
});
210201

211202
this.runScripts = runScripts;
212203
this.hideInternalConsole = hideInternalConsole;
213204
}
214205

206+
protected override buildContextPipeline() {
207+
return super
208+
.buildContextPipeline()
209+
.compose({
210+
action: async (context) => await this.parseContent(context),
211+
cleanup: async (context) => {
212+
this.getVirtualConsole().off('jsdomError', this.jsdomErrorHandler);
213+
context.window?.close();
214+
},
215+
})
216+
.compose({ action: async (context) => await this.addHelpers(context) });
217+
}
218+
215219
/**
216220
* Returns the currently used `VirtualConsole` instance. Can be used to listen for the JSDOM's internal console messages.
217221
*

packages/linkedom-crawler/src/internals/linkedom-crawler.ts

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -167,17 +167,23 @@ export class LinkeDOMCrawler<
167167
private static parser = new DOMParser();
168168

169169
constructor(options: LinkeDOMCrawlerOptions<ContextExtension, ExtendedContext>) {
170+
const { contextPipelineBuilder, ...rest } = options;
171+
170172
super({
171-
...options,
172-
contextPipelineBuilder: () =>
173-
this.buildContextPipeline()
174-
.compose({
175-
action: async (context) => this.parseContent(context),
176-
})
177-
.compose({ action: async (context) => this.addHelpers(context) }),
173+
...rest,
174+
contextPipelineBuilder: contextPipelineBuilder ?? (() => this.buildContextPipeline()),
178175
});
179176
}
180177

178+
protected override buildContextPipeline() {
179+
return super
180+
.buildContextPipeline()
181+
.compose({
182+
action: async (context) => this.parseContent(context),
183+
})
184+
.compose({ action: async (context) => this.addHelpers(context) });
185+
}
186+
181187
private async parseContent(crawlingContext: InternalHttpCrawlingContext) {
182188
const isXml = crawlingContext.contentType.type.includes('xml');
183189
const document = LinkeDOMCrawler.parser.parseFromString(

packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts

Lines changed: 43 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ export interface AdaptivePlaywrightCrawlerContext<UserData extends Dictionary =
150150
*/
151151
parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
152152

153-
enqueueLinks(options?: EnqueueLinksOptions): Promise<void>;
153+
enqueueLinks(options?: EnqueueLinksOptions): Promise<unknown>;
154154
}
155155

156156
interface AdaptiveHook
@@ -299,17 +299,11 @@ export class AdaptivePlaywrightCrawler<
299299

300300
super({
301301
...rest,
302-
// Pass error handlers to the "main" crawler - we only pluck them from `rest` so that they don't go to the sub crawlers
303302
errorHandler,
304303
failedRequestHandler,
305-
// Same for request handler
306304
requestHandler,
307-
// The builder intentionally returns null so that it crashes the crawler when it tries to use this instead of one of the two specialized context pipelines
308-
// (that would be a logical error in this class)
309-
contextPipelineBuilder: () =>
310-
null as unknown as ContextPipeline<CrawlingContext, AdaptivePlaywrightCrawlerContext>,
305+
contextPipelineBuilder: contextPipelineBuilder ?? (() => this.buildContextPipeline()),
311306
});
312-
313307
this.individualRequestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
314308

315309
this.renderingTypePredictor =
@@ -408,6 +402,34 @@ export class AdaptivePlaywrightCrawler<
408402
return await super._init();
409403
}
410404

405+
protected override buildContextPipeline() {
406+
const errorMessage = (prop: string) =>
407+
`The \`${prop}\` property is not available on the outer context pipeline of AdaptivePlaywrightCrawler - it is provided by the inner (static/browser) pipelines`;
408+
409+
return super.buildContextPipeline().compose({
410+
action: async ({ request }) => ({
411+
get request(): LoadedRequest<Request<Dictionary>> {
412+
return request as LoadedRequest<Request<Dictionary>>;
413+
},
414+
get response(): Response {
415+
throw new Error(errorMessage('response'));
416+
},
417+
get page(): Page {
418+
throw new Error(errorMessage('page'));
419+
},
420+
get querySelector(): AdaptivePlaywrightCrawlerContext['querySelector'] {
421+
throw new Error(errorMessage('querySelector'));
422+
},
423+
get waitForSelector(): AdaptivePlaywrightCrawlerContext['waitForSelector'] {
424+
throw new Error(errorMessage('waitForSelector'));
425+
},
426+
get parseWithCheerio(): AdaptivePlaywrightCrawlerContext['parseWithCheerio'] {
427+
throw new Error(errorMessage('parseWithCheerio'));
428+
},
429+
}),
430+
});
431+
}
432+
411433
private async adaptCheerioContext(cheerioContext: CheerioCrawlingContext) {
412434
// Capture the original response to avoid infinite recursion when the getter is copied to the context
413435
const result = this.resultObjects.get(cheerioContext);
@@ -507,28 +529,29 @@ export class AdaptivePlaywrightCrawler<
507529
pushData: result.pushData,
508530
useState: this.allowStorageAccess(useStateFunction),
509531
getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore),
510-
enqueueLinks: async (options: SetRequired<EnqueueLinksOptions, 'urls'>) => {
511-
return await this.enqueueLinks(options, context.request, result);
512-
},
513532
log: this.createLogProxy(context.log, logs),
514533
registerDeferredCleanup: (cleanup: () => Promise<unknown>) => deferredCleanup.push(cleanup),
515534
};
516535

517-
const subCrawlerContext = { ...context, ...resultBoundContextHelpers };
536+
const subCrawlerContext = Object.defineProperties(
537+
{},
538+
Object.getOwnPropertyDescriptors(context),
539+
) as typeof context;
540+
541+
// Mark result-bound helpers as non-configurable so they survive the sub-crawler context pipeline
542+
// (which would otherwise override them with the sub-crawler's own versions, losing the result binding).
543+
for (const [key, descriptor] of Object.entries(Object.getOwnPropertyDescriptors(resultBoundContextHelpers))) {
544+
Object.defineProperty(subCrawlerContext, key, { ...descriptor, configurable: false });
545+
}
546+
518547
this.resultObjects.set(subCrawlerContext, result);
519548

520549
try {
521550
const callAdaptiveRequestHandler = async () => {
522551
if (renderingType === 'static') {
523-
await this.staticContextPipeline.call(
524-
subCrawlerContext,
525-
async (finalContext) => await this.requestHandler(finalContext),
526-
);
552+
await this.staticContextPipeline.call(subCrawlerContext, this.requestHandler.bind(this));
527553
} else if (renderingType === 'clientOnly') {
528-
await this.browserContextPipeline.call(
529-
subCrawlerContext,
530-
async (finalContext) => await this.requestHandler(finalContext),
531-
);
554+
await this.browserContextPipeline.call(subCrawlerContext, this.requestHandler.bind(this));
532555
}
533556
};
534557

packages/playwright-crawler/src/internals/playwright-crawler.ts

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ export class PlaywrightCrawler<
202202
constructor(options: PlaywrightCrawlerOptions<ExtendedContext> = {}) {
203203
ow(options, 'PlaywrightCrawlerOptions', ow.object.exactShape(PlaywrightCrawler.optionsShape));
204204

205-
const { launchContext = {}, headless, ...browserCrawlerOptions } = options;
205+
const { launchContext = {}, headless, contextPipelineBuilder, ...browserCrawlerOptions } = options;
206206

207207
const browserPoolOptions = {
208208
...options.browserPoolOptions,
@@ -234,11 +234,14 @@ export class PlaywrightCrawler<
234234
...(browserCrawlerOptions as PlaywrightCrawlerOptions<ExtendedContext>),
235235
launchContext,
236236
browserPoolOptions,
237-
contextPipelineBuilder: () =>
238-
this.buildContextPipeline().compose({ action: this.enhanceContext.bind(this) }),
237+
contextPipelineBuilder: contextPipelineBuilder ?? (() => this.buildContextPipeline()),
239238
});
240239
}
241240

241+
protected override buildContextPipeline() {
242+
return super.buildContextPipeline().compose({ action: this.enhanceContext.bind(this) });
243+
}
244+
242245
protected override async _navigationHandler(
243246
crawlingContext: PlaywrightCrawlingContext,
244247
gotoOptions: DirectNavigationOptions,

0 commit comments

Comments (0)