Skip to content

Commit c467d18

Browse files
authored
feat: decouple log configuration from @apify/log (#3399)
Adds a pluggable logger abstraction so users can swap @apify/log for any logger (Winston, Pino, Bunyan, etc.).

What's added:
- CrawleeLogger interface — the contract any logger must satisfy (13 methods: error, warning, info, debug, exception, softFail, warningOnce, deprecated, perf, logWithLevel, child, getOptions, setOptions)
- BaseCrawleeLogger abstract class — implement logWithLevel() + createChild(), get everything else for free
- ServiceLocator.getLogger() / setLogger() — injection point; defaults to @apify/log
1 parent 66c21eb commit c467d18

43 files changed

Lines changed: 866 additions & 212 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

docs/examples/file_download_stream.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import { pipeline, Transform } from 'stream';
22

3-
import { FileDownload, type Log } from 'crawlee';
3+
import { FileDownload, type CrawleeLogger } from 'crawlee';
44

55
// A sample Transform stream logging the download progress.
6-
function createProgressTracker({ url, log, totalBytes }: { url: URL; log: Log; totalBytes: number }) {
6+
function createProgressTracker({ url, log, totalBytes }: { url: URL; log: CrawleeLogger; totalBytes: number }) {
77
let downloadedBytes = 0;
88

99
return new Transform({

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 39 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import type {
66
AddRequestsBatchedResult,
77
AutoscaledPoolOptions,
88
Configuration,
9+
CrawleeLogger,
910
CrawlingContext,
1011
DatasetExportOptions,
1112
EnqueueLinksOptions,
@@ -40,6 +41,7 @@ import {
4041
EnqueueStrategy,
4142
EventType,
4243
KeyValueStore,
44+
LogLevel,
4345
mergeCookies,
4446
NonRetryableError,
4547
purgeDefaultStorages,
@@ -77,8 +79,6 @@ import { getDomain } from 'tldts';
7779
import type { ReadonlyDeep, SetRequired } from 'type-fest';
7880

7981
import { LruCache } from '@apify/datastructures';
80-
import type { Log } from '@apify/log';
81-
import defaultLog, { LogLevel } from '@apify/log';
8282
import { addTimeoutToPromise, TimeoutError, tryCancel } from '@apify/timeout';
8383
import { cryptoRandomObjectId } from '@apify/utilities';
8484

@@ -370,9 +370,6 @@ export interface BasicCrawlerOptions<
370370
*/
371371
onSkippedRequest?: SkippedRequestCallback;
372372

373-
/** @internal */
374-
log?: Log;
375-
376373
/**
377374
* Enables experimental features of Crawlee, which can alter the behavior of the crawler.
378375
* WARNING: these options are not guaranteed to be stable and may change or be removed at any time.
@@ -415,6 +412,12 @@ export interface BasicCrawlerOptions<
415412
*/
416413
eventManager?: EventManager;
417414

415+
/**
416+
* Custom logger to use for this crawler.
417+
* If provided, the crawler will use its own ServiceLocator instance instead of the global one.
418+
*/
419+
logger?: CrawleeLogger;
420+
418421
/**
419422
* A unique identifier for the crawler instance. This ID is used to isolate the state returned by
420423
* {@apilink BasicCrawler.useState|`crawler.useState()`} from other crawler instances.
@@ -586,7 +589,12 @@ export class BasicCrawler<
586589
running = false;
587590
hasFinishedBefore = false;
588591

589-
readonly log: Log;
592+
#log!: CrawleeLogger;
593+
594+
get log(): CrawleeLogger {
595+
return this.#log;
596+
}
597+
590598
protected requestHandler!: RequestHandler<ExtendedContext>;
591599
protected errorHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
592600
protected failedRequestHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
@@ -651,6 +659,7 @@ export class BasicCrawler<
651659
configuration: ow.optional.object,
652660
storageClient: ow.optional.object,
653661
eventManager: ow.optional.object,
662+
logger: ow.optional.object,
654663

655664
// AutoscaledPool shorthands
656665
minConcurrency: ow.optional.number,
@@ -659,7 +668,6 @@ export class BasicCrawler<
659668
keepAlive: ow.optional.boolean,
660669

661670
// internal
662-
log: ow.optional.object,
663671
experiments: ow.optional.object,
664672

665673
statisticsOptions: ow.optional.object,
@@ -695,6 +703,7 @@ export class BasicCrawler<
695703
configuration,
696704
storageClient,
697705
eventManager,
706+
logger,
698707

699708
// AutoscaledPool shorthands
700709
minConcurrency,
@@ -714,7 +723,6 @@ export class BasicCrawler<
714723
httpClient,
715724

716725
// internal
717-
log = defaultLog.child({ prefix: this.constructor.name }),
718726
experiments = {},
719727

720728
id,
@@ -730,15 +738,18 @@ export class BasicCrawler<
730738
if (
731739
storageClient ||
732740
eventManager ||
741+
logger ||
733742
(configuration !== undefined && configuration !== serviceLocator.getConfiguration())
734743
) {
735-
const scopedServiceLocator = new ServiceLocator(configuration, eventManager, storageClient);
744+
const scopedServiceLocator = new ServiceLocator(configuration, eventManager, storageClient, logger);
736745
serviceLocatorScope = bindMethodsToServiceLocator(scopedServiceLocator, this);
737746
}
738747

739748
try {
740749
serviceLocatorScope.enterScope();
741750

751+
this.#log = serviceLocator.getLogger().child({ prefix: this.constructor.name });
752+
742753
// Store whether the user explicitly provided an ID
743754
this.hasExplicitId = id !== undefined;
744755
// Store the user-provided ID, or generate a unique one for tracking purposes (not for state key)
@@ -793,7 +804,6 @@ export class BasicCrawler<
793804

794805
this.httpClient = httpClient ?? new GotScrapingHttpClient();
795806
this.proxyConfiguration = proxyConfiguration;
796-
this.log = log;
797807
this.statusMessageLoggingInterval = statusMessageLoggingInterval;
798808
this.statusMessageCallback = statusMessageCallback as StatusMessageCallback;
799809
this.domainAccessedTime = new Map();
@@ -833,19 +843,19 @@ export class BasicCrawler<
833843
this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
834844
this.maxSessionRotations = maxSessionRotations;
835845
this.stats = new Statistics({
836-
logMessage: `${log.getOptions().prefix} request statistics:`,
837-
log,
846+
logMessage: `${this.constructor.name} request statistics:`,
847+
log: this.log,
838848
...(this.hasExplicitId ? { id: this.crawlerId } : {}),
839849
...statisticsOptions,
840850
});
841851
this.sessionPoolOptions = {
842852
...sessionPoolOptions,
843-
log,
853+
log: this.log,
844854
};
845855
if (this.retryOnBlocked) {
846856
this.sessionPoolOptions.blockedStatusCodes = sessionPoolOptions.blockedStatusCodes ?? [];
847857
if (this.sessionPoolOptions.blockedStatusCodes.length !== 0) {
848-
log.warning(
858+
this.log.warning(
849859
`Both 'blockedStatusCodes' and 'retryOnBlocked' are set. Please note that the 'retryOnBlocked' feature might not work as expected.`,
850860
);
851861
}
@@ -854,7 +864,7 @@ export class BasicCrawler<
854864

855865
const maxSignedInteger = 2 ** 31 - 1;
856866
if (this.requestHandlerTimeoutMillis > maxSignedInteger) {
857-
log.warning(
867+
this.log.warning(
858868
`requestHandlerTimeoutMillis ${this.requestHandlerTimeoutMillis}` +
859869
` does not fit a signed 32-bit integer. Limiting the value to ${maxSignedInteger}`,
860870
);
@@ -885,7 +895,7 @@ export class BasicCrawler<
885895
isTaskReadyFunction: async () => {
886896
if (isMaxPagesExceeded()) {
887897
if (this.shouldLogMaxProcessedRequestsExceeded) {
888-
log.info(
898+
this.log.info(
889899
'Crawler reached the maxRequestsPerCrawl limit of ' +
890900
`${this.maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`,
891901
);
@@ -898,7 +908,7 @@ export class BasicCrawler<
898908
},
899909
isFinishedFunction: async () => {
900910
if (isMaxPagesExceeded()) {
901-
log.info(
911+
this.log.info(
902912
`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${this.maxRequestsPerCrawl} requests ` +
903913
'and all requests that were in progress at that time have now finished. ' +
904914
`In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`,
@@ -914,12 +924,12 @@ export class BasicCrawler<
914924
const reason = isFinishedFunction
915925
? "Crawler's custom isFinishedFunction() returned true, the crawler will shut down."
916926
: 'All requests from the queue have been processed, the crawler will shut down.';
917-
log.info(reason);
927+
this.log.info(reason);
918928
}
919929

920930
return isFinished;
921931
},
922-
log,
932+
log: this.log,
923933
};
924934

925935
this.autoscaledPoolOptions = { ...autoscaledPoolOptions, ...basicCrawlerAutoscaledPoolConfiguration };
@@ -944,7 +954,7 @@ export class BasicCrawler<
944954
async setStatusMessage(message: string, options: SetStatusMessageOptions = {}) {
945955
const data =
946956
options.isStatusMessageTerminal != null ? { terminal: options.isStatusMessageTerminal } : undefined;
947-
this.log.internal(LogLevel[(options.level as 'DEBUG') ?? 'DEBUG'], message, data);
957+
this.log.logWithLevel(LogLevel[(options.level as 'DEBUG') ?? 'DEBUG'], message, data);
948958

949959
const client = serviceLocator.getStorageClient();
950960

@@ -1099,7 +1109,7 @@ export class BasicCrawler<
10991109
retryHistogram: this.stats.requestRetryHistogram,
11001110
...finalStats,
11011111
};
1102-
this.log.info('Final request statistics:', stats);
1112+
this.log.info('Final request statistics:', stats as unknown as Record<string, unknown>);
11031113

11041114
if (this.stats.errorTracker.total !== 0) {
11051115
const prettify = ([count, info]: [number, string[]]) =>
@@ -1193,12 +1203,14 @@ export class BasicCrawler<
11931203
BasicCrawler.useStateCrawlerIds.add(this.crawlerId);
11941204

11951205
if (BasicCrawler.useStateCrawlerIds.size > 1) {
1196-
defaultLog.warningOnce(
1197-
'Multiple crawler instances are calling useState() without an explicit `id` option. \n' +
1198-
'This means they will share the same state object, which is likely unintended. \n' +
1199-
'To fix this, provide a unique `id` option to each crawler instance. \n' +
1200-
'Example: new BasicCrawler({ id: "my-crawler-1", ... })',
1201-
);
1206+
serviceLocator
1207+
.getLogger()
1208+
.warningOnce(
1209+
'Multiple crawler instances are calling useState() without an explicit `id` option. \n' +
1210+
'This means they will share the same state object, which is likely unintended. \n' +
1211+
'To fix this, provide a unique `id` option to each crawler instance. \n' +
1212+
'Example: new BasicCrawler({ id: "my-crawler-1", ... })',
1213+
);
12021214
}
12031215

12041216
return kvs.getAutoSavedValue<State>(BasicCrawler.CRAWLEE_STATE_KEY, defaultValue);

packages/browser-pool/src/abstract-classes/browser-controller.ts

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import { type CrawleeLogger, serviceLocator } from '@crawlee/core';
12
import type { Cookie, Dictionary } from '@crawlee/types';
23
import { nanoid } from 'nanoid';
34
import { TypedEmitter } from 'tiny-typed-emitter';
@@ -6,7 +7,6 @@ import { tryCancel } from '@apify/timeout';
67

78
import { BROWSER_CONTROLLER_EVENTS } from '../events.js';
89
import type { LaunchContext } from '../launch-context.js';
9-
import { log } from '../logger.js';
1010
import type { UnwrapPromise } from '../utils.js';
1111
import type { BrowserPlugin, CommonBrowser, CommonLibrary } from './browser-plugin.js';
1212

@@ -40,6 +40,7 @@ export abstract class BrowserController<
4040
NewPageResult = UnwrapPromise<ReturnType<LaunchResult['newPage']>>,
4141
> extends TypedEmitter<BrowserControllerEvents<Library, LibraryOptions, LaunchResult, NewPageOptions, NewPageResult>> {
4242
id = nanoid();
43+
protected log!: CrawleeLogger;
4344

4445
/**
4546
* The `BrowserPlugin` instance used to launch the browser.
@@ -90,6 +91,7 @@ export abstract class BrowserController<
9091

9192
constructor(browserPlugin: BrowserPlugin<Library, LibraryOptions, LaunchResult, NewPageOptions, NewPageResult>) {
9293
super();
94+
this.log = serviceLocator.getLogger().child({ prefix: 'BrowserPool' });
9395
this.browserPlugin = browserPlugin;
9496
}
9597

@@ -136,14 +138,14 @@ export abstract class BrowserController<
136138
// TODO: shouldn't this go in a finally instead?
137139
this.isActive = false;
138140
} catch (error) {
139-
log.debug(`Could not close browser.\nCause: ${(error as Error).message}`, { id: this.id });
141+
this.log.debug(`Could not close browser.\nCause: ${(error as Error).message}`, { id: this.id });
140142
}
141143

142144
this.emit(BROWSER_CONTROLLER_EVENTS.BROWSER_CLOSED, this);
143145

144146
setTimeout(() => {
145147
this._kill().catch((err) => {
146-
log.debug(`Could not kill browser.\nCause: ${err.message}`, { id: this.id });
148+
this.log.debug(`Could not kill browser.\nCause: ${err.message}`, { id: this.id });
147149
});
148150
}, PROCESS_KILL_TIMEOUT_MILLIS);
149151
}

packages/browser-pool/src/abstract-classes/browser-plugin.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { CriticalError } from '@crawlee/core';
1+
import { type CrawleeLogger, CriticalError, serviceLocator } from '@crawlee/core';
22
import type { Dictionary } from '@crawlee/types';
33
import merge from 'lodash.merge';
44

@@ -105,6 +105,7 @@ export abstract class BrowserPlugin<
105105
NewPageResult = UnwrapPromise<ReturnType<LaunchResult['newPage']>>,
106106
> {
107107
name = this.constructor.name;
108+
protected log!: CrawleeLogger;
108109
library: Library;
109110
launchOptions: LibraryOptions;
110111
proxyUrl?: string;
@@ -121,6 +122,7 @@ export abstract class BrowserPlugin<
121122
browserPerProxy = false,
122123
} = options;
123124

125+
this.log = serviceLocator.getLogger().child({ prefix: 'BrowserPool' });
124126
this.library = library;
125127
this.launchOptions = launchOptions;
126128
this.proxyUrl = proxyUrl && new URL(proxyUrl).href.slice(0, -1);

packages/browser-pool/src/browser-pool.ts

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import type { TieredProxy } from '@crawlee/core';
1+
import { type CrawleeLogger, serviceLocator, type TieredProxy } from '@crawlee/core';
22
import type { BrowserFingerprintWithHeaders } from 'fingerprint-generator';
33
import { FingerprintGenerator } from 'fingerprint-generator';
44
import { FingerprintInjector } from 'fingerprint-injector';
@@ -20,7 +20,6 @@ import {
2020
} from './fingerprinting/hooks.js';
2121
import type { FingerprintGeneratorOptions } from './fingerprinting/types.js';
2222
import type { LaunchContext } from './launch-context.js';
23-
import { log } from './logger.js';
2423
import type { InferBrowserPluginArray, UnwrapPromise } from './utils.js';
2524

2625
const PAGE_CLOSE_KILL_TIMEOUT_MILLIS = 1000;
@@ -334,9 +333,11 @@ export class BrowserPool<
334333
private browserRetireInterval?: NodeJS.Timeout;
335334

336335
private limiter = pLimit(1);
336+
private log!: CrawleeLogger;
337337

338338
constructor(options: Options & BrowserPoolHooks<BrowserControllerReturn, LaunchContextReturn, PageReturn>) {
339339
super();
340+
this.log = serviceLocator.getLogger().child({ prefix: 'BrowserPool' });
340341

341342
this.browserKillerInterval!.unref();
342343

@@ -708,7 +709,7 @@ export class BrowserPool<
708709
throw err;
709710
}
710711

711-
log.debug('Launched new browser.', { id: browserController.id });
712+
this.log.debug('Launched new browser.', { id: browserController.id });
712713
browserController.proxyTier = proxyTier;
713714
browserController.proxyUrl = proxyUrl;
714715

@@ -719,7 +720,7 @@ export class BrowserPool<
719720
} catch (err) {
720721
this.startingBrowserControllers.delete(browserController);
721722
browserController.close().catch((closeErr) => {
722-
log.error(`Could not close browser whose post-launch hooks failed.\nCause:${closeErr.message}`, {
723+
this.log.error(`Could not close browser whose post-launch hooks failed.\nCause:${closeErr.message}`, {
723724
id: browserController.id,
724725
});
725726
});
@@ -774,15 +775,15 @@ export class BrowserPool<
774775

775776
if (isBrowserIdle || isBrowserEmpty) {
776777
const { id } = controller;
777-
log.debug('Closing retired browser.', { id });
778+
this.log.debug('Closing retired browser.', { id });
778779
await controller.close();
779780
this.retiredBrowserControllers.delete(controller);
780781
closedBrowserIds.push(id);
781782
}
782783
}
783784

784785
if (closedBrowserIds.length) {
785-
log.debug('Closed retired browsers.', {
786+
this.log.debug('Closed retired browsers.', {
786787
count: closedBrowserIds.length,
787788
closedBrowserIds,
788789
});
@@ -798,7 +799,7 @@ export class BrowserPool<
798799
await this._executeHooks(this.prePageCloseHooks, page, browserController);
799800

800801
await originalPageClose.apply(page, args).catch((err: Error) => {
801-
log.debug(`Could not close page.\nCause:${err.message}`, { id: browserController.id });
802+
this.log.debug(`Could not close page.\nCause:${err.message}`, { id: browserController.id });
802803
});
803804

804805
await this._executeHooks(this.postPageCloseHooks, pageId, browserController);
@@ -821,7 +822,7 @@ export class BrowserPool<
821822
// Run this with a delay, otherwise page.close()
822823
// might fail with "Protocol error (Target.closeTarget): Target closed."
823824
setTimeout(() => {
824-
log.debug('Closing retired browser because it has no active pages', { id: browserController.id });
825+
this.log.debug('Closing retired browser because it has no active pages', { id: browserController.id });
825826
void browserController.close().finally(() => {
826827
this.retiredBrowserControllers.delete(browserController);
827828
});

packages/browser-pool/src/logger.ts

Lines changed: 0 additions & 5 deletions
This file was deleted.

0 commit comments

Comments (0)