Skip to content

Commit f6dac27

Browse files
authored
refactor: move blocked HTTP status code management to BasicCrawler (#3496)
Refactors the blocked HTTP status code handling so it's managed by the Crawler instance rather than `SessionPool`. Closes #2441
1 parent f2016c3 commit f6dac27

15 files changed

Lines changed: 159 additions & 219 deletions

File tree

docs/guides/session_management_basic.ts

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,6 @@ const crawler = new BasicCrawler({
3434
throw e;
3535
}
3636

37-
// Automatically retires the session based on response HTTP status code.
38-
session?.retireOnBlockedStatusCodes(response.status);
39-
4037
if ((await response.text()).includes('You are blocked!')) {
4138
// You are sure it is blocked.
4239
// This will throw away the session.

docs/upgrading/upgrading_v4.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,9 @@ const crawler = new CheerioCrawler({
8484

8585
Previously, the crawling context extended a `Record` type, which allowed access to any property. This was changed to a strict type, which means that you can only access properties that are defined in the context.
8686

87-
## `additionalBlockedStatusCodes` parameter is removed
87+
## `retireOnBlockedStatusCodes` is removed from `Session`
8888

89-
`additionalBlockedStatusCodes` parameter of `Session.retireOnBlockedStatusCodes` method is removed. Use the `blockedStatusCodes` crawler option instead.
89+
`Session.retireOnBlockedStatusCodes` is removed. Blocked status code handling is now internal to the crawler. Configure blocked status codes via the `blockedStatusCodes` crawler option (moved from `sessionPoolOptions`).
9090

9191
## Remove `experimentalContainers` option
9292

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 59 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import type {
3131
import {
3232
AutoscaledPool,
3333
bindMethodsToServiceLocator,
34+
BLOCKED_STATUS_CODES,
3435
ContextPipeline,
3536
ContextPipelineCleanupError,
3637
ContextPipelineInitializationError,
@@ -338,6 +339,12 @@ export interface BasicCrawlerOptions<
338339
*/
339340
statusMessageCallback?: StatusMessageCallback;
340341

342+
/**
343+
* HTTP status codes that indicate the session should be retired.
344+
* @default [401, 403, 429]
345+
*/
346+
blockedStatusCodes?: number[];
347+
341348
/**
342349
* If set to `true`, the crawler will automatically try to bypass any detected bot protection.
343350
*
@@ -426,6 +433,18 @@ export interface BasicCrawlerOptions<
426433
*
427434
*/
428435
id?: string;
436+
437+
/**
438+
* An array of HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be excluded from error consideration.
439+
* By default, status codes >= 500 trigger errors.
440+
*/
441+
ignoreHttpErrorStatusCodes?: number[];
442+
443+
/**
444+
* An array of additional HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be treated as errors.
445+
* By default, status codes >= 500 trigger errors.
446+
*/
447+
additionalHttpErrorStatusCodes?: number[];
429448
}
430449

431450
/**
@@ -546,7 +565,6 @@ export class BasicCrawler<
546565

547566
/**
548567
* A reference to the underlying {@apilink SessionPool} class that manages the crawler's {@apilink Session|sessions}.
549-
* Only available if used by the crawler.
550568
*/
551569
sessionPool?: SessionPool;
552570

@@ -624,6 +642,9 @@ export class BasicCrawler<
624642
protected statusMessageLoggingInterval: number;
625643
protected statusMessageCallback?: StatusMessageCallback;
626644
protected sessionPoolOptions: SessionPoolOptions;
645+
protected blockedStatusCodes = new Set<number>();
646+
protected additionalHttpErrorStatusCodes: Set<number>;
647+
protected ignoreHttpErrorStatusCodes: Set<number>;
627648
protected autoscaledPoolOptions: AutoscaledPoolOptions;
628649
protected httpClient: BaseHttpClient;
629650
protected retryOnBlocked: boolean;
@@ -666,6 +687,10 @@ export class BasicCrawler<
666687
statusMessageLoggingInterval: ow.optional.number,
667688
statusMessageCallback: ow.optional.function,
668689

690+
additionalHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
691+
ignoreHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
692+
693+
blockedStatusCodes: ow.optional.array.ofType(ow.number),
669694
retryOnBlocked: ow.optional.boolean,
670695
respectRobotsTxtFile: ow.optional.any(ow.boolean, ow.object),
671696
onSkippedRequest: ow.optional.function,
@@ -713,6 +738,9 @@ export class BasicCrawler<
713738
sessionPoolOptions = {},
714739
proxyConfiguration,
715740

741+
additionalHttpErrorStatusCodes = [],
742+
ignoreHttpErrorStatusCodes = [],
743+
716744
// Service locator options
717745
configuration,
718746
storageClient,
@@ -724,6 +752,7 @@ export class BasicCrawler<
724752
maxConcurrency,
725753
maxRequestsPerMinute,
726754

755+
blockedStatusCodes: blockedStatusCodesInput,
727756
retryOnBlocked = false,
728757
respectRobotsTxtFile = false,
729758
onSkippedRequest,
@@ -795,6 +824,9 @@ export class BasicCrawler<
795824
this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });
796825
this.handleSkippedRequest = this.handleSkippedRequest.bind(this);
797826

827+
this.additionalHttpErrorStatusCodes = new Set([...additionalHttpErrorStatusCodes]);
828+
this.ignoreHttpErrorStatusCodes = new Set([...ignoreHttpErrorStatusCodes]);
829+
798830
this.requestHandler = requestHandler ?? this.router;
799831
this.failedRequestHandler = failedRequestHandler;
800832
this.errorHandler = errorHandler;
@@ -836,14 +868,8 @@ export class BasicCrawler<
836868
...sessionPoolOptions,
837869
log: this.log,
838870
};
839-
if (this.retryOnBlocked) {
840-
this.sessionPoolOptions.blockedStatusCodes = sessionPoolOptions.blockedStatusCodes ?? [];
841-
if (this.sessionPoolOptions.blockedStatusCodes.length !== 0) {
842-
this.log.warning(
843-
`Both 'blockedStatusCodes' and 'retryOnBlocked' are set. Please note that the 'retryOnBlocked' feature might not work as expected.`,
844-
);
845-
}
846-
}
871+
this.blockedStatusCodes = new Set(blockedStatusCodesInput ?? BLOCKED_STATUS_CODES);
872+
847873
const maxSignedInteger = 2 ** 31 - 1;
848874
if (this.requestHandlerTimeoutMillis > maxSignedInteger) {
849875
this.log.warning(
@@ -979,6 +1005,19 @@ export class BasicCrawler<
9791005
}
9801006
}
9811007

1008+
/**
1009+
* Determines if the given HTTP status code is an error status code given
1010+
* the default behaviour and user-set preferences.
1011+
* @param status The HTTP status code to check.
1012+
* @returns `true` if the status code is considered an error, `false` otherwise
1013+
*/
1014+
protected isErrorStatusCode(status: number): boolean {
1015+
const excludeError = this.ignoreHttpErrorStatusCodes.has(status);
1016+
const includeError = this.additionalHttpErrorStatusCodes.has(status);
1017+
1018+
return (status >= 500 && !excludeError) || includeError;
1019+
}
1020+
9821021
/**
9831022
* Builds the basic context pipeline that transforms `{ request }` into a full `CrawlingContext`.
9841023
* This handles base context creation, session resolution, and context helpers.
@@ -1640,11 +1679,11 @@ export class BasicCrawler<
16401679
/**
16411680
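* Throws a `SessionError` when the given status code is one of the configured blocked status codes; does nothing when `retryOnBlocked` is enabled.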
16421681
*/
1643-
protected _throwOnBlockedRequest(session: Session, statusCode: number) {
1644-
const isBlocked = session.retireOnBlockedStatusCodes(statusCode);
1682+
protected _throwOnBlockedRequest(statusCode: number) {
1683+
if (this.retryOnBlocked) return;
16451684

1646-
if (isBlocked) {
1647-
throw new Error(`Request blocked - received ${statusCode} status code.`);
1685+
if (this.blockedStatusCodes.has(statusCode)) {
1686+
throw new SessionError(`Request blocked - received ${statusCode} status code.`);
16481687
}
16491688
}
16501689

@@ -2040,7 +2079,9 @@ export class BasicCrawler<
20402079
}
20412080

20422081
if (!request.noRetry) {
2043-
request.retryCount++;
2082+
if (!(error instanceof SessionError)) {
2083+
request.retryCount++;
2084+
}
20442085

20452086
const { url, retryCount, id } = request;
20462087

@@ -2058,6 +2099,10 @@ export class BasicCrawler<
20582099
}
20592100
}
20602101

2102+
if (error instanceof SessionError) {
2103+
crawlingContext.session?.retire();
2104+
}
2105+
20612106
// If the request is non-retryable, the error and snapshot aren't saved in the errorTrackerRetry object.
20622107
// Therefore, we pass the crawlingContext to the errorTracker.add method, enabling snapshot capture.
20632108
// This is to make sure the error snapshot is not duplicated in the errorTrackerRetry and errorTracker objects.

packages/browser-crawler/src/internals/browser-crawler.ts

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ import type {
1515
} from '@crawlee/basic';
1616
import {
1717
BasicCrawler,
18-
BLOCKED_STATUS_CODES as DEFAULT_BLOCKED_STATUS_CODES,
1918
ContextPipeline,
2019
cookieStringToToughCookie,
2120
enqueueLinks,
@@ -429,13 +428,6 @@ export abstract class BrowserCrawler<
429428
): Promise<string | false> {
430429
const { page, response } = crawlingContext;
431430

432-
const blockedStatusCodes =
433-
// eslint-disable-next-line dot-notation
434-
(this.sessionPool?.['blockedStatusCodes'].length ?? 0) > 0
435-
? // eslint-disable-next-line dot-notation
436-
this.sessionPool!['blockedStatusCodes']
437-
: DEFAULT_BLOCKED_STATUS_CODES;
438-
439431
// Cloudflare specific heuristic - wait 5 seconds if we get a 403 for the JS challenge to load / resolve.
440432
if ((await this.containsSelectors(page, CLOUDFLARE_RETRY_CSS_SELECTORS)) && response?.status() === 403) {
441433
await sleep(5000);
@@ -448,10 +440,10 @@ export abstract class BrowserCrawler<
448440
}
449441

450442
const foundSelectors = await this.containsSelectors(page, RETRY_CSS_SELECTORS);
451-
const blockedStatusCode = blockedStatusCodes.find((x) => x === (response?.status() ?? 0));
443+
const statusCode = response?.status() ?? 0;
452444

453445
if (foundSelectors) return `Found selectors: ${foundSelectors.join(', ')}`;
454-
if (blockedStatusCode) return `Received blocked status code: ${blockedStatusCode}`;
446+
if (this.blockedStatusCodes.has(statusCode)) return `Received blocked status code: ${statusCode}`;
455447

456448
return false;
457449
}
@@ -651,11 +643,19 @@ export abstract class BrowserCrawler<
651643
const status: number = response.status();
652644

653645
this.stats.registerStatusCode(status);
646+
647+
if (this.isErrorStatusCode(status)) {
648+
if (this.additionalHttpErrorStatusCodes.has(status)) {
649+
throw new Error(`${status} - Error status code was set by user.`);
650+
}
651+
652+
throw new Error(`${status} - Internal Server Error`);
653+
}
654654
}
655655

656656
if (this.sessionPool && response && session) {
657657
if (typeof response === 'object' && typeof response.status === 'function') {
658-
this._throwOnBlockedRequest(session, response.status());
658+
this._throwOnBlockedRequest(response.status());
659659
} else {
660660
this.log.debug('Got a malformed Browser response.', { request, response });
661661
}

packages/core/src/session_pool/session.ts

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -288,23 +288,6 @@ export class Session implements ISession {
288288
this._maybeSelfRetire();
289289
}
290290

291-
/**
292-
* With certain status codes: `401`, `403` or `429` we can be certain
293-
* that the target website is blocking us. This function helps to do this conveniently
294-
* by retiring the session when such code is received. Optionally, the default status
295-
* codes can be extended in the second parameter.
296-
* @param statusCode HTTP status code.
297-
* @returns Whether the session was retired.
298-
*/
299-
retireOnBlockedStatusCodes(statusCode: number): boolean {
300-
// eslint-disable-next-line dot-notation -- accessing private property
301-
const isBlocked = this.sessionPool['blockedStatusCodes'].includes(statusCode);
302-
if (isBlocked) {
303-
this.retire();
304-
}
305-
return isBlocked;
306-
}
307-
308291
/**
309292
* Saves cookies from an HTTP response to be used with the session.
310293
* It expects an object with a `headers` property that's either an `Object`

packages/core/src/session_pool/session_pool.ts

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import { EventType } from '../events/event_manager.js';
1010
import type { CrawleeLogger } from '../log.js';
1111
import { serviceLocator } from '../service_locator.js';
1212
import { KeyValueStore } from '../storages/key_value_store.js';
13-
import { BLOCKED_STATUS_CODES, MAX_POOL_SIZE, PERSIST_STATE_KEY } from './consts.js';
13+
import { MAX_POOL_SIZE, PERSIST_STATE_KEY } from './consts.js';
1414
import type { SessionOptions } from './session.js';
1515
import { Session } from './session.js';
1616

@@ -51,13 +51,6 @@ export interface SessionPoolOptions {
5151
*/
5252
createSessionFunction?: CreateSession;
5353

54-
/**
55-
* Specifies which response status codes are considered as blocked.
56-
* Session connected to such request will be marked as retired.
57-
* @default [401, 403, 429]
58-
*/
59-
blockedStatusCodes?: number[];
60-
6154
/** @internal */
6255
log?: CrawleeLogger;
6356

@@ -133,7 +126,6 @@ export class SessionPool extends EventEmitter {
133126
protected persistStateKey: string;
134127
protected _listener!: () => Promise<void>;
135128
protected events: EventManager;
136-
protected readonly blockedStatusCodes: number[];
137129
protected persistenceOptions: PersistenceOptions;
138130
protected isInitialized = false;
139131

@@ -153,7 +145,6 @@ export class SessionPool extends EventEmitter {
153145
persistStateKey: ow.optional.string,
154146
createSessionFunction: ow.optional.function,
155147
sessionOptions: ow.optional.object,
156-
blockedStatusCodes: ow.optional.array.ofType(ow.number),
157148
log: ow.optional.object,
158149
persistenceOptions: ow.optional.object,
159150
}),
@@ -165,14 +156,12 @@ export class SessionPool extends EventEmitter {
165156
persistStateKey = PERSIST_STATE_KEY,
166157
createSessionFunction,
167158
sessionOptions = {},
168-
blockedStatusCodes = BLOCKED_STATUS_CODES,
169159
log = serviceLocator.getLogger(),
170160
persistenceOptions = {
171161
enable: true,
172162
},
173163
} = options;
174164

175-
this.blockedStatusCodes = blockedStatusCodes;
176165
this.events = serviceLocator.getEventManager();
177166
this.log = log.child({ prefix: 'SessionPool' });
178167
this.persistenceOptions = persistenceOptions;

0 commit comments

Comments
 (0)