@@ -31,6 +31,7 @@ import type {
3131import {
3232 AutoscaledPool ,
3333 bindMethodsToServiceLocator ,
34+ BLOCKED_STATUS_CODES ,
3435 ContextPipeline ,
3536 ContextPipelineCleanupError ,
3637 ContextPipelineInitializationError ,
@@ -338,6 +339,12 @@ export interface BasicCrawlerOptions<
338339 */
339340 statusMessageCallback ?: StatusMessageCallback ;
340341
342+ /**
 343+ * HTTP status codes that indicate the request was blocked, causing the session to be retired and the request retried with a new session.
344+ * @default [401, 403, 429]
345+ */
346+ blockedStatusCodes ?: number [ ] ;
347+
341348 /**
342349 * If set to `true`, the crawler will automatically try to bypass any detected bot protection.
343350 *
@@ -426,6 +433,18 @@ export interface BasicCrawlerOptions<
426433 *
427434 */
428435 id ?: string ;
436+
437+ /**
438+ * An array of HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be excluded from error consideration.
439+ * By default, status codes >= 500 trigger errors.
440+ */
441+ ignoreHttpErrorStatusCodes ?: number [ ] ;
442+
443+ /**
444+ * An array of additional HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be treated as errors.
445+ * By default, status codes >= 500 trigger errors.
446+ */
447+ additionalHttpErrorStatusCodes ?: number [ ] ;
429448}
430449
431450/**
@@ -546,7 +565,6 @@ export class BasicCrawler<
546565
547566 /**
548567 * A reference to the underlying {@apilink SessionPool} class that manages the crawler's {@apilink Session|sessions}.
549- * Only available if used by the crawler.
550568 */
551569 sessionPool ?: SessionPool ;
552570
@@ -624,6 +642,9 @@ export class BasicCrawler<
624642 protected statusMessageLoggingInterval : number ;
625643 protected statusMessageCallback ?: StatusMessageCallback ;
626644 protected sessionPoolOptions : SessionPoolOptions ;
645+ protected blockedStatusCodes = new Set < number > ( ) ;
646+ protected additionalHttpErrorStatusCodes : Set < number > ;
647+ protected ignoreHttpErrorStatusCodes : Set < number > ;
627648 protected autoscaledPoolOptions : AutoscaledPoolOptions ;
628649 protected httpClient : BaseHttpClient ;
629650 protected retryOnBlocked : boolean ;
@@ -666,6 +687,10 @@ export class BasicCrawler<
666687 statusMessageLoggingInterval : ow . optional . number ,
667688 statusMessageCallback : ow . optional . function ,
668689
690+ additionalHttpErrorStatusCodes : ow . optional . array . ofType ( ow . number ) ,
691+ ignoreHttpErrorStatusCodes : ow . optional . array . ofType ( ow . number ) ,
692+
693+ blockedStatusCodes : ow . optional . array . ofType ( ow . number ) ,
669694 retryOnBlocked : ow . optional . boolean ,
670695 respectRobotsTxtFile : ow . optional . any ( ow . boolean , ow . object ) ,
671696 onSkippedRequest : ow . optional . function ,
@@ -713,6 +738,9 @@ export class BasicCrawler<
713738 sessionPoolOptions = { } ,
714739 proxyConfiguration,
715740
741+ additionalHttpErrorStatusCodes = [ ] ,
742+ ignoreHttpErrorStatusCodes = [ ] ,
743+
716744 // Service locator options
717745 configuration,
718746 storageClient,
@@ -724,6 +752,7 @@ export class BasicCrawler<
724752 maxConcurrency,
725753 maxRequestsPerMinute,
726754
755+ blockedStatusCodes : blockedStatusCodesInput ,
727756 retryOnBlocked = false ,
728757 respectRobotsTxtFile = false ,
729758 onSkippedRequest,
@@ -795,6 +824,9 @@ export class BasicCrawler<
795824 this . robotsTxtFileCache = new LruCache ( { maxLength : 1000 } ) ;
796825 this . handleSkippedRequest = this . handleSkippedRequest . bind ( this ) ;
797826
827+ this . additionalHttpErrorStatusCodes = new Set ( [ ...additionalHttpErrorStatusCodes ] ) ;
828+ this . ignoreHttpErrorStatusCodes = new Set ( [ ...ignoreHttpErrorStatusCodes ] ) ;
829+
798830 this . requestHandler = requestHandler ?? this . router ;
799831 this . failedRequestHandler = failedRequestHandler ;
800832 this . errorHandler = errorHandler ;
@@ -836,14 +868,8 @@ export class BasicCrawler<
836868 ...sessionPoolOptions ,
837869 log : this . log ,
838870 } ;
839- if ( this . retryOnBlocked ) {
840- this . sessionPoolOptions . blockedStatusCodes = sessionPoolOptions . blockedStatusCodes ?? [ ] ;
841- if ( this . sessionPoolOptions . blockedStatusCodes . length !== 0 ) {
842- this . log . warning (
843- `Both 'blockedStatusCodes' and 'retryOnBlocked' are set. Please note that the 'retryOnBlocked' feature might not work as expected.` ,
844- ) ;
845- }
846- }
871+ this . blockedStatusCodes = new Set ( blockedStatusCodesInput ?? BLOCKED_STATUS_CODES ) ;
872+
847873 const maxSignedInteger = 2 ** 31 - 1 ;
848874 if ( this . requestHandlerTimeoutMillis > maxSignedInteger ) {
849875 this . log . warning (
@@ -979,6 +1005,19 @@ export class BasicCrawler<
9791005 }
9801006 }
9811007
1008+ /**
1009+ * Determines if the given HTTP status code is an error status code given
1010+ * the default behaviour and user-set preferences.
 1011+ * @param status The HTTP response status code to evaluate.
1012+ * @returns `true` if the status code is considered an error, `false` otherwise
1013+ */
1014+ protected isErrorStatusCode ( status : number ) : boolean {
1015+ const excludeError = this . ignoreHttpErrorStatusCodes . has ( status ) ;
1016+ const includeError = this . additionalHttpErrorStatusCodes . has ( status ) ;
1017+
1018+ return ( status >= 500 && ! excludeError ) || includeError ;
1019+ }
1020+
9821021 /**
9831022 * Builds the basic context pipeline that transforms `{ request }` into a full `CrawlingContext`.
9841023 * This handles base context creation, session resolution, and context helpers.
@@ -1640,11 +1679,11 @@ export class BasicCrawler<
16401679 /**
16411680 * Handles blocked request
16421681 */
1643- protected _throwOnBlockedRequest ( session : Session , statusCode : number ) {
1644- const isBlocked = session . retireOnBlockedStatusCodes ( statusCode ) ;
1682+ protected _throwOnBlockedRequest ( statusCode : number ) {
1683+ if ( this . retryOnBlocked ) return ;
16451684
1646- if ( isBlocked ) {
1647- throw new Error ( `Request blocked - received ${ statusCode } status code.` ) ;
1685+ if ( this . blockedStatusCodes . has ( statusCode ) ) {
1686+ throw new SessionError ( `Request blocked - received ${ statusCode } status code.` ) ;
16481687 }
16491688 }
16501689
@@ -2040,7 +2079,9 @@ export class BasicCrawler<
20402079 }
20412080
20422081 if ( ! request . noRetry ) {
2043- request . retryCount ++ ;
2082+ if ( ! ( error instanceof SessionError ) ) {
2083+ request . retryCount ++ ;
2084+ }
20442085
20452086 const { url, retryCount, id } = request ;
20462087
@@ -2058,6 +2099,10 @@ export class BasicCrawler<
20582099 }
20592100 }
20602101
2102+ if ( error instanceof SessionError ) {
2103+ crawlingContext . session ?. retire ( ) ;
2104+ }
2105+
20612106 // If the request is non-retryable, the error and snapshot aren't saved in the errorTrackerRetry object.
20622107 // Therefore, we pass the crawlingContext to the errorTracker.add method, enabling snapshot capture.
20632108 // This is to make sure the error snapshot is not duplicated in the errorTrackerRetry and errorTracker objects.