@@ -6,7 +6,7 @@
 from datetime import timedelta
 from functools import partial
 from logging import getLogger
-from typing import TYPE_CHECKING, AsyncGenerator, Awaitable, Callable, Generic, Sequence, Union, cast
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, Generic, Sequence, Union, cast

 import httpx
 from tldextract import TLDExtract
@@ -30,7 +30,6 @@
 from crawlee.basic_crawler.router import Router
 from crawlee.basic_crawler.types import (
     BasicCrawlingContext,
-    FinalStatistics,
     RequestHandlerRunResult,
     SendRequestFunction,
 )
@@ -40,13 +39,15 @@
 from crawlee.http_clients.httpx_client import HttpxClient
 from crawlee.models import BaseRequestData, Request, RequestState
 from crawlee.sessions import SessionPool
+from crawlee.statistics.statistics import Statistics
 from crawlee.storages.request_queue import RequestQueue

 if TYPE_CHECKING:
     import re

     from crawlee.http_clients.base_http_client import BaseHttpClient, HttpResponse
     from crawlee.sessions.session import Session
+    from crawlee.statistics.models import FinalStatistics, StatisticsState
     from crawlee.storages.request_provider import RequestProvider

 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
@@ -70,6 +71,7 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     session_pool: NotRequired[SessionPool]
     use_session_pool: NotRequired[bool]
     retry_on_blocked: NotRequired[bool]
+    statistics: NotRequired[Statistics[StatisticsState]]
     _context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]]

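A minimal usage sketch of the new option (assumptions: `BasicCrawler` is importable from `crawlee.basic_crawler.basic_crawler`, and `Statistics` can be constructed without an explicit `event_manager`; `log_message` is the only `Statistics` keyword this diff shows):

    from crawlee.basic_crawler.basic_crawler import BasicCrawler
    from crawlee.statistics.statistics import Statistics

    # Pass a preconfigured Statistics instance; otherwise __init__ builds a
    # default one tied to the crawler's event manager (see below).
    crawler = BasicCrawler(
        statistics=Statistics(log_message='my-crawler request statistics'),
    )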
@@ -98,6 +100,7 @@ def __init__(
         session_pool: SessionPool | None = None,
         use_session_pool: bool = True,
         retry_on_blocked: bool = True,
+        statistics: Statistics | None = None,
         _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
     ) -> None:
         """Initialize the BasicCrawler.
@@ -114,8 +117,9 @@ def __init__(
             configuration: Crawler configuration
             request_handler_timeout: How long is a single request handler allowed to run
             use_session_pool: Enables using the session pool for crawling
-            session_pool: A preconfigured SessionPool instance if you wish to use non-default configuration
+            session_pool: A preconfigured `SessionPool` instance if you wish to use a non-default configuration
             retry_on_blocked: If set to True, the crawler will try to automatically bypass any detected bot protection
+            statistics: A preconfigured `Statistics` instance if you wish to use a non-default configuration
             _context_pipeline: Allows extending the request lifecycle and modifying the crawling context.
                 This parameter is meant to be used by child classes, not when BasicCrawler is instantiated directly.
         """
@@ -165,6 +169,14 @@ def __init__(

         self._retry_on_blocked = retry_on_blocked

+        self._statistics = statistics or Statistics(
+            event_manager=self._event_manager,
+            log_message=f'{logger.name} request statistics',
+        )
+
+        self._running = False
+        self._has_finished_before = False
+
     @property
     def router(self) -> Router[TCrawlingContext]:
         """The router used to handle each individual crawling request."""
@@ -180,6 +192,11 @@ def router(self, router: Router[TCrawlingContext]) -> None:

         self._router = router

+    @property
+    def statistics(self) -> Statistics[StatisticsState]:
+        """Statistics about the current (or last) crawler run."""
+        return self._statistics
+
     async def _get_session(self) -> Session | None:
         """If session pool is being used, try to take a session from it."""
         if not self._use_session_pool:
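The new read-only property makes run metrics observable from outside the crawler. A small sketch, using only the `error_tracker` counters that appear later in this diff:

    # The same counters the crawler logs in its 'Error analysis' summary
    # at the end of run() (see below).
    tracker = crawler.statistics.error_tracker
    print(tracker.total, tracker.unique_error_count)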
@@ -235,19 +252,43 @@ async def add_requests(

     async def run(self, requests: list[str | BaseRequestData] | None = None) -> FinalStatistics:
         """Run the crawler until all requests are processed."""
+        if self._running:
+            raise RuntimeError(
+                'This crawler instance is already running, you can add more requests to it via `crawler.add_requests()`'
+            )
+
+        self._running = True
+
+        if self._has_finished_before:
+            await self._statistics.reset()
+
+            if self._use_session_pool:
+                await self._session_pool.reset_store()
+
         if requests is not None:
             await self.add_requests(requests)

         async with AsyncExitStack() as exit_stack:
             await exit_stack.enter_async_context(self._event_manager)
             await exit_stack.enter_async_context(self._snapshotter)
+            await exit_stack.enter_async_context(self._statistics)

             if self._use_session_pool:
                 await exit_stack.enter_async_context(self._session_pool)

             await self._pool.run()

-        return FinalStatistics()
+        if self._statistics.error_tracker.total > 0:
+            logger.info(
+                'Error analysis:'
+                f' total_errors={self._statistics.error_tracker.total}'
+                f' unique_errors={self._statistics.error_tracker.unique_error_count}'
+            )
+
+        self._running = False
+        self._has_finished_before = True
+
+        return self._statistics.calculate()

     def _should_retry_request(self, crawling_context: BasicCrawlingContext, error: Exception) -> bool:
         if crawling_context.request.no_retry:
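Taken together, `run()` now rejects concurrent invocation, resets statistics and the session pool store between consecutive runs, and returns real numbers instead of an empty `FinalStatistics`. A behavioural sketch (assuming a request handler is registered on `crawler.router`; the exact `FinalStatistics` fields live in `crawlee.statistics.models` and are not shown in this diff):

    import asyncio

    async def main() -> None:
        crawler = BasicCrawler()

        final = await crawler.run(['https://example.com'])  # Statistics.calculate()
        print(final)

        # A second, sequential run is fine: _has_finished_before triggers the
        # statistics and session pool resets at the top of run().
        await crawler.run(['https://example.org'])

        # Only a run() issued while another is still in progress raises:
        # RuntimeError: This crawler instance is already running, ...

    asyncio.run(main())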
@@ -298,13 +339,13 @@ def _check_enqueue_strategy(
         if strategy == EnqueueStrategy.ALL:
             return True

-        assert_never()
+        assert_never(strategy)

     def _check_url_patterns(
         self,
         target_url: httpx.URL,
-        include: Sequence[re.Pattern | Glob] | None,
-        exclude: Sequence[re.Pattern | Glob] | None,
+        include: Sequence[re.Pattern[Any] | Glob] | None,
+        exclude: Sequence[re.Pattern[Any] | Glob] | None,
     ) -> bool:
         """Check if a URL matches configured include/exclude patterns."""
         # If the URL matches any `exclude` pattern, reject it
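The `assert_never` fix is more than cosmetic: the old zero-argument call would simply raise `TypeError` at runtime and gives the type checker nothing to verify, while passing the narrowed value lets it prove every `EnqueueStrategy` branch is handled. An illustrative stub (not the crawlee definitions; `assert_never` lives in `typing` on Python 3.11+, in `typing_extensions` before that):

    from enum import Enum
    from typing import assert_never  # typing_extensions on older Pythons

    class Color(Enum):  # stand-in enum, not EnqueueStrategy
        RED = 'red'

    def describe(color: Color) -> str:
        if color is Color.RED:
            return 'warm'
        # If a member is added and not handled above, `color` no longer
        # narrows to Never here and the type checker flags this call.
        assert_never(color)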
@@ -332,10 +373,11 @@ def _check_url_patterns(

     async def _handle_request_error(self, crawling_context: TCrawlingContext, error: Exception) -> None:
         request_provider = await self.get_request_provider()
+        request = crawling_context.request

         if self._should_retry_request(crawling_context, error):
-            request = crawling_context.request
             request.retry_count += 1
+            self._statistics.error_tracker.add(error)

             if self._error_handler:
                 try:
@@ -357,9 +399,11 @@ async def _handle_request_error(self, crawling_context: TCrawlingContext, error:
                 max_retries=3,
             )
             await self._handle_failed_request(crawling_context, error)
+            self._statistics.record_request_processing_failure(request.id or request.unique_key)

     async def _handle_failed_request(self, crawling_context: TCrawlingContext, error: Exception) -> None:
         logger.exception('Request failed and reached maximum retries', exc_info=error)
+        self._statistics.error_tracker.add(error)

         if self._failed_request_handler:
             try:
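With these two hooks, every raised error is counted: once per retried attempt in `_handle_request_error` and once more on the terminal attempt in `_handle_failed_request`. Those counts feed the 'Error analysis' summary in `run()`. In isolation (only `add`, `total` and `unique_error_count` appear in this diff; how `ErrorTracker` groups duplicates is assumed):

    tracker = crawler.statistics.error_tracker
    tracker.add(ValueError('boom'))
    tracker.add(ValueError('boom'))  # the same failure on a retry
    # total counts occurrences; unique_error_count collapses repeats
    print(tracker.total, tracker.unique_error_count)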
@@ -441,6 +485,9 @@ async def __run_task_function(self) -> None:  # noqa: PLR0912
             add_requests=result.add_requests,
         )

+        statistics_id = request.id or request.unique_key
+        self._statistics.record_request_processing_start(statistics_id)
+
         try:
             request.state = RequestState.REQUEST_HANDLER

@@ -467,6 +514,8 @@ async def __run_task_function(self) -> None:  # noqa: PLR0912

             if crawling_context.session:
                 crawling_context.session.mark_good()
+
+            self._statistics.record_request_processing_finish(statistics_id)
         except RequestHandlerError as primary_error:
             primary_error = cast(
                 RequestHandlerError[TCrawlingContext], primary_error
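The recording calls pair up per request: `record_request_processing_start` fires before the handler, and exactly one of `..._finish` (the success path above) or `..._failure` (the error paths below) closes it out. The shape of that contract as a hand-driven sketch (`Statistics` is normally entered as an async context by `run()` and driven by the crawler, not called directly like this):

    stats = crawler.statistics
    statistics_id = 'req-1'  # the crawler uses request.id or request.unique_key

    stats.record_request_processing_start(statistics_id)
    try:
        ...  # request handler work happens here
        stats.record_request_processing_finish(statistics_id)
    except Exception as exc:
        stats.error_tracker.add(exc)
        stats.record_request_processing_failure(statistics_id)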
@@ -514,6 +563,7 @@ async def __run_task_function(self) -> None:  # noqa: PLR0912
                 crawling_context.request.session_rotation_count += 1

                 await request_provider.reclaim_request(request)
+                self._statistics.error_tracker_retry.add(session_error)
             else:
                 logger.exception('Request failed and reached maximum retries', exc_info=session_error)

@@ -525,6 +575,9 @@ async def __run_task_function(self) -> None:  # noqa: PLR0912
                     logger=logger,
                     max_retries=3,
                 )
+
+                self._statistics.record_request_processing_failure(statistics_id)
+                self._statistics.error_tracker.add(session_error)
         except ContextPipelineInterruptedError as interrupted_error:
             logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)
