Commit eeebe9b

janbuchar and vdusek authored

feat: Capture statistics about the crawler run (#142)

- closes #97

Co-authored-by: Vlada Dusek <v.dusek96@gmail.com>
1 parent 56ee407 commit eeebe9b

16 files changed: 589 additions & 17 deletions

pyproject.toml

Lines changed: 8 additions & 0 deletions
@@ -195,3 +195,11 @@ exclude_lines = [
     "if TYPE_CHECKING:",
     "assert_never()"
 ]
+
+[tool.basedpyright]
+reportPrivateLocalImportUsage = false
+reportUnusedCallResult = false
+reportUnusedVariable = false
+reportCallInDefaultInitializer = false
+reportImplicitStringConcatenation = false
+reportAny = false

src/crawlee/_utils/models.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+from datetime import timedelta
+from typing import Annotated, Any, Callable
+
+from pydantic import PlainSerializer, WrapValidator
+
+"""Utility types for Pydantic models."""
+
+
+def _timedelta_to_ms(td: timedelta | None) -> Any:
+    if td == timedelta.max:
+        return float('inf')
+
+    if td is None:
+        return td
+
+    return int(round(td.total_seconds() * 1000))
+
+
+def _timedelta_from_ms(value: float | timedelta | Any | None, handler: Callable[[Any], Any]) -> Any:
+    if value == float('inf'):
+        return timedelta.max
+
+    if not isinstance(value, (int, float)):
+        return handler(value)
+
+    return timedelta(milliseconds=value)
+
+
+timedelta_ms = Annotated[timedelta, PlainSerializer(_timedelta_to_ms), WrapValidator(_timedelta_from_ms)]

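The new `timedelta_ms` type round-trips `timedelta` values through integer milliseconds, with `float('inf')` standing in for `timedelta.max`. A minimal sketch of how it behaves inside a Pydantic model; the `ExampleModel` name is hypothetical, only the `timedelta_ms` import comes from this commit:

from datetime import timedelta

from pydantic import BaseModel

from crawlee._utils.models import timedelta_ms


class ExampleModel(BaseModel):  # hypothetical model, for illustration only
    timeout: timedelta_ms


model = ExampleModel(timeout=1500)  # the wrap validator accepts plain milliseconds
assert model.timeout == timedelta(milliseconds=1500)
assert model.model_dump() == {'timeout': 1500}  # the plain serializer emits milliseconds again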
src/crawlee/basic_crawler/basic_crawler.py

Lines changed: 61 additions & 8 deletions
@@ -6,7 +6,7 @@
 from datetime import timedelta
 from functools import partial
 from logging import getLogger
-from typing import TYPE_CHECKING, AsyncGenerator, Awaitable, Callable, Generic, Sequence, Union, cast
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, Generic, Sequence, Union, cast
 
 import httpx
 from tldextract import TLDExtract
@@ -30,7 +30,6 @@
 from crawlee.basic_crawler.router import Router
 from crawlee.basic_crawler.types import (
     BasicCrawlingContext,
-    FinalStatistics,
     RequestHandlerRunResult,
     SendRequestFunction,
 )
@@ -40,13 +39,15 @@
 from crawlee.http_clients.httpx_client import HttpxClient
 from crawlee.models import BaseRequestData, Request, RequestState
 from crawlee.sessions import SessionPool
+from crawlee.statistics.statistics import Statistics
 from crawlee.storages.request_queue import RequestQueue
 
 if TYPE_CHECKING:
     import re
 
     from crawlee.http_clients.base_http_client import BaseHttpClient, HttpResponse
     from crawlee.sessions.session import Session
+    from crawlee.statistics.models import FinalStatistics, StatisticsState
     from crawlee.storages.request_provider import RequestProvider
 
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
@@ -70,6 +71,7 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     session_pool: NotRequired[SessionPool]
     use_session_pool: NotRequired[bool]
     retry_on_blocked: NotRequired[bool]
+    statistics: NotRequired[Statistics[StatisticsState]]
     _context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]]
 
 
@@ -98,6 +100,7 @@ def __init__(
         session_pool: SessionPool | None = None,
         use_session_pool: bool = True,
         retry_on_blocked: bool = True,
+        statistics: Statistics | None = None,
         _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
     ) -> None:
         """Initialize the BasicCrawler.
@@ -114,8 +117,9 @@ def __init__(
             configuration: Crawler configuration
             request_handler_timeout: How long is a single request handler allowed to run
             use_session_pool: Enables using the session pool for crawling
-            session_pool: A preconfigured SessionPool instance if you wish to use non-default configuration
+            session_pool: A preconfigured `SessionPool` instance if you wish to use non-default configuration
             retry_on_blocked: If set to True, the crawler will try to automatically bypass any detected bot protection
+            statistics: A preconfigured `Statistics` instance if you wish to use non-default configuration
             _context_pipeline: Allows extending the request lifecycle and modifying the crawling context.
                 This parameter is meant to be used by child classes, not when BasicCrawler is instantiated directly.
         """
@@ -165,6 +169,14 @@ def __init__(
 
         self._retry_on_blocked = retry_on_blocked
 
+        self._statistics = statistics or Statistics(
+            event_manager=self._event_manager,
+            log_message=f'{logger.name} request statistics',
+        )
+
+        self._running = False
+        self._has_finished_before = False
+
     @property
     def router(self) -> Router[TCrawlingContext]:
         """The router used to handle each individual crawling request."""
@@ -180,6 +192,11 @@ def router(self, router: Router[TCrawlingContext]) -> None:
 
         self._router = router
 
+    @property
+    def statistics(self) -> Statistics[StatisticsState]:
+        """Statistics about the current (or last) crawler run."""
+        return self._statistics
+
     async def _get_session(self) -> Session | None:
         """If session pool is being used, try to take a session from it."""
         if not self._use_session_pool:
@@ -235,19 +252,43 @@ async def add_requests(
 
     async def run(self, requests: list[str | BaseRequestData] | None = None) -> FinalStatistics:
         """Run the crawler until all requests are processed."""
+        if self._running:
+            raise RuntimeError(
+                'This crawler instance is already running, you can add more requests to it via `crawler.add_requests()`'
+            )
+
+        self._running = True
+
+        if self._has_finished_before:
+            await self._statistics.reset()
+
+            if self._use_session_pool:
+                await self._session_pool.reset_store()
+
         if requests is not None:
             await self.add_requests(requests)
 
         async with AsyncExitStack() as exit_stack:
             await exit_stack.enter_async_context(self._event_manager)
             await exit_stack.enter_async_context(self._snapshotter)
+            await exit_stack.enter_async_context(self._statistics)
 
             if self._use_session_pool:
                 await exit_stack.enter_async_context(self._session_pool)
 
             await self._pool.run()
 
-        return FinalStatistics()
+        if self._statistics.error_tracker.total > 0:
+            logger.info(
+                'Error analysis:'
+                f' total_errors={self._statistics.error_tracker.total}'
+                f' unique_errors={self._statistics.error_tracker.unique_error_count}'
+            )
+
+        self._running = False
+        self._has_finished_before = True
+
+        return self._statistics.calculate()
 
     def _should_retry_request(self, crawling_context: BasicCrawlingContext, error: Exception) -> bool:
         if crawling_context.request.no_retry:
@@ -298,13 +339,13 @@ def _check_enqueue_strategy(
         if strategy == EnqueueStrategy.ALL:
             return True
 
-        assert_never()
+        assert_never(strategy)
 
     def _check_url_patterns(
         self,
         target_url: httpx.URL,
-        include: Sequence[re.Pattern | Glob] | None,
-        exclude: Sequence[re.Pattern | Glob] | None,
+        include: Sequence[re.Pattern[Any] | Glob] | None,
+        exclude: Sequence[re.Pattern[Any] | Glob] | None,
     ) -> bool:
         """Check if a URL matches configured include/exclude patterns."""
         # If the URL matches any `exclude` pattern, reject it
@@ -332,10 +373,11 @@ def _check_url_patterns(
 
     async def _handle_request_error(self, crawling_context: TCrawlingContext, error: Exception) -> None:
         request_provider = await self.get_request_provider()
+        request = crawling_context.request
 
         if self._should_retry_request(crawling_context, error):
-            request = crawling_context.request
             request.retry_count += 1
+            self._statistics.error_tracker.add(error)
 
             if self._error_handler:
                 try:
@@ -357,9 +399,11 @@ async def _handle_request_error(self, crawling_context: TCrawlingContext, error:
                 max_retries=3,
             )
             await self._handle_failed_request(crawling_context, error)
+            self._statistics.record_request_processing_failure(request.id or request.unique_key)
 
     async def _handle_failed_request(self, crawling_context: TCrawlingContext, error: Exception) -> None:
         logger.exception('Request failed and reached maximum retries', exc_info=error)
+        self._statistics.error_tracker.add(error)
 
         if self._failed_request_handler:
             try:
@@ -441,6 +485,9 @@ async def __run_task_function(self) -> None: # noqa: PLR0912
             add_requests=result.add_requests,
         )
 
+        statistics_id = request.id or request.unique_key
+        self._statistics.record_request_processing_start(statistics_id)
+
         try:
             request.state = RequestState.REQUEST_HANDLER
 
@@ -467,6 +514,8 @@ async def __run_task_function(self) -> None: # noqa: PLR0912
 
             if crawling_context.session:
                 crawling_context.session.mark_good()
+
+            self._statistics.record_request_processing_finish(statistics_id)
         except RequestHandlerError as primary_error:
             primary_error = cast(
                 RequestHandlerError[TCrawlingContext], primary_error
@@ -514,6 +563,7 @@ async def __run_task_function(self) -> None: # noqa: PLR0912
                 crawling_context.request.session_rotation_count += 1
 
                 await request_provider.reclaim_request(request)
+                self._statistics.error_tracker_retry.add(session_error)
             else:
                 logger.exception('Request failed and reached maximum retries', exc_info=session_error)
 
@@ -525,6 +575,9 @@ async def __run_task_function(self) -> None: # noqa: PLR0912
                     logger=logger,
                     max_retries=3,
                 )
+
+                self._statistics.record_request_processing_failure(statistics_id)
+                self._statistics.error_tracker.add(session_error)
         except ContextPipelineInterruptedError as interruped_error:
             logger.debug('The context pipeline was interrupted', exc_info=interruped_error)

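Taken together, `run()` now guards against concurrent runs, resets statistics and the session pool when re-run, and returns a `FinalStatistics` computed by the `Statistics` instance instead of an empty placeholder. A hedged usage sketch; the URL and handler body are illustrative, and the import paths assume this commit's package layout:

import asyncio

from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler()

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        print(f'Processing {context.request.url}')  # illustrative handler body

    # run() now returns FinalStatistics calculated by the Statistics instance.
    final_statistics = await crawler.run(['https://crawlee.dev'])
    print(final_statistics)

    # Live counters stay reachable after the run via the new `statistics` property.
    print(crawler.statistics.error_tracker.total)


asyncio.run(main())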
src/crawlee/basic_crawler/types.py

Lines changed: 0 additions & 5 deletions
@@ -68,11 +68,6 @@ class BasicCrawlingContext:
     add_requests: AddRequestsFunction
 
 
-@dataclass(frozen=True)
-class FinalStatistics:
-    """Statistics about a crawler run."""
-
-
 class AddRequestsFunctionCall(AddRequestsFunctionKwargs):
     """Record of a call to `add_requests`."""

src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ def __init__(
         super().__init__(**kwargs)
 
     async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
-        result = await self._http_client.crawl(context.request, context.session)
+        result = await self._http_client.crawl(context.request, context.session, self._statistics)
 
         yield HttpCrawlingContext(
             request=context.request,

src/crawlee/http_clients/base_http_client.py

Lines changed: 7 additions & 1 deletion
@@ -9,6 +9,7 @@
 
 from crawlee.models import Request
 from crawlee.sessions.session import Session
+from crawlee.statistics.statistics import Statistics
 
 
 class HttpResponse(Protocol):
@@ -48,7 +49,12 @@ def __init__(
         self._ignore_http_error_status_codes = set(ignore_http_error_status_codes)
 
     @abstractmethod
-    async def crawl(self, request: Request, session: Session | None) -> HttpCrawlingResult:
+    async def crawl(
+        self,
+        request: Request,
+        session: Session | None,
+        statistics: Statistics,
+    ) -> HttpCrawlingResult:
        """Perform a crawl of an URL."""
 
    @abstractmethod

src/crawlee/http_clients/httpx_client.py

Lines changed: 4 additions & 1 deletion
@@ -12,6 +12,7 @@
 
 if TYPE_CHECKING:
     from crawlee.models import Request
+    from crawlee.statistics.statistics import Statistics
 
 
 class HttpTransport(httpx.AsyncHTTPTransport):
@@ -64,7 +65,7 @@ def __init__(
         self._client = httpx.AsyncClient(transport=HttpTransport())
 
     @override
-    async def crawl(self, request: Request, session: Session | None) -> HttpCrawlingResult:
+    async def crawl(self, request: Request, session: Session | None, statistics: Statistics) -> HttpCrawlingResult:
         http_request = self._client.build_request(
             method=request.method,
             url=request.url,
@@ -81,6 +82,8 @@ async def crawl(self, request: Request, session: Session | None) -> HttpCrawling
 
             raise
 
+        statistics.register_status_code(response.status_code)
+
         exclude_error = response.status_code in self._ignore_http_error_status_codes
         include_error = response.status_code in self._additional_http_error_status_codes

src/crawlee/http_crawler/http_crawler.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ def __init__(
     async def _make_http_request(
         self, crawling_context: BasicCrawlingContext
     ) -> AsyncGenerator[HttpCrawlingContext, None]:
-        result = await self._http_client.crawl(crawling_context.request, crawling_context.session)
+        result = await self._http_client.crawl(crawling_context.request, crawling_context.session, self._statistics)
 
         yield HttpCrawlingContext(
             request=crawling_context.request,

src/crawlee/statistics/__init__.py

Whitespace-only changes.
src/crawlee/statistics/error_tracker.py

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+# Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/utils/src/internals/error_tracker.ts
+
+from __future__ import annotations
+
+from collections import Counter
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True, unsafe_hash=True)
+class ErrorGroup:
+    """Identifies a group of similar errors."""
+
+    class_name: str | None
+
+
+class ErrorTracker:
+    """Track errors and aggregates their counts by similarity."""
+
+    def __init__(self) -> None:
+        self._errors = Counter[ErrorGroup]()
+
+    def add(self, error: Exception) -> None:
+        """Include an error in the statistics."""
+        error_group = ErrorGroup(class_name=error.__class__.__name__)
+        self._errors[error_group] += 1
+
+    @property
+    def unique_error_count(self) -> int:
+        """Number of distinct kinds of errors."""
+        return len(self._errors)
+
+    @property
+    def total(self) -> int:
+        """Total number of errors."""
+        return sum(self._errors.values())

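Because the tracker groups errors solely by exception class name, repeated errors of one class collapse into a single group. A standalone sketch with illustrative values; the import path assumes the file name given above:

from crawlee.statistics.error_tracker import ErrorTracker  # path assumed, see above

tracker = ErrorTracker()
tracker.add(ValueError('bad value'))
tracker.add(ValueError('another bad value'))  # same class, counted into the same group
tracker.add(TimeoutError('too slow'))

print(tracker.total)               # 3 - every recorded error counts
print(tracker.unique_error_count)  # 2 - the ValueError and TimeoutError groups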