feat: Add opt-in per-domain request throttling for HTTP 429 backoff #1762
base: master
Changes from all commits
**New file:** `code_examples/request_throttling/throttling_example.py` (`@@ -0,0 +1,41 @@`; the path follows the import in the docs page below):

```python
import asyncio

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext
from crawlee.request_loaders import ThrottlingRequestManager
from crawlee.storages import RequestQueue


async def main() -> None:
    # Open the default request queue.
    queue = await RequestQueue.open()

    # Wrap it with ThrottlingRequestManager for specific domains.
    # The throttler uses the same storage backend as the underlying queue.
    throttler = ThrottlingRequestManager(
        queue,
        domains=['api.example.com', 'slow-site.org'],
    )

    # Pass the throttler as the crawler's request manager.
    crawler = BasicCrawler(request_manager=throttler)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    # Add requests. Listed domains are routed directly to their
    # throttled sub-queues. Others go to the main queue.
    await throttler.add_requests(
        [
            'https://api.example.com/data',
            'https://api.example.com/users',
            'https://slow-site.org/page1',
            'https://fast-site.com/page1',  # Not throttled
        ]
    )

    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
```
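Because the throttler wraps the queue rather than replacing it, the `fast-site.com` request above flows through the ordinary `RequestQueue` path untouched; throttling stays strictly opt-in and per-domain.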
**New file:** documentation guide (front-matter id `request-throttling`, `@@ -0,0 +1,47 @@`):

```mdx
---
id: request-throttling
title: Request throttling
description: How to throttle requests per domain using the ThrottlingRequestManager.
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import ThrottlingExample from '!!raw-loader!roa-loader!./code_examples/request_throttling/throttling_example.py';

When crawling websites that enforce rate limits (HTTP 429) or specify `crawl-delay` in their `robots.txt`, you need a way to throttle requests per domain without blocking unrelated domains. The <ApiLink to="class/ThrottlingRequestManager">`ThrottlingRequestManager`</ApiLink> provides exactly this.

## Overview

The <ApiLink to="class/ThrottlingRequestManager">`ThrottlingRequestManager`</ApiLink> wraps a <ApiLink to="class/RequestManager">`RequestManager`</ApiLink> (typically a <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>) and manages per-domain throttling. You specify which domains to throttle at initialization, and the manager automatically:

- **Routes requests** for listed domains into dedicated sub-queues at insertion time.
- **Enforces delays** from HTTP 429 responses (exponential backoff) and `robots.txt` crawl-delay directives.
- **Schedules fairly** by fetching from the domain that has been waiting the longest.
- **Sleeps intelligently** when all configured domains are throttled, instead of busy-waiting.

Requests for domains **not** in the configured list pass through to the main queue without any throttling.

## Basic usage

To use request throttling, create a <ApiLink to="class/ThrottlingRequestManager">`ThrottlingRequestManager`</ApiLink> with the domains you want to throttle and pass it as the `request_manager` to your crawler:

<RunnableCodeBlock className="language-python" language="python">
    {ThrottlingExample}
</RunnableCodeBlock>

## How it works

1. **Insertion-time routing**: When you add requests via `add_request` or `add_requests`, each request is checked against the configured domain list. Matching requests go directly into a per-domain sub-queue; all others go to the main queue. This eliminates request duplication entirely.

2. **429 backoff**: When the crawler detects an HTTP 429 response, the `ThrottlingRequestManager` records an exponential backoff delay for that domain (starting at 2s, doubling up to 60s). If the response includes a `Retry-After` header, that value takes priority.

3. **Crawl-delay**: If `robots.txt` specifies a `crawl-delay`, the manager enforces a minimum interval between requests to that domain.

4. **Fair scheduling**: `fetch_next_request` sorts available sub-queues by how long each domain has been waiting, ensuring no domain is starved.

:::tip

The `ThrottlingRequestManager` is an opt-in feature. If you don't pass it to your crawler, requests are processed normally without any per-domain throttling.

:::
```
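To make the backoff and scheduling rules in the guide concrete, here is a minimal, self-contained sketch of the bookkeeping it describes. Every name here (`DomainThrottle`, `pick_next`, the field names) is an illustrative assumption, not the actual `ThrottlingRequestManager` internals:

```python
import time
from datetime import timedelta


class DomainThrottle:
    """Illustrative per-domain throttle state (hypothetical, not library code)."""

    INITIAL_BACKOFF = 2.0  # docs: 429 backoff starts at 2 s...
    MAX_BACKOFF = 60.0     # ...doubling up to a 60 s cap

    def __init__(self, crawl_delay: float = 0.0) -> None:
        self.backoff = 0.0                # current 429 backoff; 0 means none recorded
        self.crawl_delay = crawl_delay    # minimum interval from robots.txt
        self.ready_at = time.monotonic()  # when the next fetch is allowed

    def record_429(self, retry_after: timedelta | None = None) -> None:
        # A Retry-After header takes priority over the exponential backoff.
        if retry_after is not None:
            delay = retry_after.total_seconds()
        else:
            self.backoff = min(max(self.backoff * 2, self.INITIAL_BACKOFF), self.MAX_BACKOFF)
            delay = self.backoff
        self.ready_at = time.monotonic() + delay

    def record_fetch(self) -> None:
        # After a successful fetch, enforce the robots.txt crawl-delay.
        self.ready_at = time.monotonic() + self.crawl_delay

    def wait_time(self) -> float:
        return max(0.0, self.ready_at - time.monotonic())


def pick_next(throttles: dict[str, DomainThrottle]) -> str | None:
    """Fair scheduling: among ready domains, pick the one that has waited the
    longest (smallest ready_at). Returns None when every domain is throttled,
    so the caller can sleep for min(wait_time) instead of busy-waiting."""
    ready = [d for d, t in throttles.items() if t.wait_time() == 0.0]
    if not ready:
        return None
    return min(ready, key=lambda d: throttles[d].ready_at)
```

The constants mirror the documented defaults; unlike this in-memory sketch, the real manager keeps its sub-queues in the same storage backend as the wrapped queue.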
**New file:** `crawlee/_utils/http.py` (`@@ -0,0 +1,41 @@`; the path matches the import added to `BasicCrawler` below):

```python
"""HTTP utility functions for Crawlee."""

from __future__ import annotations

from datetime import datetime, timedelta, timezone
from email.utils import parsedate_to_datetime


def parse_retry_after_header(value: str | None) -> timedelta | None:
    """Parse the Retry-After HTTP header value.

    The header can contain either a number of seconds or an HTTP-date.
    See: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Retry-After

    Args:
        value: The raw Retry-After header value.

    Returns:
        A timedelta representing the delay, or None if the header is missing or unparsable.
    """
    if not value:
        return None

    # Try parsing as integer seconds first.
    try:
        seconds = int(value)
        return timedelta(seconds=seconds)
    except ValueError:
        pass

    # Try parsing as HTTP-date (e.g., "Wed, 21 Oct 2015 07:28:00 GMT").
    try:
        retry_date = parsedate_to_datetime(value)
        delay = retry_date - datetime.now(retry_date.tzinfo or timezone.utc)
        if delay.total_seconds() > 0:
            return delay
    except (ValueError, TypeError):
        pass

    return None
```
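With this PR applied, the helper is importable as `crawlee._utils.http.parse_retry_after_header` (the crawler diff below adds exactly that import), and its behavior is easy to check in isolation:

```python
from datetime import timedelta

from crawlee._utils.http import parse_retry_after_header

# Integer form: the delay expressed directly in seconds.
assert parse_retry_after_header('120') == timedelta(seconds=120)

# HTTP-date form: computed relative to now, so a date in the past yields None.
assert parse_retry_after_header('Wed, 21 Oct 2015 07:28:00 GMT') is None

# Missing or unparsable values also yield None.
assert parse_retry_after_header(None) is None
assert parse_retry_after_header('soon') is None
```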
**Modified file:** the `BasicCrawler` implementation (exact path not shown in the extracted diff). Two import additions wire in the new helpers:

```diff
@@ -45,6 +45,7 @@
 )
 from crawlee._utils.docs import docs_group
 from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
+from crawlee._utils.http import parse_retry_after_header
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -63,6 +64,7 @@
 )
 from crawlee.events._types import Event, EventCrawlerStatusData
 from crawlee.http_clients import ImpitHttpClient
+from crawlee.request_loaders import ThrottlingRequestManager
 from crawlee.router import Router
 from crawlee.sessions import SessionPool
 from crawlee.statistics import Statistics, StatisticsState
```
In `run`, the crawler now warns when `respect_robots_txt_file` is enabled without a `ThrottlingRequestManager`, and purging handles both manager types:

```diff
@@ -700,19 +702,30 @@ async def run(
         self._running = True

+        if self._respect_robots_txt_file and not isinstance(self._request_manager, ThrottlingRequestManager):
+            self._logger.warning(
+                'The `respect_robots_txt_file` option is enabled, but the crawler is not using '
+                '`ThrottlingRequestManager`. Crawl-delay directives from robots.txt will not be '
+                'enforced. To enable crawl-delay support, configure the crawler to use '
+                '`ThrottlingRequestManager` as the request manager.'
+            )
+
         if self._has_finished_before:
             await self._statistics.reset()

         if self._use_session_pool:
             await self._session_pool.reset_store()

         request_manager = await self.get_request_manager()
-        if purge_request_queue and isinstance(request_manager, RequestQueue):
-            await request_manager.drop()
-            self._request_manager = await RequestQueue.open(
-                storage_client=self._service_locator.get_storage_client(),
-                configuration=self._service_locator.get_configuration(),
-            )
+        if purge_request_queue:
+            if isinstance(request_manager, RequestQueue):
+                await request_manager.drop()
+                self._request_manager = await RequestQueue.open(
+                    storage_client=self._service_locator.get_storage_client(),
+                    configuration=self._service_locator.get_configuration(),
+                )
+            elif isinstance(request_manager, ThrottlingRequestManager):
+                self._request_manager = await request_manager.recreate_purged()

         if requests is not None:
             await self.add_requests(requests)
```
The session-blocking check gains keyword-only parameters so 429 responses can feed the per-domain backoff (a review thread on this hunk was marked as resolved by janbuchar):

```diff
@@ -1542,16 +1555,36 @@ def _raise_for_error_status_code(self, status_code: int) -> None:
         if is_status_code_server_error(status_code) and not is_ignored_status:
             raise HttpStatusCodeError('Error status code returned', status_code)

-    def _raise_for_session_blocked_status_code(self, session: Session | None, status_code: int) -> None:
+    def _raise_for_session_blocked_status_code(
+        self,
+        session: Session | None,
+        status_code: int,
+        *,
+        request_url: str = '',
+        retry_after_header: str | None = None,
+    ) -> None:
+        """Raise an exception if the given status code indicates the session is blocked.
+
+        If the status code is 429 (Too Many Requests), the domain is recorded as
+        rate-limited in the `ThrottlingRequestManager` for per-domain backoff.
+
+        Args:
+            session: The session used for the request. If None, no check is performed.
+            status_code: The HTTP status code to check.
+            request_url: The request URL, used for per-domain rate limit tracking.
+            retry_after_header: The value of the Retry-After response header, if present.
+
+        Raises:
+            SessionError: If the status code indicates the session is blocked.
+        """
+        if status_code == 429 and request_url:  # noqa: PLR2004
+            retry_after = parse_retry_after_header(retry_after_header)
+
+            # _request_manager might not be initialized yet if called directly or early,
+            # but usually it's set in get_request_manager().
+            if isinstance(self._request_manager, ThrottlingRequestManager):
+                self._request_manager.record_domain_delay(request_url, retry_after=retry_after)
+
         if session is not None and session.is_blocked_status_code(
             status_code=status_code,
             ignore_http_error_status_codes=self._ignore_http_error_status_codes,
```
And the robots.txt check now wires crawl-delay into the throttler:

```diff
@@ -1582,7 +1615,16 @@ async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool:
         if not self._respect_robots_txt_file:
             return True
         robots_txt_file = await self._get_robots_txt_file_for_url(url)
-        return not robots_txt_file or robots_txt_file.is_allowed(url)
+        if not robots_txt_file:
+            return True
+
+        # Wire robots.txt crawl-delay into ThrottlingRequestManager
+        if isinstance(self._request_manager, ThrottlingRequestManager):
+            crawl_delay = robots_txt_file.get_crawl_delay()
+            if crawl_delay is not None:
+                self._request_manager.set_crawl_delay(url, crawl_delay)
+
+        return robots_txt_file.is_allowed(url)

     async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None:
         """Get the RobotsTxtFile for a given URL.
```

Review comments on lines +1622 to +1625 (the crawl-delay wiring):

> **Collaborator:** IIUC, this is called for every request, but it's redundant after the first call for a given domain. Could you improve that? (caching or checking if it was already set)
>
> **Collaborator:** @MrAliHasan I believe this is still unresolved?
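One way to address the redundancy flagged above is a small per-domain guard so the wiring runs at most once per domain. This is a hypothetical sketch, not code from the PR:

```python
from urllib.parse import urlparse


class CrawlDelayCache:
    """Remember which domains already had their crawl-delay wired, so the
    per-request robots.txt hook becomes a no-op after the first call."""

    def __init__(self) -> None:
        self._seen_domains: set[str] = set()

    def should_wire(self, url: str) -> bool:
        # Returns True only the first time a domain is seen.
        domain = urlparse(url).netloc
        if domain in self._seen_domains:
            return False
        self._seen_domains.add(domain)
        return True


cache = CrawlDelayCache()
assert cache.should_wire('https://api.example.com/data') is True
assert cache.should_wire('https://api.example.com/users') is False  # already wired
```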
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Even in the state before the change, this was a code smell - shouldn't we add a "purge_on_start_hook"-like abstract method to
RequestManagerand implement it inRequestQueue? Or should we just call.dropon request manager?This is aimed mostly at @vdusek and @Pijukatel. We definitely don't need to resolve it in this PR if you guys don't see an obvious way out.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Understood, happy to leave this for a follow-up.