22from __future__ import annotations
33
44import tempfile
5+ from collections .abc import AsyncGenerator , Awaitable , Sequence
56from contextlib import AsyncExitStack
67from datetime import timedelta
78from functools import partial
89from logging import getLogger
9- from typing import TYPE_CHECKING , Any , AsyncGenerator , Awaitable , Callable , Generic , Sequence , Union , cast
10+ from typing import TYPE_CHECKING , Any , Callable , Generic , Union , cast
1011
1112import httpx
1213from tldextract import TLDExtract
4647 import re
4748
4849 from crawlee .http_clients .base_http_client import BaseHttpClient , HttpResponse
50+ from crawlee .proxy_configuration import ProxyConfiguration , ProxyInfo
4951 from crawlee .sessions .session import Session
5052 from crawlee .statistics .models import FinalStatistics , StatisticsState
5153 from crawlee .storages .request_provider import RequestProvider
@@ -71,6 +73,7 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
7173 session_pool : NotRequired [SessionPool ]
7274 use_session_pool : NotRequired [bool ]
7375 retry_on_blocked : NotRequired [bool ]
76+ proxy_configuration : NotRequired [ProxyConfiguration ]
7477 statistics : NotRequired [Statistics [StatisticsState ]]
7578 _context_pipeline : NotRequired [ContextPipeline [TCrawlingContext ]]
7679
@@ -100,6 +103,7 @@ def __init__(
100103 session_pool : SessionPool | None = None ,
101104 use_session_pool : bool = True ,
102105 retry_on_blocked : bool = True ,
106+ proxy_configuration : ProxyConfiguration | None = None ,
103107 statistics : Statistics | None = None ,
104108 _context_pipeline : ContextPipeline [TCrawlingContext ] | None = None ,
105109 ) -> None :
@@ -119,6 +123,7 @@ def __init__(
119123 use_session_pool: Enables using the session pool for crawling
120124 session_pool: A preconfigured `SessionPool` instance if you wish to use non-default configuration
121125 retry_on_blocked: If set to True, the crawler will try to automatically bypass any detected bot protection
126+ proxy_configuration: An HTTP proxy configuration to be used for making requests
122127 statistics: A preconfigured `Statistics` instance if you wish to use non-default configuration
123128 _context_pipeline: Allows extending the request lifecycle and modifying the crawling context.
124129 This parameter is meant to be used by child classes, not when BasicCrawler is instantiated directly.
@@ -169,6 +174,7 @@ def __init__(
169174
170175 self ._retry_on_blocked = retry_on_blocked
171176
177+ self ._proxy_configuration = proxy_configuration
172178 self ._statistics = statistics or Statistics (
173179 event_manager = self ._event_manager ,
174180 log_message = f'{ logger .name } request statistics' ,
@@ -211,6 +217,17 @@ async def _get_session(self) -> Session | None:
211217 logger = logger ,
212218 )
213219
async def _get_proxy_info(self, request: Request, session: Session | None) -> ProxyInfo | None:
    """Retrieve a new ProxyInfo object based on crawler configuration and the current request and session.

    Args:
        request: The request about to be processed; forwarded to the proxy configuration.
        session: The session bound to the request, if any. Its `id` is forwarded as the session ID.

    Returns:
        Whatever `new_proxy_info` of the configured proxy configuration returns, or None when
        the crawler has no proxy configuration.
    """
    # Compare against None explicitly: a configured object must never be skipped
    # merely because it happens to evaluate as falsy.
    if self._proxy_configuration is None:
        return None

    # Same reasoning for the session - only a missing session yields no session ID.
    session_id = None if session is None else session.id

    return await self._proxy_configuration.new_proxy_info(
        session_id=session_id,
        request=request,
        proxy_tier=None,  # no proxy tier is requested from here
    )
230+
214231 async def get_request_provider (self ) -> RequestProvider :
215232 """Return the configured request provider. If none is configured, open and return the default request queue."""
216233 if not self ._request_provider :
@@ -411,15 +428,23 @@ async def _handle_failed_request(self, crawling_context: TCrawlingContext, error
411428 except Exception as e :
412429 raise UserDefinedErrorHandlerError ('Exception thrown in user-defined failed request handler' ) from e
413430
def _prepare_send_request_function(
    self,
    session: Session | None,
    proxy_info: ProxyInfo | None,
) -> SendRequestFunction:
    """Build the `send_request` helper bound to the given session and proxy info.

    Args:
        session: Passed through unchanged to every HTTP client call made by the returned function.
        proxy_info: Passed through unchanged to every HTTP client call made by the returned function.

    Returns:
        An async function that sends a single request through the crawler's HTTP client.
    """

    async def send_request(
        url: str,
        *,
        method: str = 'get',
        headers: dict[str, str] | None = None,
    ) -> HttpResponse:
        # Resolve the client method at call time, then wrap the plain headers
        # mapping (or None) in `httpx.Headers` before handing it over.
        dispatch = self._http_client.send_request
        call_options = {
            'method': method,
            'headers': httpx.Headers(headers),
            'session': session,
            'proxy_info': proxy_info,
        }
        return await dispatch(url, **call_options)

    return send_request
@@ -461,7 +486,7 @@ async def __is_task_ready_function(self) -> bool:
461486 request_provider = await self .get_request_provider ()
462487 return not await request_provider .is_empty ()
463488
464- async def __run_task_function (self ) -> None : # noqa: PLR0912
489+ async def __run_task_function (self ) -> None :
465490 request_provider = await self .get_request_provider ()
466491
467492 request = await wait_for (
@@ -476,12 +501,14 @@ async def __run_task_function(self) -> None: # noqa: PLR0912
476501 return
477502
478503 session = await self ._get_session ()
504+ proxy_info = await self ._get_proxy_info (request , session )
479505 result = RequestHandlerRunResult ()
480506
481507 crawling_context = BasicCrawlingContext (
482508 request = request ,
483509 session = session ,
484- send_request = self ._prepare_send_request_function (session ),
510+ proxy_info = proxy_info ,
511+ send_request = self ._prepare_send_request_function (session , proxy_info ),
485512 add_requests = result .add_requests ,
486513 )
487514
0 commit comments