Commit 5c3753a

feat: Proxy configuration (#156)
### Description

- closes #136

### TODO

- [x] copy applicable tests from SDK
- [x] add the proxy info to context
- [x] use the configured proxy in HTTP clients
1 parent eeebe9b commit 5c3753a
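
For context, a minimal usage sketch of what this commit enables. Only the `proxy_configuration` keyword argument is confirmed by this diff; the `ProxyConfiguration(proxy_urls=...)` constructor and the `crawler.run(...)` call are assumptions carried over from the Apify SDK counterpart and later Crawlee releases:

```python
import asyncio

from crawlee.http_crawler import HttpCrawler
from crawlee.proxy_configuration import ProxyConfiguration


async def main() -> None:
    # proxy_urls is an assumed constructor argument; it is not part of this diff.
    proxy_configuration = ProxyConfiguration(proxy_urls=['http://user:pass@proxy.example.com:8000'])

    # proxy_configuration is the new crawler option added by this commit.
    crawler = HttpCrawler(proxy_configuration=proxy_configuration)
    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```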

14 files changed: 616 additions & 17 deletions

pyproject.toml

Lines changed: 1 addition & 0 deletions
```diff
@@ -83,6 +83,7 @@ types-beautifulsoup4 = "^4.12.0.20240229"
 types-colorama = "~0.4.15.20240106"
 types-psutil = "~5.9.5.20240205"
 types-python-dateutil = "^2.9.0.20240316"
+proxy-py = "^2.4.4"
 
 [tool.poetry.extras]
 beautifulsoup = ["beautifulsoup4", "lxml", "html5lib"]
```
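
`proxy-py` lands in the dev-dependency group, presumably to spin up a throwaway local proxy in the tests copied from the SDK. A sketch of that pattern, assuming proxy.py's documented embedded mode (the port and flags here are illustrative, not taken from this commit):

```python
import proxy

# proxy.py can run embedded as a context manager; the proxy listens for the
# duration of the `with` block and shuts down on exit.
with proxy.Proxy(['--hostname', '127.0.0.1', '--port', '8899']):
    proxy_url = 'http://127.0.0.1:8899'
    # ...point a ProxyConfiguration at proxy_url and assert requests go through it...
```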

src/crawlee/basic_crawler/basic_crawler.py

Lines changed: 32 additions & 5 deletions
```diff
@@ -2,11 +2,12 @@
 from __future__ import annotations
 
 import tempfile
+from collections.abc import AsyncGenerator, Awaitable, Sequence
 from contextlib import AsyncExitStack
 from datetime import timedelta
 from functools import partial
 from logging import getLogger
-from typing import TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, Generic, Sequence, Union, cast
+from typing import TYPE_CHECKING, Any, Callable, Generic, Union, cast
 
 import httpx
 from tldextract import TLDExtract
@@ -46,6 +47,7 @@
     import re
 
     from crawlee.http_clients.base_http_client import BaseHttpClient, HttpResponse
+    from crawlee.proxy_configuration import ProxyConfiguration, ProxyInfo
     from crawlee.sessions.session import Session
     from crawlee.statistics.models import FinalStatistics, StatisticsState
     from crawlee.storages.request_provider import RequestProvider
@@ -71,6 +73,7 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     session_pool: NotRequired[SessionPool]
     use_session_pool: NotRequired[bool]
     retry_on_blocked: NotRequired[bool]
+    proxy_configuration: NotRequired[ProxyConfiguration]
     statistics: NotRequired[Statistics[StatisticsState]]
     _context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]]
 
@@ -100,6 +103,7 @@ def __init__(
         session_pool: SessionPool | None = None,
         use_session_pool: bool = True,
         retry_on_blocked: bool = True,
+        proxy_configuration: ProxyConfiguration | None = None,
         statistics: Statistics | None = None,
         _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
     ) -> None:
@@ -119,6 +123,7 @@ def __init__(
             use_session_pool: Enables using the session pool for crawling
             session_pool: A preconfigured `SessionPool` instance if you wish to use non-default configuration
             retry_on_blocked: If set to True, the crawler will try to automatically bypass any detected bot protection
+            proxy_configuration: A HTTP proxy configuration to be used for making requests
             statistics: A preconfigured `Statistics` instance if you wish to use non-default configuration
             _context_pipeline: Allows extending the request lifecycle and modifying the crawling context.
                 This parameter is meant to be used by child classes, not when BasicCrawler is instantiated directly.
@@ -169,6 +174,7 @@ def __init__(
 
         self._retry_on_blocked = retry_on_blocked
 
+        self._proxy_configuration = proxy_configuration
         self._statistics = statistics or Statistics(
            event_manager=self._event_manager,
            log_message=f'{logger.name} request statistics',
@@ -211,6 +217,17 @@ async def _get_session(self) -> Session | None:
             logger=logger,
         )
 
+    async def _get_proxy_info(self, request: Request, session: Session | None) -> ProxyInfo | None:
+        """Retrieve a new ProxyInfo object based on crawler configuration and the current request and session."""
+        if not self._proxy_configuration:
+            return None
+
+        return await self._proxy_configuration.new_proxy_info(
+            session_id=session.id if session else None,
+            request=request,
+            proxy_tier=None,
+        )
+
     async def get_request_provider(self) -> RequestProvider:
         """Return the configured request provider. If none is configured, open and return the default request queue."""
         if not self._request_provider:
@@ -411,15 +428,23 @@ async def _handle_failed_request(self, crawling_context: TCrawlingContext, error
         except Exception as e:
             raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e
 
-    def _prepare_send_request_function(self, session: Session | None) -> SendRequestFunction:
+    def _prepare_send_request_function(
+        self,
+        session: Session | None,
+        proxy_info: ProxyInfo | None,
+    ) -> SendRequestFunction:
         async def send_request(
             url: str,
             *,
             method: str = 'get',
             headers: dict[str, str] | None = None,
         ) -> HttpResponse:
             return await self._http_client.send_request(
-                url, method=method, headers=httpx.Headers(headers), session=session
+                url,
+                method=method,
+                headers=httpx.Headers(headers),
+                session=session,
+                proxy_info=proxy_info,
             )
 
         return send_request
@@ -461,7 +486,7 @@ async def __is_task_ready_function(self) -> bool:
         request_provider = await self.get_request_provider()
         return not await request_provider.is_empty()
 
-    async def __run_task_function(self) -> None:  # noqa: PLR0912
+    async def __run_task_function(self) -> None:
         request_provider = await self.get_request_provider()
 
         request = await wait_for(
@@ -476,12 +501,14 @@ async def __run_task_function(self) -> None:  # noqa: PLR0912
             return
 
         session = await self._get_session()
+        proxy_info = await self._get_proxy_info(request, session)
         result = RequestHandlerRunResult()
 
         crawling_context = BasicCrawlingContext(
             request=request,
             session=session,
-            send_request=self._prepare_send_request_function(session),
+            proxy_info=proxy_info,
+            send_request=self._prepare_send_request_function(session, proxy_info),
             add_requests=result.add_requests,
         )
 
```
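
Note the plumbing order in `__run_task_function`: the session is resolved first, `_get_proxy_info` derives a proxy from it, and both are baked into the `send_request` closure, so a request handler gets session-and-proxy-consistent requests for free. A hedged handler-side sketch (the router registration style follows current Crawlee and the handler body is illustrative; the `send_request` signature matches the inner function in this diff):

```python
from crawlee.basic_crawler import BasicCrawler
from crawlee.basic_crawler.types import BasicCrawlingContext

crawler = BasicCrawler(proxy_configuration=proxy_configuration)  # from the earlier sketch


@crawler.router.default_handler
async def handler(context: BasicCrawlingContext) -> None:
    # send_request is pre-bound to this request's session and proxy, so the
    # handler does not pass any proxy information itself.
    response = await context.send_request('https://example.com/robots.txt', method='get')
```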

src/crawlee/basic_crawler/types.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -2,8 +2,9 @@
 from __future__ import annotations
 
 import re
+from collections.abc import Coroutine, Sequence
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Coroutine, Protocol, Sequence
+from typing import TYPE_CHECKING, Any, Protocol
 
 from typing_extensions import NotRequired, TypedDict, Unpack
 
@@ -12,6 +13,7 @@
     from crawlee.enqueue_strategy import EnqueueStrategy
     from crawlee.http_clients.base_http_client import HttpResponse
     from crawlee.models import BaseRequestData, Request
+    from crawlee.proxy_configuration import ProxyInfo
     from crawlee.sessions.session import Session
 
 
@@ -64,6 +66,7 @@ class BasicCrawlingContext:
 
     request: Request
     session: Session | None
+    proxy_info: ProxyInfo | None
     send_request: SendRequestFunction
     add_requests: AddRequestsFunction
 
```
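
With `proxy_info` on the context dataclass, a handler can also observe which proxy was chosen for the current request. Only the `url` attribute of `ProxyInfo` is demonstrably used in this commit (it is read in `httpx_client.py`), so this sketch sticks to that:

```python
import logging

logger = logging.getLogger(__name__)


@crawler.router.default_handler
async def handler(context: BasicCrawlingContext) -> None:
    if context.proxy_info is not None:
        # proxy_info.url is the one attribute this commit demonstrably reads.
        logger.info('Fetching %s via proxy %s', context.request.url, context.proxy_info.url)
```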

src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py

Lines changed: 8 additions & 1 deletion
```diff
@@ -62,11 +62,17 @@ def __init__(
         super().__init__(**kwargs)
 
     async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
-        result = await self._http_client.crawl(context.request, context.session, self._statistics)
+        result = await self._http_client.crawl(
+            context.request,
+            context.session,
+            context.proxy_info,
+            self._statistics,
+        )
 
         yield HttpCrawlingContext(
             request=context.request,
             session=context.session,
+            proxy_info=context.proxy_info,
             send_request=context.send_request,
             add_requests=context.add_requests,
             http_response=result.http_response,
@@ -128,6 +134,7 @@ async def enqueue_links(
         yield BeautifulSoupCrawlingContext(
             request=context.request,
             session=context.session,
+            proxy_info=context.proxy_info,
             send_request=context.send_request,
             add_requests=context.add_requests,
             enqueue_links=enqueue_links,
```

src/crawlee/http_clients/base_http_client.py

Lines changed: 9 additions & 1 deletion
```diff
@@ -8,6 +8,7 @@
 from httpx import Headers  # Type from `httpx` is used here because it is lightweight and convenient
 
 from crawlee.models import Request
+from crawlee.proxy_configuration import ProxyInfo
 from crawlee.sessions.session import Session
 from crawlee.statistics.statistics import Statistics
 
@@ -53,12 +54,19 @@ async def crawl(
         self,
         request: Request,
         session: Session | None,
+        proxy_info: ProxyInfo | None,
         statistics: Statistics,
     ) -> HttpCrawlingResult:
         """Perform a crawl of an URL."""
 
     @abstractmethod
     async def send_request(
-        self, url: str, *, method: str, headers: Headers | dict[str, str], session: Session | None = None
+        self,
+        url: str,
+        *,
+        method: str,
+        headers: Headers | dict[str, str],
+        session: Session | None,
+        proxy_info: ProxyInfo | None,
     ) -> HttpResponse:
         """Perform an HTTP request."""
```

src/crawlee/http_clients/httpx_client.py

Lines changed: 30 additions & 8 deletions
```diff
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Iterable, cast
+from typing import TYPE_CHECKING, Optional, cast
 
 import httpx
 from typing_extensions import override
@@ -11,9 +11,14 @@
 from crawlee.sessions.session import Session
 
 if TYPE_CHECKING:
+    from collections.abc import Iterable
+
     from crawlee.models import Request
+    from crawlee.proxy_configuration import ProxyInfo
     from crawlee.statistics.statistics import Statistics
 
+__all__ = ['HttpxClient']
+
 
 class HttpTransport(httpx.AsyncHTTPTransport):
     """A modified HTTP transport adapter that stores response cookies in a `Session` instead of the httpx client."""
@@ -62,11 +67,25 @@ def __init__(
             additional_http_error_status_codes=additional_http_error_status_codes,
             ignore_http_error_status_codes=ignore_http_error_status_codes,
         )
-        self._client = httpx.AsyncClient(transport=HttpTransport())
+
+        self._client_by_proxy_url = dict[Optional[str], httpx.AsyncClient]()
+
+    def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
+        if proxy_url not in self._client_by_proxy_url:
+            self._client_by_proxy_url[proxy_url] = httpx.AsyncClient(transport=HttpTransport(), proxy=proxy_url)
+
+        return self._client_by_proxy_url[proxy_url]
 
     @override
-    async def crawl(self, request: Request, session: Session | None, statistics: Statistics) -> HttpCrawlingResult:
-        http_request = self._client.build_request(
+    async def crawl(
+        self,
+        request: Request,
+        session: Session | None,
+        proxy_info: ProxyInfo | None,
+        statistics: Statistics,
+    ) -> HttpCrawlingResult:
+        client = self._get_client(proxy_info.url if proxy_info else None)
+        http_request = client.build_request(
             method=request.method,
             url=request.url,
             headers=request.headers,
@@ -75,7 +94,7 @@ async def crawl(self, request: Request, session: Session | None, statistics: Statistics) -> HttpCrawlingResult:
         )
 
         try:
-            response = await self._client.send(http_request, follow_redirects=True)
+            response = await client.send(http_request, follow_redirects=True)
         except httpx.TransportError as e:
             if _is_proxy_error(e):
                 raise ProxyError from e
@@ -110,17 +129,20 @@ async def send_request(
         *,
         method: str,
         headers: httpx.Headers | dict[str, str],
-        session: Session | None = None,
+        session: Session | None,
+        proxy_info: ProxyInfo | None,
     ) -> HttpResponse:
-        http_request = self._client.build_request(
+        client = self._get_client(proxy_info.url if proxy_info else None)
+
+        http_request = client.build_request(
             url=url,
             method=method,
             headers=headers,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
         )
 
         try:
-            response = await self._client.send(http_request)
+            response = await client.send(http_request)
         except httpx.TransportError as e:
             if _is_proxy_error(e):
                 raise ProxyError from e
```
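
The per-proxy client cache is the core design move here: httpx fixes the proxy at `AsyncClient` construction time (the diff passes `proxy=proxy_url` to the constructor, and `send()` has no proxy parameter), so the client keeps one `AsyncClient` per proxy URL, with the `None` key standing for the direct connection. `Optional` is imported because `dict[Optional[str], ...]` is evaluated at runtime, where the `str | None` syntax would require Python 3.10. The same pattern in isolation, as a standalone sketch:

```python
import httpx


class ClientPool:
    """Cache one httpx.AsyncClient per proxy URL; the None key means no proxy."""

    def __init__(self) -> None:
        self._clients: dict[str | None, httpx.AsyncClient] = {}

    def get(self, proxy_url: str | None) -> httpx.AsyncClient:
        if proxy_url not in self._clients:
            # The proxy is a constructor-level setting in httpx; it cannot be
            # swapped per request, which is why the pool exists at all.
            self._clients[proxy_url] = httpx.AsyncClient(proxy=proxy_url)
        return self._clients[proxy_url]
```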

src/crawlee/http_crawler/http_crawler.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -51,11 +51,17 @@ def __init__(
     async def _make_http_request(
         self, crawling_context: BasicCrawlingContext
     ) -> AsyncGenerator[HttpCrawlingContext, None]:
-        result = await self._http_client.crawl(crawling_context.request, crawling_context.session, self._statistics)
+        result = await self._http_client.crawl(
+            crawling_context.request,
+            crawling_context.session,
+            crawling_context.proxy_info,
+            self._statistics,
+        )
 
         yield HttpCrawlingContext(
             request=crawling_context.request,
             session=crawling_context.session,
+            proxy_info=crawling_context.proxy_info,
             send_request=crawling_context.send_request,
             add_requests=crawling_context.add_requests,
             http_response=result.http_response,
```

src/crawlee/models.py

Lines changed: 24 additions & 0 deletions
```diff
@@ -167,6 +167,26 @@ def enqueue_strategy(self, new_enqueue_strategy: EnqueueStrategy) -> None:
         self.user_data.setdefault('__crawlee', {})
         self.user_data['__crawlee']['enqueueStrategy'] = str(new_enqueue_strategy)
 
+    @property
+    def last_proxy_tier(self) -> int | None:
+        """The last proxy tier used to process the request."""
+        return self.crawlee_data.last_proxy_tier
+
+    @last_proxy_tier.setter
+    def last_proxy_tier(self, new_value: int) -> None:
+        self.user_data.setdefault('__crawlee', {})
+        self.user_data['__crawlee']['lastProxyTier'] = new_value
+
+    @property
+    def forefront(self) -> bool:
+        """Should the request be enqueued at the start of the queue?"""
+        return self.crawlee_data.forefront
+
+    @forefront.setter
+    def forefront(self, new_value: bool) -> None:
+        self.user_data.setdefault('__crawlee', {})
+        self.user_data['__crawlee']['forefront'] = new_value
+
 
 class RequestState(Enum):
     """Crawlee-specific request handling state."""
@@ -197,6 +217,10 @@ class CrawleeRequestData(BaseModel):
 
     skip_navigation: Annotated[bool, Field(alias='skipNavigation')] = False
 
+    last_proxy_tier: Annotated[int | None, Field(alias='lastProxyTier')] = None
+
+    forefront: Annotated[bool, Field()] = False
+
 
 class BaseStorageMetadata(BaseModel):
     """Base model for storage metadata."""
```
