
Commit f6fcf48

fix: adopt new version of curl-cffi (#543)
### Description

- Version 0.7.2 of `curl-cffi` introduces breaking changes.
- This update adopts the new version, adds type aliases for `Request` fields, and incorporates other minor changes from PR #542.

### Issues

- N/A (ad-hoc fix)

### Testing

- The current set of unit tests should cover the changes.

### Checklist

- [x] CI passed
1 parent 8a3d369 commit f6fcf48

15 files changed

Lines changed: 76 additions & 40 deletions


pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -49,7 +49,7 @@ apify = { version = ">=2.0.0", optional = true }
 beautifulsoup4 = { version = ">=4.12.0", optional = true }
 colorama = ">=0.4.0"
 cookiecutter = ">=2.6.0"
-curl-cffi = { version = ">=0.7.0", optional = true }
+curl-cffi = { version = ">=0.7.2", optional = true }
 docutils = ">=0.21.0"
 eval-type-backport = ">=0.2.0"
 html5lib = { version = ">=1.0", optional = true }
```
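The dependency floor moves from 0.7.0 to 0.7.2, the `curl-cffi` release that introduced the breaking changes this commit adapts to. As a quick post-upgrade sanity check, a sketch like the following can verify the installed version (it assumes a plain `X.Y.Z` version string):

```python
# A minimal sketch: confirm the installed curl-cffi satisfies the new >=0.7.2 floor.
from importlib.metadata import version

installed = version('curl-cffi')
parts = tuple(int(p) for p in installed.split('.')[:3])  # assumes a plain X.Y.Z string
assert parts >= (0, 7, 2), f'curl-cffi {installed} is older than the required 0.7.2'
```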

src/crawlee/_request.py

Lines changed: 11 additions & 8 deletions
```diff
@@ -20,7 +20,7 @@
 )
 from typing_extensions import Self
 
-from crawlee._types import EnqueueStrategy, HttpMethod
+from crawlee._types import EnqueueStrategy, HttpMethod, HttpPayload, HttpQueryParams
 from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
 from crawlee._utils.urls import extract_query_params, validate_http_url
 
@@ -117,14 +117,17 @@ class BaseRequestData(BaseModel):
     """
 
     method: HttpMethod = 'GET'
+    """HTTP request method."""
 
-    payload: str | None = None
+    headers: Annotated[dict[str, str], Field(default_factory=dict)] = {}
+    """HTTP request headers."""
 
-    headers: Annotated[dict[str, str] | None, Field(default_factory=dict)] = None
+    query_params: Annotated[HttpQueryParams, Field(alias='queryParams', default_factory=dict)] = {}
+    """URL query parameters."""
 
-    query_params: Annotated[dict[str, Any] | None, Field(default_factory=dict)] = None
+    payload: HttpPayload | None = None
 
-    data: Annotated[dict[str, Any] | None, Field(default_factory=dict)] = None
+    data: Annotated[dict[str, Any], Field(default_factory=dict)] = {}
 
     user_data: Annotated[
         dict[str, JsonValue],  # Internally, the model contains `UserData`, this is just for convenience
@@ -139,7 +142,7 @@ class BaseRequestData(BaseModel):
                 exclude_defaults=True,
             )
         ),
-    ]
+    ] = {}
     """Custom user data assigned to the request. Use this to save any request related data to the
     request's scope, keeping them accessible on retries, failures etc.
     """
@@ -158,7 +161,7 @@ def from_url(
         url: str,
         *,
         method: HttpMethod = 'GET',
-        payload: str | None = None,
+        payload: HttpPayload | None = None,
         label: str | None = None,
         unique_key: str | None = None,
         id: str | None = None,
@@ -232,7 +235,7 @@ def from_url(
         url: str,
         *,
         method: HttpMethod = 'GET',
-        payload: str | None = None,
+        payload: HttpPayload | None = None,
         label: str | None = None,
         unique_key: str | None = None,
         id: str | None = None,
```
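With `payload` retyped to `HttpPayload` (`str | bytes`) and the header/query fields defaulting to empty dicts rather than `None`, request construction type-checks cleanly for binary bodies. A hedged sketch, using the module path shown in this diff's imports:

```python
# A hedged usage sketch of the retyped fields; not part of the PR itself.
from crawlee._request import Request

request = Request.from_url(
    'https://example.com/api/items',
    method='POST',
    payload=b'{"query": "crawlee"}',  # HttpPayload admits bytes as well as str
)
print(request.headers)       # {} -- an empty dict now, no longer None
print(request.query_params)  # {} -- likewise, serialized under the `queryParams` alias
```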

src/crawlee/_types.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from collections.abc import Coroutine, Iterator, Mapping, Sequence
+from collections.abc import Mapping
 from dataclasses import dataclass, field
 from enum import Enum
 from typing import TYPE_CHECKING, Any, Literal, Protocol, Union
@@ -10,6 +10,7 @@
 if TYPE_CHECKING:
     import logging
     import re
+    from collections.abc import Coroutine, Iterator, Sequence
 
     from crawlee import Glob
     from crawlee._request import BaseRequestData, Request
@@ -26,6 +27,10 @@
 
 HttpMethod: TypeAlias = Literal['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 'PATCH']
 
+HttpQueryParams: TypeAlias = dict[str, str]
+
+HttpPayload: TypeAlias = Union[str, bytes]
+
 
 class EnqueueStrategy(str, Enum):
     """Strategy for deciding which links should be followed and which ones should be ignored."""
```

src/crawlee/_utils/http.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 def is_status_code_error(value: int) -> bool:
     """Returns `True` for 4xx or 5xx status codes, `False` otherwise."""
     return is_status_code_client_error(value) or is_status_code_server_error(value)
```
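Only the `from __future__ import annotations` header is new here; the helper's documented contract is unchanged:

```python
# Quick check of the contract stated in the docstring: True only for 4xx/5xx.
from crawlee._utils.http import is_status_code_error

assert is_status_code_error(404)
assert is_status_code_error(503)
assert not is_status_code_error(301)
```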

src/crawlee/_utils/requests.py

Lines changed: 8 additions & 4 deletions
```diff
@@ -4,10 +4,14 @@
 from base64 import b64encode
 from hashlib import sha256
 from logging import getLogger
+from typing import TYPE_CHECKING
 from urllib.parse import parse_qsl, urlencode, urlparse
 
 from crawlee._utils.crypto import compute_short_hash
 
+if TYPE_CHECKING:
+    from crawlee._types import HttpMethod, HttpPayload
+
 logger = getLogger(__name__)
 
 
@@ -82,22 +86,22 @@ def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:
 
 def compute_unique_key(
     url: str,
-    method: str = 'GET',
-    payload: str | bytes | None = None,
+    method: HttpMethod = 'GET',
+    payload: HttpPayload | None = None,
     *,
     keep_url_fragment: bool = False,
     use_extended_unique_key: bool = False,
 ) -> str:
     """Computes a unique key for caching & deduplication of requests.
 
     This function computes a unique key by normalizing the provided URL and method.
-    If 'use_extended_unique_key' is True and a payload is provided, the payload is hashed and
+    If `use_extended_unique_key` is True and a payload is provided, the payload is hashed and
     included in the key. Otherwise, the unique key is just the normalized URL.
 
     Args:
         url: The request URL.
         method: The HTTP method, defaults to 'GET'.
-        payload: The request payload, defaults to None.
+        payload: The data to be sent as the request body, defaults to None.
         keep_url_fragment: A flag indicating whether to keep the URL fragment, defaults to False.
         use_extended_unique_key: A flag indicating whether to include a hashed payload in the key, defaults to False.
```
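Only annotations and docstrings change here, so runtime behavior is identical; the `TYPE_CHECKING` guard keeps the `crawlee._types` import out of the module's import time. A hedged usage sketch (the exact key format stays an internal detail of the library):

```python
# A hedged sketch; the printed key's format is illustrative, not specified here.
from crawlee._utils.requests import compute_unique_key

key = compute_unique_key(
    'https://example.com/search?q=crawlee#results',
    method='POST',
    payload=b'{"page": 1}',
    use_extended_unique_key=True,  # fold a hash of the payload into the key
)
print(key)
```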

src/crawlee/fingerprint_suite/_consts.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 # ruff: noqa: E501
 
 COMMON_ACCEPT = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
```

src/crawlee/http_clients/_base.py

Lines changed: 6 additions & 2 deletions
```diff
@@ -2,15 +2,15 @@
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Protocol
+from typing import TYPE_CHECKING, Any, Protocol
 
 from crawlee._utils.http import is_status_code_error
 from crawlee.errors import HttpStatusCodeError
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
-    from crawlee._types import HttpHeaders, HttpMethod
+    from crawlee._types import HttpHeaders, HttpMethod, HttpQueryParams
     from crawlee.base_storage_client._models import Request
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.sessions import Session
@@ -114,6 +114,8 @@ async def send_request(
         *,
         method: HttpMethod = 'GET',
         headers: HttpHeaders | None = None,
+        query_params: HttpQueryParams | None = None,
+        data: dict[str, Any] | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
     ) -> HttpResponse:
@@ -125,6 +127,8 @@ async def send_request(
             url: The URL to send the request to.
             method: The HTTP method to use.
             headers: The headers to include in the request.
+            query_params: The query parameters to include in the request.
+            data: The data to be sent as the request body.
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
```
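Promoting `query_params` and `data` into the base signature means every client implementation now accepts them uniformly. A hedged sketch of a call through the httpx-backed client, assuming `HttpxHttpClient` is exported from `crawlee.http_clients` and that the returned response exposes `status_code`:

```python
# A hedged sketch of the widened send_request signature in use.
import asyncio

from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    client = HttpxHttpClient()
    response = await client.send_request(
        'https://httpbin.org/post',
        method='POST',
        query_params={'source': 'crawlee'},  # newly part of the base protocol
        data={'q': 'example'},               # newly part of the base protocol
    )
    print(response.status_code)


asyncio.run(main())
```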

src/crawlee/http_clients/_httpx.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -16,7 +16,7 @@
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
-    from crawlee._types import HttpMethod
+    from crawlee._types import HttpMethod, HttpQueryParams
     from crawlee.base_storage_client._models import Request
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.statistics import Statistics
@@ -166,7 +166,7 @@ async def send_request(
         *,
         method: HttpMethod = 'GET',
         headers: HttpHeaders | None = None,
-        query_params: dict[str, Any] | None = None,
+        query_params: HttpQueryParams | None = None,
         data: dict[str, Any] | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
```

src/crawlee/http_clients/curl_impersonate.py

Lines changed: 13 additions & 13 deletions
```diff
@@ -4,8 +4,9 @@
 
 try:
     from curl_cffi.requests import AsyncSession
-    from curl_cffi.requests.errors import RequestsError
-    from curl_cffi.requests.impersonate import BrowserType
+    from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
+    from curl_cffi.requests.exceptions import RequestException as CurlRequestError
+    from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
 except ImportError as exc:
     raise ImportError(
         "To import anything from this subpackage, you need to install the 'curl-impersonate' extra."
@@ -24,7 +25,7 @@
 
     from curl_cffi.requests import Response
 
-    from crawlee._types import HttpHeaders, HttpMethod
+    from crawlee._types import HttpHeaders, HttpMethod, HttpQueryParams
     from crawlee.base_storage_client._models import Request
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.sessions import Session
@@ -116,14 +117,14 @@ async def crawl(
         try:
             response = await client.request(
                 url=request.url,
-                method=request.method.upper(),  # curl-cffi requires uppercase method
+                method=request.method.upper(),  # type: ignore # curl-cffi requires uppercase method
                 headers=request.headers,
                 params=request.query_params,
-                data=request.data,
+                data=request.payload,
                 cookies=session.cookies if session else None,
                 allow_redirects=True,
             )
-        except RequestsError as exc:
+        except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
             raise
@@ -150,7 +151,7 @@ async def send_request(
         *,
         method: HttpMethod = 'GET',
         headers: HttpHeaders | None = None,
-        query_params: dict[str, Any] | None = None,
+        query_params: HttpQueryParams | None = None,
         data: dict[str, Any] | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
@@ -161,14 +162,14 @@ async def send_request(
         try:
             response = await client.request(
                 url=url,
-                method=method.upper(),  # curl-cffi requires uppercase method
+                method=method.upper(),  # type: ignore # curl-cffi requires uppercase method
                 headers=headers,
                 params=query_params,
                 data=data,
                 cookies=session.cookies if session else None,
                 allow_redirects=True,
             )
-        except RequestsError as exc:
+        except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
             raise
@@ -194,7 +195,7 @@ def _get_client(self, proxy_url: str | None) -> AsyncSession:
         # are set as default options.
         kwargs: dict[str, Any] = {
             'proxy': proxy_url,
-            'impersonate': BrowserType.chrome,
+            'impersonate': CURL_DEFAULT_CHROME,
         }
 
         # Update the default kwargs with any additional user-provided kwargs.
@@ -206,13 +207,12 @@ def _get_client(self, proxy_url: str | None) -> AsyncSession:
         return self._client_by_proxy_url[proxy_url]
 
     @staticmethod
-    def _is_proxy_error(error: RequestsError) -> bool:
+    def _is_proxy_error(error: CurlRequestError) -> bool:
         """Helper to check whether the given error is a proxy-related error."""
         if any(needle in str(error) for needle in ROTATE_PROXY_ERRORS):
             return True
 
-        # Once https://github.com/yifeikong/curl_cffi/issues/361 is resolved, do it better.
-        if 'CONNECT tunnel failed' in str(error):  # noqa: SIM103
+        if isinstance(error, CurlProxyError):  # noqa: SIM103
             return True
 
         return False
```
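This file carries the substance of the migration: `curl-cffi` 0.7.2 moved its exceptions from `curl_cffi.requests.errors` to `curl_cffi.requests.exceptions`, renamed the catch-all `RequestsError` to `RequestException`, introduced a dedicated `ProxyError` (replacing the old string match on 'CONNECT tunnel failed'), and swapped the `BrowserType.chrome` enum member for the `DEFAULT_CHROME` constant. A hedged sketch of the same post-0.7.2 exception handling outside of Crawlee:

```python
# A hedged sketch of the exception layout this commit adopts (curl-cffi >= 0.7.2).
import asyncio

from curl_cffi.requests import AsyncSession
from curl_cffi.requests.exceptions import ProxyError, RequestException


async def fetch(url: str) -> int | None:
    async with AsyncSession() as session:
        try:
            response = await session.request(method='GET', url=url, allow_redirects=True)
        except ProxyError:
            return None  # proxy failures are a distinct subclass -- no string matching
        except RequestException:
            raise  # catch-all for any other transport-level failure
        return response.status_code


print(asyncio.run(fetch('https://example.com')))
```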

src/crawlee/memory_storage_client/_request_queue_client.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -504,9 +504,12 @@ async def _delete_request_file_from_storage(self, *, request_id: str, entity_dir
     def _json_to_request(self, request_json: str | None) -> Request | None:
         if request_json is None:
             return None
+
         request_dict = filter_out_none_values_recursively(json.loads(request_json))
+
         if request_dict is None:
             return None
+
         return Request.model_validate(request_dict)
 
     async def _create_internal_request(self, request: Request, forefront: bool | None) -> Request:
@@ -525,7 +528,6 @@ async def _create_internal_request(self, request: Request, forefront: bool | None) -> Request:
             retry_count=request.retry_count,
             order_no=order_no,
             json_=json_request,
-            user_data={},
         )
 
     def _calculate_order_no(self, request: Request, forefront: bool | None) -> Decimal | None:
```
