Skip to content

Commit febd963

Browse files
committed
Fix CrawleeRequestData leaking into user_data in to_apify_request during Scrapy-Apify roundtrips
1 parent c39ba97 commit febd963

File tree

2 files changed

+14
-2
lines changed

2 files changed

+14
-2
lines changed

src/apify/scrapy/requests.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from scrapy.http.headers import Headers
1111
from scrapy.utils.request import request_from_dict
1212

13+
from crawlee._request import UserData
1314
from crawlee._types import HttpHeaders
1415

1516
from apify import Request as ApifyRequest
@@ -52,7 +53,19 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ
5253
if scrapy_request.meta.get('apify_request_id'):
5354
request_kwargs['id'] = scrapy_request.meta['apify_request_id']
5455

55-
request_kwargs['user_data'] = scrapy_request.meta.get('userData', {})
56+
user_data = scrapy_request.meta.get('userData', {})
57+
58+
# Convert UserData Pydantic model to a plain dict to prevent CrawleeRequestData objects
59+
# from leaking into Request.from_url() during Scrapy-Apify roundtrips.
60+
if isinstance(user_data, UserData):
61+
user_data = user_data.model_dump(by_alias=True)
62+
63+
# Remove internal Crawlee data since it's managed by Request.from_url() and values
64+
# from previous roundtrips cause incorrect state.
65+
if isinstance(user_data, dict):
66+
user_data.pop('__crawlee', None)
67+
68+
request_kwargs['user_data'] = user_data if isinstance(user_data, dict) else {}
5669

5770
# Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
5871
if isinstance(scrapy_request.headers, Headers):

tests/unit/scrapy/requests/test_to_apify_request.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,6 @@ def test_invalid_scrapy_request_returns_none(spider: Spider) -> None:
9292
assert apify_request is None
9393

9494

95-
@pytest.mark.xfail(reason='CrawleeRequestData object leaks into UserData extras after two roundtrips')
9695
def test_roundtrip_follow_up_request_with_propagated_userdata(spider: Spider) -> None:
9796
"""Reproduce: CrawleeRequestData() argument after ** must be a mapping, not CrawleeRequestData.
9897

0 commit comments

Comments
 (0)