Skip to content

Commit 3b9d588

Browse files
vdusek and claude authored
fix(scrapy): Resolve Crawlee's request data round-trip failure in request conversion (#832)
## Summary - Adds a reproduction test for a bug where `to_apify_request()` fails with `CrawleeRequestData() argument after ** must be a mapping, not CrawleeRequestData` after two roundtrips through Scrapy↔Apify request conversion when spiders propagate `userData` to follow-up requests - Root cause: `Request.from_url()` writes a `CrawleeRequestData` object into `UserData.__pydantic_extra__['__crawlee']`, which is then found by `.get('__crawlee')` on the next roundtrip and passed to `CrawleeRequestData(**obj)` instead of `CrawleeRequestData(**dict)` - Test is marked `xfail` until the fix is applied 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent e343ed5 commit 3b9d588

File tree

2 files changed

+49
-2
lines changed

2 files changed

+49
-2
lines changed

src/apify/scrapy/requests.py

Lines changed: 14 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -10,6 +10,7 @@
1010
from scrapy.http.headers import Headers
1111
from scrapy.utils.request import request_from_dict
1212

13+
from crawlee._request import UserData
1314
from crawlee._types import HttpHeaders
1415

1516
from apify import Request as ApifyRequest
@@ -52,7 +53,19 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ
5253
if scrapy_request.meta.get('apify_request_id'):
5354
request_kwargs['id'] = scrapy_request.meta['apify_request_id']
5455

55-
request_kwargs['user_data'] = scrapy_request.meta.get('userData', {})
56+
user_data = scrapy_request.meta.get('userData', {})
57+
58+
# Convert UserData Pydantic model to a plain dict to prevent CrawleeRequestData objects
59+
# from leaking into Request.from_url() during Scrapy-Apify roundtrips.
60+
if isinstance(user_data, UserData):
61+
user_data = user_data.model_dump(by_alias=True)
62+
63+
# Remove internal Crawlee data since it's managed by Request.from_url() and values
64+
# from previous roundtrips cause incorrect state.
65+
if isinstance(user_data, dict):
66+
user_data.pop('__crawlee', None)
67+
68+
request_kwargs['user_data'] = user_data if isinstance(user_data, dict) else {}
5669

5770
# Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
5871
if isinstance(scrapy_request.headers, Headers):

tests/unit/scrapy/requests/test_to_apify_request.py

Lines changed: 35 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -6,7 +6,7 @@
66

77
from crawlee._types import HttpHeaders
88

9-
from apify.scrapy.requests import to_apify_request
9+
from apify.scrapy.requests import to_apify_request, to_scrapy_request
1010

1111

1212
class DummySpider(Spider):
@@ -90,3 +90,37 @@ def test_invalid_scrapy_request_returns_none(spider: Spider) -> None:
9090

9191
apify_request = to_apify_request(scrapy_request, spider) # ty: ignore[invalid-argument-type]
9292
assert apify_request is None
93+
94+
95+
def test_roundtrip_follow_up_request_with_propagated_userdata(spider: Spider) -> None:
    """Reproduce: CrawleeRequestData() argument after ** must be a mapping, not CrawleeRequestData.

    After two roundtrips through to_apify_request/to_scrapy_request with userData propagation,
    Request.from_url() writes a CrawleeRequestData object into UserData.__pydantic_extra__['__crawlee'].
    On the next roundtrip, this CrawleeRequestData object is found by user_data_dict.get('__crawlee')
    and passed to CrawleeRequestData(**obj), which fails because CrawleeRequestData is not a mapping.
    """
    # Step 1: Initial request -> first roundtrip through the Scrapy<->Apify converters.
    initial_scrapy_request = Request(url='https://example.com/page')
    apify_request_1 = to_apify_request(initial_scrapy_request, spider)
    assert apify_request_1 is not None
    scrapy_request_1 = to_scrapy_request(apify_request_1, spider)

    # Step 2: Spider yields a follow-up with propagated userData -> second roundtrip.
    follow_up_1 = Request(
        url='https://example.com/page2',
        meta={'userData': scrapy_request_1.meta['userData']},
    )
    apify_request_2 = to_apify_request(follow_up_1, spider)
    assert apify_request_2 is not None
    scrapy_request_2 = to_scrapy_request(apify_request_2, spider)

    # Step 3: Spider yields another follow-up with propagated userData from the 2nd roundtrip.
    # This is the failing step: userData now carries __crawlee as a CrawleeRequestData object
    # inside __pydantic_extra__, which the converter previously passed to CrawleeRequestData(**obj).
    follow_up_2 = Request(
        url='https://example.com/image.png',
        meta={'userData': scrapy_request_2.meta['userData']},
    )
    follow_up_apify_request = to_apify_request(follow_up_2, spider)
    assert follow_up_apify_request is not None
    assert follow_up_apify_request.url == 'https://example.com/image.png'

0 commit comments

Comments
 (0)