Skip to content

Commit c39ba97

Browse files
vdusek and claude
committed
test: Add reproduction test for CrawleeRequestData roundtrip bug
After two roundtrips through to_apify_request/to_scrapy_request with userData propagation, CrawleeRequestData object leaks into UserData.__pydantic_extra__['__crawlee'], causing subsequent conversions to fail with "argument after ** must be a mapping, not CrawleeRequestData". Marked as xfail until the fix is applied. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent e343ed5 commit c39ba97

File tree

1 file changed

+36
-1
lines changed

1 file changed

+36
-1
lines changed

tests/unit/scrapy/requests/test_to_apify_request.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from crawlee._types import HttpHeaders
88

9-
from apify.scrapy.requests import to_apify_request
9+
from apify.scrapy.requests import to_apify_request, to_scrapy_request
1010

1111

1212
class DummySpider(Spider):
@@ -90,3 +90,38 @@ def test_invalid_scrapy_request_returns_none(spider: Spider) -> None:
9090

9191
apify_request = to_apify_request(scrapy_request, spider) # ty: ignore[invalid-argument-type]
9292
assert apify_request is None
93+
94+
95+
@pytest.mark.xfail(reason='CrawleeRequestData object leaks into UserData extras after two roundtrips')
def test_roundtrip_follow_up_request_with_propagated_userdata(spider: Spider) -> None:
    """Reproduce: CrawleeRequestData() argument after ** must be a mapping, not CrawleeRequestData.

    After two roundtrips through to_apify_request/to_scrapy_request with userData propagation,
    Request.from_url() writes a CrawleeRequestData object into UserData.__pydantic_extra__['__crawlee'].
    On the next roundtrip, this CrawleeRequestData object is found by user_data_dict.get('__crawlee')
    and passed to CrawleeRequestData(**obj), which fails because CrawleeRequestData is not a mapping.
    """

    def roundtrip(scrapy_request: Request) -> Request:
        # One full Scrapy -> Apify -> Scrapy conversion cycle.
        apify_request = to_apify_request(scrapy_request, spider)
        assert apify_request is not None
        return to_scrapy_request(apify_request, spider)

    # First roundtrip: a plain initial request, no propagated userData yet.
    first_result = roundtrip(Request(url='https://example.com/page'))

    # Second roundtrip: a follow-up request carrying the userData produced
    # by the first conversion cycle.
    second_result = roundtrip(
        Request(
            url='https://example.com/page2',
            meta={'userData': first_result.meta['userData']},
        )
    )

    # Third conversion: by now userData holds __crawlee as a CrawleeRequestData
    # object in __pydantic_extra__, which is what triggers the failure.
    final_follow_up = Request(
        url='https://example.com/image.png',
        meta={'userData': second_result.meta['userData']},
    )
    final_apify_request = to_apify_request(final_follow_up, spider)
    assert final_apify_request is not None
    assert final_apify_request.url == 'https://example.com/image.png'

0 commit comments

Comments (0)