Skip to content

Commit 3b9d588

Browse files
vdusek and claude authored
fix(scrapy): Resolve Crawlee's request data round-trip failure in request conversion (#832)
## Summary - Adds a reproduction test for a bug where `to_apify_request()` fails with `CrawleeRequestData() argument after ** must be a mapping, not CrawleeRequestData` after two roundtrips through Scrapy↔Apify request conversion when spiders propagate `userData` to follow-up requests - Root cause: `Request.from_url()` writes a `CrawleeRequestData` object into `UserData.__pydantic_extra__['__crawlee']`, which is then found by `.get('__crawlee')` on the next roundtrip and passed to `CrawleeRequestData(**obj)` instead of `CrawleeRequestData(**dict)` - Test is marked `xfail` until the fix is applied 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent e343ed5 commit 3b9d588

File tree

2 files changed

+49
-2
lines changed

2 files changed

+49
-2
lines changed

src/apify/scrapy/requests.py

Lines changed: 14 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -10,6 +10,7 @@
1010
from scrapy.http.headers import Headers
1111
from scrapy.utils.request import request_from_dict
1212

13+
from crawlee._request import UserData
1314
from crawlee._types import HttpHeaders
1415

1516
from apify import Request as ApifyRequest
@@ -52,7 +53,19 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ
5253
if scrapy_request.meta.get('apify_request_id'):
5354
request_kwargs['id'] = scrapy_request.meta['apify_request_id']
5455

55-
request_kwargs['user_data'] = scrapy_request.meta.get('userData', {})
56+
user_data = scrapy_request.meta.get('userData', {})
57+
58+
# Convert UserData Pydantic model to a plain dict to prevent CrawleeRequestData objects
59+
# from leaking into Request.from_url() during Scrapy-Apify roundtrips.
60+
if isinstance(user_data, UserData):
61+
user_data = user_data.model_dump(by_alias=True)
62+
63+
# Remove internal Crawlee data since it's managed by Request.from_url() and values
64+
# from previous roundtrips cause incorrect state.
65+
if isinstance(user_data, dict):
66+
user_data.pop('__crawlee', None)
67+
68+
request_kwargs['user_data'] = user_data if isinstance(user_data, dict) else {}
5669

5770
# Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
5871
if isinstance(scrapy_request.headers, Headers):

tests/unit/scrapy/requests/test_to_apify_request.py

Lines changed: 35 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -6,7 +6,7 @@
66

77
from crawlee._types import HttpHeaders
88

9-
from apify.scrapy.requests import to_apify_request
9+
from apify.scrapy.requests import to_apify_request, to_scrapy_request
1010

1111

1212
class DummySpider(Spider):
@@ -90,3 +90,37 @@ def test_invalid_scrapy_request_returns_none(spider: Spider) -> None:
9090

9191
apify_request = to_apify_request(scrapy_request, spider) # ty: ignore[invalid-argument-type]
9292
assert apify_request is None
93+
94+
95+
def test_roundtrip_follow_up_request_with_propagated_userdata(spider: Spider) -> None:
    """Reproduce: CrawleeRequestData() argument after ** must be a mapping, not CrawleeRequestData.

    After two roundtrips through to_apify_request/to_scrapy_request with userData propagation,
    Request.from_url() writes a CrawleeRequestData object into UserData.__pydantic_extra__['__crawlee'].
    On the next roundtrip, this CrawleeRequestData object is found by user_data_dict.get('__crawlee')
    and passed to CrawleeRequestData(**obj), which fails because CrawleeRequestData is not a mapping.
    """
    # Step 1: Initial request -> first roundtrip through the Scrapy<->Apify converters.
    initial_scrapy_request = Request(url='https://example.com/page')
    apify_request_1 = to_apify_request(initial_scrapy_request, spider)
    assert apify_request_1 is not None
    scrapy_request_1 = to_scrapy_request(apify_request_1, spider)

    # Step 2: Spider yields a follow-up with propagated userData -> second roundtrip.
    follow_up_1 = Request(
        url='https://example.com/page2',
        meta={'userData': scrapy_request_1.meta['userData']},
    )
    apify_request_2 = to_apify_request(follow_up_1, spider)
    assert apify_request_2 is not None
    scrapy_request_2 = to_scrapy_request(apify_request_2, spider)

    # Step 3: Spider yields another follow-up with propagated userData from the 2nd roundtrip.
    # This is the failing step: userData now carries __crawlee as a CrawleeRequestData object
    # inside __pydantic_extra__, which the converter previously passed to CrawleeRequestData(**obj).
    follow_up_2 = Request(
        url='https://example.com/image.png',
        meta={'userData': scrapy_request_2.meta['userData']},
    )
    follow_up_apify_request = to_apify_request(follow_up_2, spider)
    assert follow_up_apify_request is not None
    assert follow_up_apify_request.url == 'https://example.com/image.png'

0 commit comments

Comments
 (0)