fix(scrapy): Avoid mutating request userData during Scrapy-Apify conversion (#978)

vdusek · web-flow · commit b0b7df72eb16 · 2026-06-18T14:05:40.000+02:00
`to_apify_request` serialized the Scrapy request **after** `Request.from_url()` had mutated the shared `meta['userData']` dict (same bug class as #832). `Request.from_url()` injects a live `CrawleeRequestData` object under `__crawlee` into the very `user_data` dict it receives — which was the spider's own `meta['userData']`. Because `scrapy_request.to_dict()` ran afterward, two things went wrong: (1) the spider's own request `meta['userData']` was mutated in place, and (2) the serialized `scrapy_request` blob stored on the platform embedded redundant Crawlee internals in every request. **Fix:** capture `scrapy_request.to_dict()` *before* calling `from_url()`, and pass `from_url()` a copy of `user_data` (`deepcopy(user_data)` for the plain-dict branch, so that nested values are detached as well; `model_dump()` already returns a fresh dict). The spider's request stays untouched and the stored blob is free of injected Crawlee data. Added two regression tests covering both the no-mutation guarantee and the clean serialized blob.
diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from copy import deepcopy
 from logging import getLogger
 from typing import Any, cast
 
@@ -80,12 +81,22 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ
         elif scrapy_request.meta.get('apify_request_unique_key'):
             request_kwargs['unique_key'] = scrapy_request.meta['apify_request_unique_key']
 
+        # Serialize the Scrapy request now, before `Request.from_url()` runs below. `from_url()` mutates the
+        # `user_data` dict it receives in place (it injects a live `CrawleeRequestData` under `__crawlee`), and that
+        # dict can be the spider's own `meta['userData']`. Capturing `to_dict()` first keeps the stored blob free of
+        # those injected internals, and copying `user_data` below leaves the spider's request untouched.
+        scrapy_request_dict = scrapy_request.to_dict(spider=spider)
+
         user_data = scrapy_request.meta.get('userData', {})
 
         # Convert UserData Pydantic model to a plain dict to prevent CrawleeRequestData objects from leaking
-        # into Request.from_url() during Scrapy-Apify roundtrips.
+        # into Request.from_url() during Scrapy-Apify roundtrips. `model_dump()` already returns a fresh, fully
+        # detached dict; the plain-dict case is deep-copied so that neither the `pop` and `from_url()` mutations
+        # below nor any mutation of a nested value can ever reach back into the spider's meta.
         if isinstance(user_data, UserData):
             user_data = user_data.model_dump(by_alias=True)
+        elif isinstance(user_data, dict):
+            user_data = deepcopy(user_data)
 
         # Remove internal Crawlee data since it's managed by Request.from_url() and values from previous roundtrips
         # cause incorrect state.
@@ -117,7 +128,6 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ
             )
 
         apify_request = ApifyRequest.from_url(**request_kwargs)
-        scrapy_request_dict = scrapy_request.to_dict(spider=spider)
 
     except Exception as exc:
         logger.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}')
diff --git a/tests/unit/scrapy/requests/test_to_apify_request.py b/tests/unit/scrapy/requests/test_to_apify_request.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import json
 import logging
 from typing import cast
 
@@ -140,6 +141,28 @@ def test_roundtrip_follow_up_request_with_propagated_userdata(spider: Spider) ->
     assert follow_up_apify_request.url == 'https://example.com/image.png'
 
 
+def test_does_not_mutate_spider_request_user_data(spider: Spider) -> None:
+    """Conversion must not mutate the spider's own `meta['userData']`, including nested values, in place."""
+    user_data = {'some_user_data': 'test', 'nested': {'key': 'value'}}
+    scrapy_request = Request(url='https://example.com', meta={'userData': user_data})
+
+    to_apify_request(scrapy_request, spider)
+
+    assert user_data == {'some_user_data': 'test', 'nested': {'key': 'value'}}
+    assert '__crawlee' not in user_data
+
+
+def test_serialized_request_omits_injected_crawlee_data(spider: Spider) -> None:
+    """The stored `scrapy_request` blob must not embed the `__crawlee` data `Request.from_url()` injects."""
+    scrapy_request = Request(url='https://example.com', meta={'userData': {'some_user_data': 'test'}})
+
+    apify_request = to_apify_request(scrapy_request, spider)
+    assert apify_request is not None
+
+    stored = json.loads(cast('str', apify_request.user_data['scrapy_request']))
+    assert '__crawlee' not in stored['meta'].get('userData', {})
+
+
 def test_dont_filter_request_is_always_enqueued(spider: Spider) -> None:
     """A `dont_filter=True` request is always enqueued: each conversion gets a fresh unique key, bypassing dedup."""
     first = to_apify_request(Request(url='https://example.com', dont_filter=True), spider)