Skip to content

Commit b0b7df7

Browse files
authored
fix(scrapy): Avoid mutating request userData during Scrapy-Apify conversion (#978)
`to_apify_request` serialized the Scrapy request **after** `Request.from_url()` had mutated the shared `meta['userData']` dict (same bug class as #832). `Request.from_url()` injects a live `CrawleeRequestData` object under `__crawlee` into the very `user_data` dict it receives — which was the spider's own `meta['userData']`. Because `scrapy_request.to_dict()` ran afterward, two things went wrong: - the spider's own request `meta['userData']` was mutated in place, and - the serialized `scrapy_request` blob stored on the platform embedded redundant Crawlee internals in every request. **Fix:** capture `scrapy_request.to_dict()` *before* calling `from_url()`, and pass `from_url()` a copy of `user_data` (`dict(user_data)` for the plain-dict branch; `model_dump()` already returns a fresh dict). The spider's request stays untouched and the stored blob is free of injected Crawlee data. Added two regression tests covering both the no-mutation guarantee and the clean serialized blob.
1 parent 53122e4 commit b0b7df7

2 files changed

Lines changed: 35 additions & 2 deletions

File tree

src/apify/scrapy/requests.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
from copy import deepcopy
34
from logging import getLogger
45
from typing import Any, cast
56

@@ -80,12 +81,22 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ
8081
elif scrapy_request.meta.get('apify_request_unique_key'):
8182
request_kwargs['unique_key'] = scrapy_request.meta['apify_request_unique_key']
8283

84+
# Serialize the Scrapy request now, before `Request.from_url()` runs below. `from_url()` mutates the
85+
# `user_data` dict it receives in place (it injects a live `CrawleeRequestData` under `__crawlee`), and that
86+
# dict can be the spider's own `meta['userData']`. Capturing `to_dict()` first keeps the stored blob free of
87+
# those injected internals, and copying `user_data` below leaves the spider's request untouched.
88+
scrapy_request_dict = scrapy_request.to_dict(spider=spider)
89+
8390
user_data = scrapy_request.meta.get('userData', {})
8491

8592
# Convert UserData Pydantic model to a plain dict to prevent CrawleeRequestData objects from leaking
86-
# into Request.from_url() during Scrapy-Apify roundtrips.
93+
# into Request.from_url() during Scrapy-Apify roundtrips. `model_dump()` already returns a fresh, fully
94+
# detached dict; the plain-dict case is deep-copied so that neither the `pop` and `from_url()` mutations
95+
# below nor any mutation of a nested value can ever reach back into the spider's meta.
8796
if isinstance(user_data, UserData):
8897
user_data = user_data.model_dump(by_alias=True)
98+
elif isinstance(user_data, dict):
99+
user_data = deepcopy(user_data)
89100

90101
# Remove internal Crawlee data since it's managed by Request.from_url() and values from previous roundtrips
91102
# cause incorrect state.
@@ -117,7 +128,6 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ
117128
)
118129

119130
apify_request = ApifyRequest.from_url(**request_kwargs)
120-
scrapy_request_dict = scrapy_request.to_dict(spider=spider)
121131

122132
except Exception as exc:
123133
logger.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}')

tests/unit/scrapy/requests/test_to_apify_request.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import json
34
import logging
45
from typing import cast
56

@@ -140,6 +141,28 @@ def test_roundtrip_follow_up_request_with_propagated_userdata(spider: Spider) ->
140141
assert follow_up_apify_request.url == 'https://example.com/image.png'
141142

142143

144+
def test_does_not_mutate_spider_request_user_data(spider: Spider) -> None:
145+
"""Conversion must not mutate the spider's own `meta['userData']`, including nested values, in place."""
146+
user_data = {'some_user_data': 'test', 'nested': {'key': 'value'}}
147+
scrapy_request = Request(url='https://example.com', meta={'userData': user_data})
148+
149+
to_apify_request(scrapy_request, spider)
150+
151+
assert user_data == {'some_user_data': 'test', 'nested': {'key': 'value'}}
152+
assert '__crawlee' not in user_data
153+
154+
155+
def test_serialized_request_omits_injected_crawlee_data(spider: Spider) -> None:
156+
"""The stored `scrapy_request` blob must not embed the `__crawlee` data `Request.from_url()` injects."""
157+
scrapy_request = Request(url='https://example.com', meta={'userData': {'some_user_data': 'test'}})
158+
159+
apify_request = to_apify_request(scrapy_request, spider)
160+
assert apify_request is not None
161+
162+
stored = json.loads(cast('str', apify_request.user_data['scrapy_request']))
163+
assert '__crawlee' not in stored['meta'].get('userData', {})
164+
165+
143166
def test_dont_filter_request_is_always_enqueued(spider: Spider) -> None:
144167
"""A `dont_filter=True` request is always enqueued: each conversion gets a fresh unique key, bypassing dedup."""
145168
first = to_apify_request(Request(url='https://example.com', dont_filter=True), spider)

0 commit comments

Comments
 (0)