|
10 | 10 | from scrapy.http.headers import Headers |
11 | 11 | from scrapy.utils.request import request_from_dict |
12 | 12 |
|
| 13 | +from crawlee._request import UserData |
13 | 14 | from crawlee._types import HttpHeaders |
14 | 15 |
|
15 | 16 | from apify import Request as ApifyRequest |
@@ -52,7 +53,19 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ |
52 | 53 | if scrapy_request.meta.get('apify_request_id'): |
53 | 54 | request_kwargs['id'] = scrapy_request.meta['apify_request_id'] |
54 | 55 |
|
55 | | - request_kwargs['user_data'] = scrapy_request.meta.get('userData', {}) |
| 56 | + user_data = scrapy_request.meta.get('userData', {}) |
| 57 | + |
| 58 | + # Convert UserData Pydantic model to a plain dict to prevent CrawleeRequestData objects |
| 59 | + # from leaking into Request.from_url() during Scrapy-Apify roundtrips. |
| 60 | + if isinstance(user_data, UserData): |
| 61 | + user_data = user_data.model_dump(by_alias=True) |
| 62 | + |
| 63 | + # Remove internal Crawlee data since it's managed by Request.from_url() and values |
| 64 | + # from previous roundtrips cause incorrect state. |
| 65 | + if isinstance(user_data, dict): |
| 66 | + user_data.pop('__crawlee', None) |
| 67 | + |
| 68 | + request_kwargs['user_data'] = user_data if isinstance(user_data, dict) else {} |
56 | 69 |
|
57 | 70 | # Convert Scrapy's headers to a HttpHeaders and store them in the apify_request |
58 | 71 | if isinstance(scrapy_request.headers, Headers): |
|
0 commit comments