|
6 | 6 |
|
7 | 7 | from crawlee._types import HttpHeaders |
8 | 8 |
|
9 | | -from apify.scrapy.requests import to_apify_request |
| 9 | +from apify.scrapy.requests import to_apify_request, to_scrapy_request |
10 | 10 |
|
11 | 11 |
|
12 | 12 | class DummySpider(Spider): |
@@ -90,3 +90,38 @@ def test_invalid_scrapy_request_returns_none(spider: Spider) -> None: |
90 | 90 |
|
91 | 91 | apify_request = to_apify_request(scrapy_request, spider) # ty: ignore[invalid-argument-type] |
92 | 92 | assert apify_request is None |
| 93 | + |
| 94 | + |
@pytest.mark.xfail(reason='CrawleeRequestData object leaks into UserData extras after two roundtrips')
def test_roundtrip_follow_up_request_with_propagated_userdata(spider: Spider) -> None:
    """Reproduce: CrawleeRequestData() argument after ** must be a mapping, not CrawleeRequestData.

    After two roundtrips through to_apify_request/to_scrapy_request with userData propagation,
    Request.from_url() writes a CrawleeRequestData object into UserData.__pydantic_extra__['__crawlee'].
    On the next roundtrip, this CrawleeRequestData object is found by user_data_dict.get('__crawlee')
    and passed to CrawleeRequestData(**obj), which fails because CrawleeRequestData is not a mapping.
    """
    # Roundtrip 1: a fresh request converted to Apify form and back to Scrapy.
    seed_request = Request(url='https://example.com/page')
    converted = to_apify_request(seed_request, spider)
    assert converted is not None
    roundtripped = to_scrapy_request(converted, spider)

    # Roundtrip 2: a follow-up request that carries forward the propagated userData.
    second_request = Request(
        url='https://example.com/page2',
        meta={'userData': roundtripped.meta['userData']},
    )
    converted = to_apify_request(second_request, spider)
    assert converted is not None
    roundtripped = to_scrapy_request(converted, spider)

    # Conversion 3: by now userData holds a CrawleeRequestData object under
    # '__crawlee' in __pydantic_extra__, which is expected to break to_apify_request.
    third_request = Request(
        url='https://example.com/image.png',
        meta={'userData': roundtripped.meta['userData']},
    )
    final_apify_request = to_apify_request(third_request, spider)
    assert final_apify_request is not None
    assert final_apify_request.url == 'https://example.com/image.png'
0 commit comments