Skip to content

Commit 596821b

Browse files
vdusek and claude authored
test: add e2e tests for Scrapy spiders running as Apify Actors (#788)
## Summary Add 5 new e2e tests covering key Scrapy spider types and integration patterns, all using a local HTTP server for reliability. Tests cover basic Spider, CrawlSpider with rules, ItemLoader, cb_kwargs serialization, and custom pipeline merging with apply_apify_settings. ## Motivation - Better test coverage for the Scrapy integration: until now there was only a single e2e test. ## Issue - Relates: #785 ## Test plan - [x] CI passes --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 66ed5ea commit 596821b

21 files changed

+545
-0
lines changed

tests/e2e/test_scrapy/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

tests/e2e/test_scrapy/actor_source/__init__.py

Whitespace-only changes.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
"""Entry point for the Scrapy Actor: installs the asyncio reactor, then boots the crawl."""

from __future__ import annotations

from scrapy.utils.reactor import install_reactor

# The asyncio reactor must be installed BEFORE any module that pulls in
# Twisted (the apify.scrapy imports below do), otherwise Twisted would
# auto-install its default reactor and the install here would fail.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')

# These imports intentionally come after install_reactor; the noqa markers
# suppress the "import not at top of file" lint that ordering requires.
import os  # noqa: E402, I001

from apify.scrapy import initialize_logging, run_scrapy_actor  # noqa: E402

from .main import main  # noqa: E402

# Tell Scrapy which settings module to load when the crawler starts.
os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'

if __name__ == '__main__':
    initialize_logging()
    # Bridge the asyncio coroutine into Twisted's reactor loop.
    run_scrapy_actor(main())
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from __future__ import annotations
2+
3+
from scrapy import Field, Item
4+
5+
6+
class ProductItem(Item):
    """Scraped product record produced by the e2e test spiders."""

    # name: product display name; url: page the item was scraped from;
    # price: price string as it appears on the page (e.g. '$19.99');
    # description: free-text product description.
    name = Field()
    url = Field()
    price = Field()
    description = Field()
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from __future__ import annotations # noqa: I001
2+
3+
from scrapy.crawler import CrawlerRunner
4+
from scrapy.utils.defer import deferred_to_future
5+
6+
from apify import Actor
7+
from apify.scrapy import apply_apify_settings
8+
9+
from .spiders import Spider # ty: ignore[unresolved-import]
10+
11+
12+
async def main() -> None:
    """Run the test Spider against the local HTTP server inside an Actor context."""
    async with Actor:
        # Merge Apify-specific overrides into the project's Scrapy settings.
        crawler_settings = apply_apify_settings()
        crawler_runner = CrawlerRunner(crawler_settings)
        crawl_deferred = crawler_runner.crawl(Spider, start_urls=['http://localhost:8080/'])
        # Convert the Twisted Deferred into an awaitable so the Actor
        # context stays open until the crawl finishes.
        await deferred_to_future(crawl_deferred)
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from __future__ import annotations # noqa: I001
2+
3+
import os
4+
5+
from scrapy.crawler import CrawlerRunner
6+
from scrapy.utils.defer import deferred_to_future
7+
8+
from apify import Actor
9+
from apify.scrapy import apply_apify_settings
10+
11+
from .spiders import Spider # ty: ignore[unresolved-import]
12+
13+
14+
async def main() -> None:
    """Run the Spider with the custom-pipeline settings module inside an Actor context."""
    async with Actor:
        # Point Scrapy at the settings variant that registers the custom
        # pipeline; must happen before apply_apify_settings reads it.
        os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings_custom_pipeline'
        merged_settings = apply_apify_settings()
        crawl_deferred = CrawlerRunner(merged_settings).crawl(
            Spider, start_urls=['http://localhost:8080/']
        )
        await deferred_to_future(crawl_deferred)
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
5+
if TYPE_CHECKING:
6+
from scrapy import Item, Spider
7+
8+
9+
class PriceCleanerPipeline:
    """Item pipeline that normalises the `price` field by stripping any
    leading '$' characters from string values; other items pass through
    unchanged."""

    def process_item(self, item: Item, _: Spider) -> Item:
        """Return *item*, with `price` cleaned when it is a string."""
        raw_price = item['price'] if 'price' in item else None
        if isinstance(raw_price, str):
            item['price'] = raw_price.lstrip('$')
        return item
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
from __future__ import annotations
2+
3+
import asyncio
4+
import logging
5+
from collections.abc import Awaitable, Callable, Coroutine
6+
from typing import Any
7+
8+
from uvicorn import Config
9+
from uvicorn.server import Server
10+
11+
# Minimal ASGI callable type aliases — just enough for this test server.
Receive = Callable[[], Awaitable[dict[str, Any]]]
Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]

# Fake product catalogue served at /products/<id>; prices keep the '$'
# prefix so pipeline tests can verify it is stripped.
_PRODUCTS = {
    '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'},
    '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'},
    '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'},
}
19+
20+
21+
async def _send_html(send: Send, html: str, status: int = 200) -> None:
22+
await send(
23+
{
24+
'type': 'http.response.start',
25+
'status': status,
26+
'headers': [[b'content-type', b'text/html; charset=utf-8']],
27+
}
28+
)
29+
await send({'type': 'http.response.body', 'body': html.encode()})
30+
31+
32+
async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """ASGI application implementing the fake e-commerce store.

    Routes: '/' (home page with product links), '/products/<id>' (product
    detail or 404), '/about' (static page); anything else is a 404.
    """
    assert scope['type'] == 'http'
    request_path = scope['path']

    if request_path == '/':
        await _send_html(
            send,
            '<html><head><title>E-commerce Test Store</title></head><body>'
            '<h1>Welcome to Test Store</h1>'
            '<a href="/products/1">Widget A</a>'
            '<a href="/products/2">Widget B</a>'
            '<a href="/products/3">Widget C</a>'
            '<a href="/about">About Us</a>'
            '</body></html>',
        )
        return

    if request_path == '/about':
        await _send_html(
            send,
            '<html><head><title>About Us</title></head><body>'
            '<h1>About Test Store</h1>'
            '<p class="description">We sell the best widgets in the world.</p>'
            '<a href="/">Back to Home</a>'
            '</body></html>',
        )
        return

    if request_path.startswith('/products/'):
        # The last path segment is the product id.
        details = _PRODUCTS.get(request_path.split('/')[-1])
        if details is not None:
            await _send_html(
                send,
                f'<html><head><title>{details["name"]}</title></head><body>'
                f'<h1>{details["name"]}</h1>'
                f'<span class="price">{details["price"]}</span>'
                f'<p class="description">{details["description"]}</p>'
                '<a href="/">Back to Home</a>'
                '</body></html>',
            )
            return

    # Unknown route or unknown product id.
    await _send_html(send, '<html><body>Not Found</body></html>', 404)
72+
73+
74+
if __name__ == '__main__':
    # Serve on port 8080 with uvicorn's logging silenced so the server
    # does not pollute test output.
    server_config = Config(
        app=app,
        lifespan='off',
        loop='asyncio',
        port=8080,
        log_config=None,
        log_level=logging.CRITICAL,
    )
    asyncio.run(Server(config=server_config).serve())
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Scrapy settings for the e2e test actors, kept in alphabetical order.
# (HTTPCACHE_ENABLED previously broke the ordering by being appended last.)

BOT_NAME = 'testbot'
# Disable the HTTP cache so every run hits the local test server fresh.
HTTPCACHE_ENABLED = False
LOG_LEVEL = 'INFO'
NEWSPIDER_MODULE = 'src.spiders'
# The target is a local test server, so robots.txt is irrelevant.
ROBOTSTXT_OBEY = False
SPIDER_MODULES = ['src.spiders']
TELNETCONSOLE_ENABLED = False
# Must match the reactor installed by the actor's __main__ module.
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Settings variant that reuses the base project settings and adds a
# custom item pipeline on top.
from src.settings import *  # noqa: F403 # ty: ignore[unresolved-import]

# Register the price-cleaning pipeline; the integer is the pipeline
# order (lower values run earlier in Scrapy's pipeline chain).
ITEM_PIPELINES = {
    'src.pipelines.PriceCleanerPipeline': 100,
}

0 commit comments

Comments
 (0)