Skip to content

Commit 8dd26ac

Browse files
vdusek and claude committed
test: extract inline Python strings to dedicated files in crawlee crawler e2e tests
Move Actor source code from triple-quoted string constants into standalone files under actor_source/, so they benefit from syntax highlighting, linting, and type-checking. Load them at runtime via Path.read_text() helpers. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 6a28b53 commit 8dd26ac

15 files changed

+409
-375
lines changed
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
FROM apify/actor-python-playwright:PYTHON_VERSION_PLACEHOLDER

# Bring the Actor source (src/, server.py, requirements.txt) into the image.
COPY . ./

# git is installed before pip runs — presumably requirements.txt contains
# VCS-based requirements (TODO confirm). --no-install-recommends keeps the
# layer minimal and the apt lists are purged to shrink the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends git \
    && rm -rf /var/lib/apt/lists/*

RUN pip install --force-reinstall -r requirements.txt

# Run the local test web server in the background alongside the Actor entrypoint.
CMD ["sh", "-c", "python server.py & python -m src"]
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from __future__ import annotations
2+
3+
from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext
4+
5+
from apify import Actor
6+
7+
8+
async def main() -> None:
    """Crawl the local test store with an AdaptivePlaywrightCrawler.

    Visits every reachable page, pushes product details found on
    ``/products/`` pages, and records a summary under ``CRAWLER_RESULT``.
    """
    async with Actor:
        visited: list[str] = []
        crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser()

        @crawler.router.default_handler
        async def handler(context: AdaptivePlaywrightCrawlingContext) -> None:
            visited.append(context.request.url)
            await context.enqueue_links()

            # Only product detail pages carry the fields we scrape.
            if '/products/' not in context.request.url:
                return

            selector = context.parsed_content
            name = selector.css('h1::text').get('').strip()
            price = selector.css('span.price::text').get('').strip()
            description = selector.css('p.description::text').get('').strip()
            if not name:
                return

            await context.push_data(
                {
                    'url': context.request.url,
                    'name': name,
                    'price': price,
                    'description': description,
                }
            )

        await crawler.run(['http://localhost:8080/'])

        # Summary consumed by the e2e test assertions.
        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(visited),
                'crawler_type': 'AdaptivePlaywrightCrawler',
            },
        )
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
from __future__ import annotations
2+
3+
from html.parser import HTMLParser
4+
from typing import TYPE_CHECKING
5+
6+
from crawlee.crawlers import BasicCrawler
7+
8+
from apify import Actor
9+
10+
if TYPE_CHECKING:
11+
from crawlee._types import BasicCrawlingContext
12+
13+
14+
class _PageParser(HTMLParser):
15+
def __init__(self) -> None:
16+
super().__init__()
17+
self.links: list[str] = []
18+
self.data: dict[str, str] = {}
19+
self._in_tag: str | None = None
20+
self._in_class: str = ''
21+
22+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
23+
attrs_dict = dict(attrs)
24+
if tag == 'a' and (href := attrs_dict.get('href')):
25+
self.links.append(href)
26+
self._in_tag = tag
27+
self._in_class = attrs_dict.get('class', '') or ''
28+
29+
def handle_endtag(self, tag: str) -> None: # noqa: ARG002
30+
self._in_tag = None
31+
self._in_class = ''
32+
33+
def handle_data(self, data: str) -> None:
34+
text = data.strip()
35+
if not text:
36+
return
37+
if self._in_tag == 'h1':
38+
self.data['name'] = text
39+
elif self._in_tag == 'span' and self._in_class == 'price':
40+
self.data['price'] = text
41+
elif self._in_tag == 'p' and self._in_class == 'description':
42+
self.data['description'] = text
43+
44+
45+
async def main() -> None:
    """Crawl the local test store with a BasicCrawler.

    Fetches each page manually, parses it with ``_PageParser``, enqueues
    site-relative links, pushes product details, and records a summary
    under ``CRAWLER_RESULT``.
    """
    async with Actor:
        visited: list[str] = []
        crawler = BasicCrawler()

        @crawler.router.default_handler
        async def handler(context: BasicCrawlingContext) -> None:
            visited.append(context.request.url)

            # BasicCrawler does not fetch for us — request the page explicitly.
            response = await context.send_request(context.request.url)
            page_html = (await response.read()).decode()

            parser = _PageParser()
            parser.feed(page_html)

            # Only follow site-relative links, resolved against the test host.
            base = 'http://localhost:8080'
            internal = [f'{base}{href}' for href in parser.links if href.startswith('/')]
            await context.add_requests(internal)

            if '/products/' in context.request.url and parser.data.get('name'):
                await context.push_data({'url': context.request.url, **parser.data})

        await crawler.run(['http://localhost:8080/'])

        # Summary consumed by the e2e test assertions.
        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(visited),
                'crawler_type': 'BasicCrawler',
            },
        )
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from __future__ import annotations
2+
3+
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
4+
5+
from apify import Actor
6+
7+
8+
async def main() -> None:
    """Crawl the local test store with a BeautifulSoupCrawler.

    Visits every reachable page, pushes product details found on
    ``/products/`` pages, and records a summary under ``CRAWLER_RESULT``.
    """
    async with Actor:
        visited: list[str] = []
        crawler = BeautifulSoupCrawler()

        @crawler.router.default_handler
        async def handler(context: BeautifulSoupCrawlingContext) -> None:
            visited.append(context.request.url)
            await context.enqueue_links()

            # Only product detail pages carry the fields we scrape.
            if '/products/' not in context.request.url:
                return

            soup = context.soup
            name_tag = soup.find('h1')
            price_tag = soup.find('span', class_='price')
            desc_tag = soup.find('p', class_='description')
            if not name_tag:
                return

            await context.push_data(
                {
                    'url': context.request.url,
                    'name': name_tag.get_text(strip=True),
                    'price': price_tag.get_text(strip=True) if price_tag else '',
                    'description': desc_tag.get_text(strip=True) if desc_tag else '',
                }
            )

        await crawler.run(['http://localhost:8080/'])

        # Summary consumed by the e2e test assertions.
        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(visited),
                'crawler_type': 'BeautifulSoupCrawler',
            },
        )
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from __future__ import annotations
2+
3+
import re
4+
5+
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
6+
7+
from apify import Actor
8+
9+
10+
async def main() -> None:
    """Crawl the local test store with a plain HttpCrawler.

    Parses raw HTML with regular expressions, enqueues site-relative links,
    pushes product details, and records a summary under ``CRAWLER_RESULT``.
    """
    async with Actor:
        visited: list[str] = []
        crawler = HttpCrawler()

        @crawler.router.default_handler
        async def handler(context: HttpCrawlingContext) -> None:
            visited.append(context.request.url)
            page_html = (await context.http_response.read()).decode()

            # Follow every site-relative href found in the raw HTML.
            base = 'http://localhost:8080'
            hrefs = re.findall(r'href="(/[^"]*)"', page_html)
            await context.add_requests([f'{base}{href}' for href in hrefs])

            # Only product detail pages carry the fields we scrape.
            if '/products/' not in context.request.url:
                return

            name = re.search(r'<h1>(.*?)</h1>', page_html)
            price = re.search(r'<span class="price">(.*?)</span>', page_html)
            description = re.search(r'<p class="description">(.*?)</p>', page_html)
            if not name:
                return

            await context.push_data(
                {
                    'url': context.request.url,
                    'name': name.group(1),
                    'price': price.group(1) if price else '',
                    'description': description.group(1) if description else '',
                }
            )

        await crawler.run(['http://localhost:8080/'])

        # Summary consumed by the e2e test assertions.
        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(visited),
                'crawler_type': 'HttpCrawler',
            },
        )
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from __future__ import annotations
2+
3+
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
4+
5+
from apify import Actor
6+
7+
8+
async def main() -> None:
    """Crawl the local test store with a ParselCrawler.

    Visits every reachable page, pushes product details found on
    ``/products/`` pages, and records a summary under ``CRAWLER_RESULT``.
    """
    async with Actor:
        visited: list[str] = []
        crawler = ParselCrawler()

        @crawler.router.default_handler
        async def handler(context: ParselCrawlingContext) -> None:
            visited.append(context.request.url)
            await context.enqueue_links()

            # Only product detail pages carry the fields we scrape.
            if '/products/' not in context.request.url:
                return

            sel = context.selector
            name = sel.css('h1::text').get('').strip()
            price = sel.css('span.price::text').get('').strip()
            description = sel.css('p.description::text').get('').strip()
            if not name:
                return

            await context.push_data(
                {
                    'url': context.request.url,
                    'name': name,
                    'price': price,
                    'description': description,
                }
            )

        await crawler.run(['http://localhost:8080/'])

        # Summary consumed by the e2e test assertions.
        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(visited),
                'crawler_type': 'ParselCrawler',
            },
        )
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from __future__ import annotations
2+
3+
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
4+
5+
from apify import Actor
6+
7+
8+
async def main() -> None:
    """Crawl the local test store with a PlaywrightCrawler.

    Visits every reachable page, pushes product details found on
    ``/products/`` pages, and records a summary under ``CRAWLER_RESULT``.
    """
    async with Actor:
        visited: list[str] = []
        crawler = PlaywrightCrawler()

        @crawler.router.default_handler
        async def handler(context: PlaywrightCrawlingContext) -> None:
            visited.append(context.request.url)
            await context.enqueue_links()

            # Only product detail pages carry the fields we scrape.
            if '/products/' not in context.request.url:
                return

            page = context.page
            # All three locators are queried up front, matching the page
            # structure served by the local test server.
            name = await page.locator('h1').text_content()
            price = await page.locator('span.price').text_content()
            description = await page.locator('p.description').text_content()
            if not name:
                return

            await context.push_data(
                {
                    'url': context.request.url,
                    'name': name.strip(),
                    'price': (price or '').strip(),
                    'description': (description or '').strip(),
                }
            )

        await crawler.run(['http://localhost:8080/'])

        # Summary consumed by the e2e test assertions.
        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(visited),
                'crawler_type': 'PlaywrightCrawler',
            },
        )
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import asyncio
2+
import logging
3+
from collections.abc import Awaitable, Callable, Coroutine
4+
from typing import Any
5+
6+
from uvicorn import Config
7+
from uvicorn.server import Server
8+
9+
# ASGI callable type aliases: `receive` yields incoming request events,
# `send` emits response events (dicts per the ASGI message protocol).
Receive = Callable[[], Awaitable[dict[str, Any]]]
Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]

# Static product catalog served by the test store, keyed by product id
# (matched against the last path segment of /products/<id>).
_PRODUCTS = {
    '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'},
    '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'},
    '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'},
}
17+
18+
19+
async def _send_html(send: Send, html: str, status: int = 200) -> None:
    """Emit a complete HTML response over the ASGI ``send`` channel."""
    start_event = {
        'type': 'http.response.start',
        'status': status,
        'headers': [[b'content-type', b'text/html; charset=utf-8']],
    }
    await send(start_event)
    await send({'type': 'http.response.body', 'body': html.encode()})
28+
29+
30+
async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Tiny ASGI app serving a fake e-commerce site for crawler e2e tests.

    Routes: ``/`` (home with product links), ``/products/<id>`` (detail
    pages from ``_PRODUCTS``), ``/about``; anything else is a 404.
    """
    assert scope['type'] == 'http'
    path = scope['path']

    not_found = '<html><body>Not Found</body></html>'

    if path == '/':
        home = (
            '<html><head><title>E-commerce Test Store</title></head><body>'
            '<h1>Welcome to Test Store</h1>'
            '<a href="/products/1">Widget A</a>'
            '<a href="/products/2">Widget B</a>'
            '<a href="/products/3">Widget C</a>'
            '<a href="/about">About Us</a>'
            '</body></html>'
        )
        await _send_html(send, home)
        return

    if path.startswith('/products/'):
        # The product id is the last path segment.
        product = _PRODUCTS.get(path.rsplit('/', 1)[-1])
        if product is None:
            await _send_html(send, not_found, 404)
            return
        detail = (
            f'<html><head><title>{product["name"]}</title></head><body>'
            f'<h1>{product["name"]}</h1>'
            f'<span class="price">{product["price"]}</span>'
            f'<p class="description">{product["description"]}</p>'
            f'<a href="/">Back to Home</a>'
            f'</body></html>'
        )
        await _send_html(send, detail)
        return

    if path == '/about':
        about = (
            '<html><head><title>About Us</title></head><body>'
            '<h1>About Test Store</h1>'
            '<p class="description">We sell the best widgets in the world.</p>'
            '<a href="/">Back to Home</a>'
            '</body></html>'
        )
        await _send_html(send, about)
        return

    await _send_html(send, not_found, 404)
70+
71+
72+
if __name__ == '__main__':
    # Run uvicorn without its lifespan protocol or logging noise; the crawler
    # e2e tests expect the store on port 8080.
    server_config = Config(
        app=app,
        lifespan='off',
        loop='asyncio',
        port=8080,
        log_config=None,
        log_level=logging.CRITICAL,
    )
    asyncio.run(Server(config=server_config).serve())

0 commit comments

Comments
 (0)