forked from apify/crawlee-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathselectolax_adaptive_run.py
More file actions
38 lines (28 loc) · 1.14 KB
/
selectolax_adaptive_run.py
File metadata and controls
38 lines (28 loc) · 1.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import asyncio
from crawlee.crawlers import (
AdaptivePlaywrightCrawler,
AdaptivePlaywrightCrawlerStatisticState,
AdaptivePlaywrightCrawlingContext,
)
from crawlee.statistics import Statistics
from .selectolax_parser import SelectolaxLexborParser
async def main() -> None:
    """Run an adaptive Playwright crawl of crawlee.dev using a Selectolax static parser.

    Creates an ``AdaptivePlaywrightCrawler`` limited to 10 requests per crawl,
    registers a default request handler on its router, and starts the crawl
    from ``https://crawlee.dev/``.
    """
    # Custom Selectolax-based parser used for static content parsing.
    static_parser = SelectolaxLexborParser()
    # Statistics configured with the adaptive crawler's own state model.
    statistics = Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)

    crawler: AdaptivePlaywrightCrawler = AdaptivePlaywrightCrawler(
        max_requests_per_crawl=10,
        static_parser=static_parser,
        statistics=statistics,
    )

    @crawler.router.default_handler
    async def handle_request(context: AdaptivePlaywrightCrawlingContext) -> None:
        """Log the request, store its URL and title lookup, then enqueue links."""
        context.log.info(f'Processing {context.request.url} ...')

        page_url = context.request.url
        # NOTE(review): the stored value is whatever the configured parser
        # returns for the 'title' selector; confirm it serializes as intended.
        title = await context.query_selector_one('title')
        await context.push_data({'url': page_url, 'title': title})

        await context.enqueue_links()

    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)
if __name__ == '__main__':
    # Only start the crawl when executed as a script, not when imported.
    asyncio.run(main())