run_parallel_crawlers.py

import asyncio

from crawlee import ConcurrencySettings
from crawlee.crawlers import (
    ParselCrawler,
    ParselCrawlingContext,
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
)
from crawlee.sessions import SessionPool
from crawlee.storages import RequestQueue
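
# This example runs two crawlers in parallel: a lightweight Parsel crawler that
# discovers links across the site and a Playwright crawler that renders the blog
# pages handed off to it. Each crawler uses its own request queue and dataset,
# while a single session pool is shared between them.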


async def main() -> None:
    # Open request queues for both crawlers with different aliases
    playwright_rq = await RequestQueue.open(alias='playwright-requests')
    parsel_rq = await RequestQueue.open(alias='parsel-requests')

    # Use a shared session pool between both crawlers
    async with SessionPool() as session_pool:
        playwright_crawler = PlaywrightCrawler(
            # Set the request queue for Playwright crawler
            request_manager=playwright_rq,
            session_pool=session_pool,
            # Configure concurrency settings for Playwright crawler
            concurrency_settings=ConcurrencySettings(
                max_concurrency=5, desired_concurrency=5
            ),
            # Set `keep_alive` so that the crawler does not stop working when there are
            # no requests in the queue.
            keep_alive=True,
        )

        parsel_crawler = ParselCrawler(
            # Set the request queue for Parsel crawler
            request_manager=parsel_rq,
            session_pool=session_pool,
            # Configure concurrency settings for Parsel crawler
            concurrency_settings=ConcurrencySettings(
                max_concurrency=10, desired_concurrency=10
            ),
            # Set maximum requests per crawl for Parsel crawler
            max_requests_per_crawl=50,
        )

        @playwright_crawler.router.default_handler
        async def handle_playwright(context: PlaywrightCrawlingContext) -> None:
            context.log.info(f'Playwright Processing {context.request.url}...')
            title = await context.page.title()

            # Push the extracted data to the dataset for Playwright crawler
            await context.push_data(
                {'title': title, 'url': context.request.url, 'source': 'playwright'},
                dataset_name='playwright-data',
            )

        @parsel_crawler.router.default_handler
        async def handle_parsel(context: ParselCrawlingContext) -> None:
            context.log.info(f'Parsel Processing {context.request.url}...')
            title = context.parsed_content.css('title::text').get()

            # Push the extracted data to the dataset for Parsel crawler
            await context.push_data(
                {'title': title, 'url': context.request.url, 'source': 'parsel'},
                dataset_name='parsel-data',
            )

            # Enqueue links to the Playwright request queue for blog pages
            await context.enqueue_links(
                selector='a[href*="/blog/"]', rq_alias='playwright-requests'
            )

            # Enqueue other links to the Parsel request queue
            await context.enqueue_links(selector='a:not([href*="/blog/"])')
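
        # Because the Playwright crawler was created with `keep_alive=True`, it keeps
        # running even while its queue is empty, so it can pick up the blog links that
        # the Parsel handler enqueues as both crawlers run concurrently.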

        # Start the Playwright crawler in the background
        background_crawler_task = asyncio.create_task(playwright_crawler.run([]))

        # Run the Parsel crawler with the initial URL and wait for it to finish
        await parsel_crawler.run(['https://crawlee.dev/blog'])

        # Wait for the Playwright crawler to finish processing all requests
        while not await playwright_rq.is_empty():
            playwright_crawler.log.info('Waiting for Playwright crawler to finish...')
            await asyncio.sleep(5)

        # Stop the Playwright crawler after all requests are processed
        playwright_crawler.stop()

        # Wait for the background Playwright crawler task to complete
        await background_crawler_task


if __name__ == '__main__':
    asyncio.run(main())