-
Notifications
You must be signed in to change notification settings - Fork 23
Expand file tree
/
Copy path03_playwright.py
More file actions
97 lines (75 loc) · 3.94 KB
/
03_playwright.py
File metadata and controls
97 lines (75 loc) · 3.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import asyncio
from urllib.parse import urljoin
from playwright.async_api import async_playwright
from apify import Actor, Request
# Note: To run this Actor locally, ensure that Playwright browsers are installed.
# Run `playwright install --with-deps` in the Actor's virtual environment to install them.
# When running on the Apify platform, these dependencies are already included
# in the Actor's Docker image.
async def main() -> None:
# Enter the context of the Actor.
async with Actor:
# Retrieve the Actor input, and use default values if not provided.
actor_input = await Actor.get_input() or {}
start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
max_depth = actor_input.get('max_depth', 1)
# Exit if no start URLs are provided.
if not start_urls:
Actor.log.info('No start URLs specified in actor input, exiting...')
await Actor.exit()
# Open the default request queue for handling URLs to be processed.
request_queue = await Actor.open_request_queue()
# Enqueue the start URLs with an initial crawl depth of 0.
for start_url in start_urls:
url = start_url.get('url')
Actor.log.info(f'Enqueuing {url} ...')
new_request = Request.from_url(url, user_data={'depth': 0})
await request_queue.add_request(new_request)
Actor.log.info('Launching Playwright...')
# Launch Playwright and open a new browser context.
async with async_playwright() as playwright:
# Configure the browser to launch in headless mode as per Actor configuration.
browser = await playwright.chromium.launch(
headless=Actor.configuration.headless,
args=['--disable-gpu'],
)
context = await browser.new_context()
# Process the URLs from the request queue.
while request := await request_queue.fetch_next_request():
url = request.url
if not isinstance(request.user_data['depth'], (str, int)):
raise TypeError('Request.depth is an enexpected type.')
depth = int(request.user_data['depth'])
Actor.log.info(f'Scraping {url} (depth={depth}) ...')
try:
# Open a new page in the browser context and navigate to the URL.
page = await context.new_page()
await page.goto(url)
# If the current depth is less than max_depth, find nested links
# and enqueue them.
if depth < max_depth:
for link in await page.locator('a').all():
link_href = await link.get_attribute('href')
link_url = urljoin(url, link_href)
if link_url.startswith(('http://', 'https://')):
Actor.log.info(f'Enqueuing {link_url} ...')
new_request = Request.from_url(
link_url,
user_data={'depth': depth + 1},
)
await request_queue.add_request(new_request)
# Extract the desired data.
data = {
'url': url,
'title': await page.title(),
}
# Store the extracted data to the default dataset.
await Actor.push_data(data)
except Exception:
Actor.log.exception(f'Cannot extract data from {url}.')
finally:
await page.close()
# Mark the request as handled to ensure it is not processed again.
await request_queue.mark_request_as_handled(request)
if __name__ == '__main__':
asyncio.run(main())