-
Notifications
You must be signed in to change notification settings - Fork 24
Expand file tree
/
Copy path04_selenium.py
More file actions
129 lines (100 loc) · 4.29 KB
/
Copy path04_selenium.py
File metadata and controls
129 lines (100 loc) · 4.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import asyncio
from typing import Any
from urllib.parse import urljoin, urlsplit
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
from apify import Actor, Request
from apify.storages import RequestQueue
# To run locally, install the Selenium Chromedriver:
# https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/
# On the Apify platform, it's already in the Actor's Docker image.
def build_chrome_driver() -> webdriver.Chrome:
    """Build the Chrome WebDriver used for the crawl.

    ``--headless=new`` is passed only when the Actor configuration enables
    headless mode; the sandbox, ``/dev/shm`` and GPU switches are always passed.

    Returns:
        A started ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.
    """
    options = ChromeOptions()

    # Collect the command-line switches first, then apply them in one pass.
    flags = ['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
    if Actor.configuration.headless:
        flags.insert(0, '--headless=new')

    for flag in flags:
        options.add_argument(flag)

    return webdriver.Chrome(options=options)
def scrape_page(driver: webdriver.Chrome, url: str) -> tuple[dict[str, Any], list[str]]:
    """Load ``url`` in the browser and extract its headings and same-host links.

    All WebDriver calls made here are blocking.

    Args:
        driver: WebDriver used to load and query the page.
        url: URL of the page to scrape; also the base for resolving links
            and the reference host for the same-host filter.

    Returns:
        A ``(data, links)`` pair. ``data`` holds the requested URL, the page
        title and the text of every ``h1``/``h2``/``h3`` element. ``links``
        holds the http(s) URLs of anchors whose host equals the host of
        ``url``, in document order.
    """
    driver.get(url)

    def heading_texts(tag: str) -> list[str]:
        """Return the text of every element with the given tag name."""
        return [el.text for el in driver.find_elements(By.TAG_NAME, tag)]

    data: dict[str, Any] = {
        'url': url,
        'title': driver.title,
        'h1s': heading_texts('h1'),
        'h2s': heading_texts('h2'),
        'h3s': heading_texts('h3'),
    }

    # Keep only absolute links on the same host.
    links: list[str] = []
    host = urlsplit(url).netloc
    for link in driver.find_elements(By.TAG_NAME, 'a'):
        href = link.get_attribute('href')
        # Anchors without an href yield None, and urljoin(url, None) returns
        # the base URL, which would report the page as linking to itself.
        if not href:
            continue
        link_url = urljoin(url, href)
        if not link_url.startswith(('http://', 'https://')):
            continue
        if urlsplit(link_url).netloc == host:
            links.append(link_url)

    return data, links
async def enqueue_links(
    request_queue: RequestQueue,
    links: list[str],
    *,
    depth: int,
    max_depth: int,
) -> None:
    """Add ``links`` to the queue at crawl depth ``depth + 1``.

    Nothing is enqueued once ``depth`` has reached ``max_depth``.

    Args:
        request_queue: Queue that receives the new requests.
        links: URLs to enqueue, one request per entry.
        depth: Crawl depth of the page the links were found on.
        max_depth: Depth at or beyond which links are no longer followed.
    """
    if depth < max_depth:
        child_depth = depth + 1
        for link_url in links:
            Actor.log.info(f'Enqueuing {link_url} ...')
            child = Request.from_url(link_url)
            child.crawl_depth = child_depth
            await request_queue.add_request(child)
async def main() -> None:
    """Run the Actor: crawl from the start URLs with Selenium and store page data.

    Reads ``startUrls`` and ``maxDepth`` from the Actor input, enqueues the
    start URLs, then handles at most ``max_requests`` requests from the queue.
    For each request the page data is pushed to the default dataset and the
    same-host links are enqueued one level deeper. A failure while scraping a
    page is logged and the request is still marked as handled. The WebDriver
    is always shut down, even if the crawl loop is interrupted by an error.
    """
    async with Actor:
        # Read the Actor input.
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}])
        max_depth = actor_input.get('maxDepth', 1)

        if not start_urls:
            Actor.log.info('No start URLs specified in Actor input, exiting...')
            await Actor.exit()

        # Open the request queue and enqueue the start URLs (crawl depth 0).
        request_queue = await Actor.open_request_queue()
        for start_url in start_urls:
            url = start_url.get('url')
            Actor.log.info(f'Enqueuing start URL: {url}')
            await request_queue.add_request(Request.from_url(url))

        # Cap the crawl. Raise or remove the limit to follow more pages.
        max_requests = 10
        handled_requests = 0

        Actor.log.info('Launching Chrome WebDriver...')
        driver = build_chrome_driver()

        try:
            # The cap is checked first so no request is fetched and then left unhandled.
            while handled_requests < max_requests and (
                request := await request_queue.fetch_next_request()
            ):
                handled_requests += 1
                url = request.url
                depth = request.crawl_depth
                Actor.log.info(f'Scraping {url} (depth={depth}) ...')

                try:
                    # Blocking WebDriver calls run in a worker thread.
                    data, links = await asyncio.to_thread(scrape_page, driver, url)
                    await Actor.push_data(data)
                    Actor.log.info(
                        f'Stored data from {url} '
                        f'(title={data["title"]!r}, {len(links)} links found).'
                    )
                    await enqueue_links(
                        request_queue, links, depth=depth, max_depth=max_depth
                    )
                except Exception:
                    Actor.log.exception(f'Cannot extract data from {url}.')
                finally:
                    # Mark the request handled whether or not scraping succeeded.
                    await request_queue.mark_request_as_handled(request)
        finally:
            # Queue operations outside the inner try can still raise; quit here
            # so the browser and driver processes are never left running.
            driver.quit()
if __name__ == '__main__':
    # Executed as a script: run the Actor coroutine to completion.
    asyncio.run(main())