Skip to content

Commit cf6c7dd

Browse files
committed
feat(youtube): add youtube channel crawl script
1 parent fe0f962 commit cf6c7dd

2 files changed

Lines changed: 124 additions & 0 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ dev = [
7777

7878
scripts = [
7979
"bs4",
80+
"selenium",
8081
]
8182

8283
[project.scripts]
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
#!/usr/bin/python3
2+
3+
"""
4+
Outputs URLs of ArduPilot YouTube videos from the ArduPilot YouTube channel.
5+
6+
SPDX-FileCopyrightText: 2024-2025 Amilcar do Carmo Lucas <amilcar.lucas@iav.de>
7+
8+
SPDX-License-Identifier: GPL-3.0-or-later
9+
"""
10+
11+
import logging
12+
import time
13+
from os import environ as os_environ
14+
from typing import Union
15+
16+
from requests.auth import HTTPProxyAuth
17+
from selenium import webdriver
18+
from selenium.webdriver.common.by import By
19+
from selenium.webdriver.firefox.options import Options
20+
from selenium.webdriver.support import expected_conditions as ec
21+
from selenium.webdriver.support.ui import WebDriverWait
22+
23+
# URL of the videos tab of the ArduPilot YouTube channel that main() loads
URL = "https://www.youtube.com/@ardupilot19/videos"
# Credentials handed to HTTPProxyAuth in main() when proxy environment variables are set
USERNAME = "your_username"  # Replace with actual username if needed
PASSWORD = ""  # Replace with actual password if needed
27+
28+
29+
# pylint: disable=duplicate-code
30+
def get_env_proxies() -> Union[dict[str, str], None]:
    """
    Collect proxy settings from the OS environment variables.

    For each of "http", "https" and "no_proxy" the upper-case variable
    (e.g. HTTP_PROXY) is preferred and the lower-case one (e.g. http_proxy)
    is used as a fallback. Variables that are unset or set to an empty
    string are left out of the result.

    Returns:
        A dict mapping "http" / "https" / "no_proxy" to the configured value,
        or None when none of the variables holds a value.

    """
    candidates = {
        "http": os_environ.get("HTTP_PROXY") or os_environ.get("http_proxy"),
        "https": os_environ.get("HTTPS_PROXY") or os_environ.get("https_proxy"),
        "no_proxy": os_environ.get("NO_PROXY") or os_environ.get("no_proxy"),
    }
    # Keep only variables that actually hold a value. An empty string
    # (e.g. `export http_proxy=`) must not count as a configured proxy,
    # otherwise callers testing `if proxies:` would take the proxy path.
    proxies_dict: dict[str, str] = {name: value for name, value in candidates.items() if value}
    # Report "no proxies" as None rather than as an empty dict
    proxies = proxies_dict or None
    if proxies:
        logging.info("Proxies: %s", proxies)
    else:
        logging.debug("Proxies: %s", proxies)
    return proxies
45+
46+
47+
# pylint: enable=duplicate-code
48+
49+
50+
def _video_id(href: str) -> str:
    """Return the video ID that follows `watch?v=` in href, without further parameters or fragment."""
    _, _, tail = href.partition("watch?v=")
    return tail.split("&", 1)[0].split("#", 1)[0]


def _collect_video_links(driver: "webdriver.Firefox", selectors: list[str], limit: int) -> list[str]:
    """
    Gather up to `limit` distinct watch URLs from the loaded page.

    The CSS selectors are tried in the given order and the first href seen
    for each video is kept. Videos are told apart by their ID rather than by
    the full href, so a link to the same video that only differs in extra
    query parameters is not counted twice.
    """
    video_links: list[str] = []
    seen_ids: set[str] = set()
    for selector in selectors:
        logging.info("Trying selector: %s", selector)
        elements = driver.find_elements(By.CSS_SELECTOR, selector)
        logging.info("Found %d elements with selector %s", len(elements), selector)

        for element in elements:
            href = element.get_attribute("href")
            if not href or "watch?v=" not in href:
                continue
            video_id = _video_id(href)
            if video_id in seen_ids:
                continue
            seen_ids.add(video_id)
            video_links.append(href)
            logging.info("Found video: %s", href)
            if len(video_links) >= limit:
                # Limit reached: no need to look at further elements or selectors
                return video_links
    return video_links


def main() -> None:
    """
    Load the channel page in headless Firefox and print the video URLs found.

    The page at URL is opened, the cookie consent dialog is accepted on a
    best-effort basis, the page is scrolled a few times so that more videos
    get loaded, and the collected watch URLs are printed to stdout, one per
    line. Errors while crawling are logged, not raised; the browser is always
    closed.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

    proxies = get_env_proxies()
    if proxies:
        # NOTE(review): the auth object is created and immediately discarded,
        # and `proxies` is not handed to the browser below, so neither seems
        # to have any effect on the Selenium session - confirm the intent.
        HTTPProxyAuth(USERNAME, PASSWORD)

    # Setup Firefox in headless mode
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)

    # gurubase has a limit of 100 videos; stop collecting at 80
    max_videos = 80

    try:
        logging.info("Loading page: %s", URL)
        driver.get(URL)

        # Handle cookie consent dialog. This is best effort: when the button
        # does not show up within 10 seconds (or the click fails) the crawl
        # continues anyway.
        try:
            logging.info("Looking for cookie consent button")
            wait = WebDriverWait(driver, 10)
            cookie_button = wait.until(ec.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Accept all']")))
            cookie_button.click()
            logging.info("Clicked cookie consent button")
        except Exception as e:  # pylint: disable=broad-exception-caught
            logging.warning("Cookie consent handling failed: %s", e)

        # Wait for content to load initially
        time.sleep(5)

        # Scroll to the bottom several times so that more content gets loaded,
        # pausing after each scroll
        for i in range(5):
            logging.info("Scrolling iteration %d", i + 1)
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            time.sleep(3)

        # Try different selectors for video links
        selectors = [
            "a#video-title",  # Try this selector first
            "a[href*='/watch?v=']",  # More generic selector
            "h3.ytd-grid-video-renderer a",  # Another possible selector
        ]

        video_links = _collect_video_links(driver, selectors, max_videos)

        logging.info("Total unique videos found: %d", len(video_links))

        # Print the video URLs
        for video in video_links:
            print(video)  # noqa: T201

    except Exception as e:  # pylint: disable=broad-exception-caught
        logging.exception("An error occurred: %s", e)

    finally:
        # Always shut the browser down, even when the crawl failed
        driver.quit()
120+
121+
122+
if __name__ == "__main__":
    main()  # Run the crawl only when executed as a script, not when imported

0 commit comments

Comments
 (0)