Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ dev = [

scripts = [
"bs4",
"selenium",
]

[project.scripts]
Expand Down
123 changes: 123 additions & 0 deletions scripts/crawl_ardupilot_youtube_channel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#!/usr/bin/python3

Check failure on line 1 in scripts/crawl_ardupilot_youtube_channel.py

View workflow job for this annotation

GitHub Actions / ruff (py39)

Ruff (EXE001)

scripts/crawl_ardupilot_youtube_channel.py:1:1: EXE001 Shebang is present but file is not executable

"""
Outputs URLs of ArduPilot YouTube videos from the ArduPilot YouTube channel.

SPDX-FileCopyrightText: 2024-2025 Amilcar do Carmo Lucas <amilcar.lucas@iav.de>

SPDX-License-Identifier: GPL-3.0-or-later
"""

import logging
import time
from os import environ as os_environ
from typing import Union

from requests.auth import HTTPProxyAuth
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

# YouTube channel page that lists the ArduPilot videos to crawl
URL = "https://www.youtube.com/@ardupilot19/videos"
# Placeholder proxy credentials; replace them with real values if needed
USERNAME = "your_username"
PASSWORD = ""


# pylint: disable=duplicate-code
def get_env_proxies() -> Union[dict[str, str], None]:
    """
    Collect the proxy settings from the OS environment variables.

    For each setting the upper-case variable is consulted first and the
    lower-case variant is used as a fallback. Variables that are unset or
    set to an empty string are ignored.

    Returns:
        A dictionary with any of the keys "http", "https" and "no_proxy"
        mapped to the configured values, or None when nothing is configured.

    """
    env_names = {
        "http": ("HTTP_PROXY", "http_proxy"),
        "https": ("HTTPS_PROXY", "https_proxy"),
        "no_proxy": ("NO_PROXY", "no_proxy"),
    }
    proxies_dict: dict[str, str] = {}
    for key, (upper_name, lower_name) in env_names.items():
        value = os_environ.get(upper_name) or os_environ.get(lower_name)
        # An empty string means "not configured"; keeping it would make the
        # result truthy and callers would believe a proxy is in use.
        if value:
            proxies_dict[key] = value
    # define as None if no proxies are defined in the OS environment variables
    proxies = proxies_dict or None
    if proxies:
        logging.info("Proxies: %s", proxies)
    else:
        logging.debug("Proxies: %s", proxies)
    return proxies


# pylint: enable=duplicate-code


def main() -> None:
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

proxies = get_env_proxies()
if proxies:
HTTPProxyAuth(USERNAME, PASSWORD)
Copy link

Copilot AI Apr 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The instantiation of HTTPProxyAuth is not assigned to any variable nor passed to the WebDriver. Ensure that proxy authentication is properly configured for the driver if required.

Copilot uses AI. Check for mistakes.

# Setup Firefox in headless mode
options = Options()
options.add_argument("--headless")
Comment on lines +54 to +59
Copy link

Copilot AI Apr 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The call to HTTPProxyAuth here is not assigned to any variable or integrated with the webdriver or session, which may make proxy authentication ineffective. Consider reviewing and properly integrating proxy authentication if proxies are in use.

Suggested change
if proxies:
HTTPProxyAuth(USERNAME, PASSWORD)
# Setup Firefox in headless mode
options = Options()
options.add_argument("--headless")
auth = None
if proxies:
auth = HTTPProxyAuth(USERNAME, PASSWORD)
from selenium.webdriver.common.proxy import Proxy, ProxyType
proxy = Proxy()
proxy.http_proxy = proxies.get("http")
proxy.ssl_proxy = proxies.get("https")
proxy.proxy_type = ProxyType.MANUAL
# Setup Firefox in headless mode
options = Options()
options.add_argument("--headless")
if proxies:
options.proxy = proxy

Copilot uses AI. Check for mistakes.
driver = webdriver.Firefox(options=options)

try:
logging.info("Loading page: %s", URL)
driver.get(URL)

# Handle cookie consent dialog
try:
logging.info("Looking for cookie consent button")
wait = WebDriverWait(driver, 10)
cookie_button = wait.until(ec.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Accept all']")))
cookie_button.click()
logging.info("Clicked cookie consent button")
except Exception as e: # pylint: disable=broad-exception-caught
logging.warning("Cookie consent handling failed: %s", e)

# Wait for content to load initially
time.sleep(5) # Increased initial wait

# Scroll more times to ensure content loads
for i in range(5): # Increased scroll iterations
logging.info("Scrolling iteration %d", i + 1)
driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
time.sleep(3) # Increased scroll wait

# Try different selectors for video links
selectors = [
"a#video-title", # Try this selector first
"a[href*='/watch?v=']", # More generic selector
"h3.ytd-grid-video-renderer a", # Another possible selector
]

video_links = []
for selector in selectors:
logging.info("Trying selector: %s", selector)
elements = driver.find_elements(By.CSS_SELECTOR, selector)
logging.info("Found %d elements with selector %s", len(elements), selector)

for element in elements:
href = element.get_attribute("href")
if href and "watch?v=" in href and href not in video_links:
video_links.append(href)
logging.info("Found video: %s", href)
if len(video_links) >= 80: # gurubase has a limit of 100 videos
break

if len(video_links) >= 80: # gurubase has a limit of 100 videos
Comment on lines +103 to +106
Copy link

Copilot AI Apr 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The condition restricts video_links to 80, but the comment indicates a limit of 100. Please update the condition or comment to align the intended logic.

Suggested change
if len(video_links) >= 80: # gurubase has a limit of 100 videos
break
if len(video_links) >= 80: # gurubase has a limit of 100 videos
if len(video_links) >= 100: # gurubase has a limit of 100 videos
break
if len(video_links) >= 100: # gurubase has a limit of 100 videos

Copilot uses AI. Check for mistakes.
break

logging.info("Total unique videos found: %d", len(video_links))

# Print the video URLs
for video in video_links:
print(video) # noqa: T201

except Exception as e: # pylint: disable=broad-exception-caught
logging.exception("An error occurred: %s", str(e))

finally:
driver.quit()


if __name__ == "__main__":
    # Run the crawler only when executed as a script, not when imported
    main()
Loading