-
Notifications
You must be signed in to change notification settings - Fork 61
Added scripts that feed the gurubase.io RAG AI tool #307
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,239 @@ | ||
| #!/usr/bin/python3 | ||
|
|
||
| """ | ||
| Outputs URLs of ArduPilot documentation pages. | ||
|
|
||
| SPDX-FileCopyrightText: 2024-2025 Amilcar do Carmo Lucas <amilcar.lucas@iav.de> | ||
|
|
||
| SPDX-License-Identifier: GPL-3.0-or-later | ||
| """ | ||
|
|
||
| import logging | ||
| import time | ||
| from os import environ as os_environ | ||
| from typing import Union | ||
| from urllib.parse import urljoin, urlparse | ||
|
|
||
| import requests | ||
| from bs4 import BeautifulSoup, Tag | ||
|
|
||
| # Define the URL where to start crawling | ||
| URL = "https://ardupilot.org/ardupilot/" | ||
| USERNAME = "your_username" # Replace with actual username if needed | ||
| PASSWORD = "" # Replace with actual password if needed | ||
|
|
||
| # Crawls and outputs only URLs that start with: | ||
| ALLOWED_DOMAINS = ( | ||
| "ardupilot.org/ardupilot/", | ||
| "ardupilot.org/copter/", | ||
| "ardupilot.org/plane/", | ||
| "ardupilot.org/rover/", | ||
| "www.ardusub.com/", | ||
| "ardupilot.org/blimp/", | ||
| "ardupilot.org/antennatracker/", | ||
| "ardupilot.org/planner/", | ||
| "ardupilot.org/mavproxy/", | ||
| "ardupilot.org/dev/", | ||
| "ardupilot.github.io/MethodicConfigurator/", | ||
| "mavlink.io/en/", | ||
| "docs.cubepilot.org/", | ||
| "doc.cuav.net/", | ||
| "docs.holybro.com/", | ||
| ) | ||
|
|
||
| URL_BLACKLIST = [ | ||
| "https://mavlink.io/en/messages/ASLUAV.html", | ||
| "https://mavlink.io/en/messages/AVSSUAS.html", | ||
| "https://mavlink.io/en/messages/all.html", | ||
| "https://mavlink.io/en/messages/csAirLink.html", | ||
| "https://mavlink.io/en/messages/dialects.html", | ||
| "https://mavlink.io/en/messages/icarous.html", | ||
| "https://mavlink.io/en/messages/matrixpilot.html", | ||
| "https://mavlink.io/en/messages/paparazzi.html", | ||
| "https://mavlink.io/en/messages/python_array_test.html", | ||
| "https://mavlink.io/en/messages/test.html", | ||
| "https://mavlink.io/en/messages/uAvionix.html", | ||
| "https://mavlink.io/en/messages/ualberta.html", | ||
| ] | ||
|
|
||
| URL_BLACKLIST_PREFIXES = ["https://docs.cubepilot.org/user-guides/~/changes/", "zh-hans", "doc.cuav.net/tutorial"] | ||
|
|
||
|
|
||
| # pylint: disable=duplicate-code | ||
| def get_env_proxies() -> Union[dict[str, str], None]: | ||
| proxies_env = { | ||
| "http": os_environ.get("HTTP_PROXY") or os_environ.get("http_proxy"), | ||
| "https": os_environ.get("HTTPS_PROXY") or os_environ.get("https_proxy"), | ||
| "no_proxy": os_environ.get("NO_PROXY") or os_environ.get("no_proxy"), | ||
| } | ||
| # Remove None values | ||
| proxies_dict: dict[str, str] = {k: v for k, v in proxies_env.items() if v is not None} | ||
| # define as None if no proxies are defined in the OS environment variables | ||
| proxies = proxies_dict if proxies_dict else None | ||
| if proxies: | ||
| logging.info("Proxies: %s", proxies) | ||
| else: | ||
| logging.debug("Proxies: %s", proxies) | ||
| return proxies | ||
|
|
||
|
|
||
| # pylint: enable=duplicate-code | ||
|
|
||
|
|
||
| def remove_duplicates(visited_urls: set[str]) -> set[str]: | ||
| # if the URL is https:// and the same URL exists as http://, remove the https:// URL from the list | ||
| urls_to_remove = set() | ||
| for url in visited_urls: | ||
| if url.startswith("https://") and f"http://{url[8:]}" in visited_urls: | ||
| urls_to_remove.add(url) | ||
| visited_urls -= urls_to_remove | ||
|
|
||
| # if visited URLs end in "/index.html", and the same URL without "/index.html" or without "index.html" is in the list, | ||
| # remove the one with "/index.html" from the list | ||
| urls_to_remove = set() | ||
| for url in visited_urls: | ||
| if url.endswith("/index.html") and (url[:-11] in visited_urls or url[:-10] in visited_urls): | ||
| urls_to_remove.add(url) | ||
| visited_urls -= urls_to_remove | ||
|
|
||
| # if visited URLs end in "/", and the same URL without "/" is in the list, remove the one with "/" from the list | ||
| urls_to_remove = set() | ||
| for url in visited_urls: | ||
| if url.endswith("/") and url[:-1] in visited_urls: | ||
| urls_to_remove.add(url) | ||
| visited_urls -= urls_to_remove | ||
|
|
||
| # if visited URLs end in "common-*.html", and the file URL exists in with base URL http://ardupilot.org/copter/docs/, | ||
| # remove it from the list | ||
| urls_to_remove = set() | ||
| for url in visited_urls: | ||
| if "/common-" in url and url.endswith(".html"): | ||
| filename = url.split("/")[-1] | ||
| copter_url = f"http://ardupilot.org/copter/docs/{filename}" | ||
| if copter_url in visited_urls and url != copter_url: | ||
| urls_to_remove.add(url) | ||
| visited_urls -= urls_to_remove | ||
|
|
||
| return visited_urls | ||
|
|
||
|
|
||
| def find_all_links(soup: BeautifulSoup, current_url: str, visited_urls: set[str], urls_to_visit: set[str]) -> set[str]: | ||
| for link in soup.find_all("a", href=True): | ||
| if not isinstance(link, Tag): | ||
| continue | ||
|
|
||
| href = link.attrs.get("href") | ||
|
|
||
| if not isinstance(href, str): | ||
| continue | ||
|
|
||
| full_url = urljoin(current_url, href) | ||
|
|
||
| if not isinstance(full_url, str): | ||
| continue | ||
|
|
||
| # Remove anchor from URL | ||
| parsed_url = urlparse(full_url) | ||
|
|
||
| clean_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}" | ||
|
|
||
| # Skip image files | ||
| if parsed_url.path.lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".svg", ".bmp", "_images")): | ||
| continue | ||
|
|
||
| # Check if URL matches allowed domains | ||
| if ( | ||
| any(domain in clean_url for domain in ALLOWED_DOMAINS) | ||
| and clean_url not in visited_urls | ||
| and clean_url not in urls_to_visit | ||
| and all(domain not in clean_url for domain in URL_BLACKLIST_PREFIXES) | ||
| ): | ||
| urls_to_visit.add(clean_url) | ||
|
|
||
| return urls_to_visit | ||
|
|
||
|
|
||
| def main() -> None: | ||
| logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") | ||
| start_time = time.time() | ||
|
|
||
| visited_urls = set() | ||
| broken_urls = set() | ||
| urls_to_visit = {URL} | ||
| proxies = get_env_proxies() | ||
|
|
||
| # Set up authentication if credentials provided | ||
| auth = (USERNAME, PASSWORD) if USERNAME and PASSWORD else None | ||
|
|
||
| session = requests.Session() | ||
| if auth: | ||
| session.auth = auth | ||
| if proxies: | ||
| session.proxies = proxies | ||
|
|
||
| while urls_to_visit: | ||
| current_url = urls_to_visit.pop() | ||
|
|
||
| if current_url in visited_urls or current_url in broken_urls: | ||
| continue | ||
|
|
||
| try: | ||
| logging.info("Crawling: %s", current_url) | ||
| response = session.get(current_url, timeout=30) | ||
| response.raise_for_status() | ||
|
|
||
| visited_urls.add(current_url) | ||
|
|
||
| # Parse HTML | ||
| soup = BeautifulSoup(response.text, "html.parser") | ||
|
|
||
| urls_to_visit = find_all_links(soup, current_url, visited_urls, urls_to_visit) | ||
|
|
||
| except (requests.RequestException, requests.Timeout) as e: | ||
| logging.error("Network error crawling %s: %s", current_url, str(e)) | ||
| broken_urls.add(current_url) | ||
| if current_url in visited_urls: | ||
| visited_urls.remove(current_url) | ||
| except (KeyError, ValueError) as e: | ||
| logging.error("URL processing error for %s: %s", current_url, str(e)) | ||
| broken_urls.add(current_url) | ||
| if current_url in visited_urls: | ||
| visited_urls.remove(current_url) | ||
| output_urls(visited_urls, broken_urls, start_time) | ||
|
|
||
|
|
||
| def output_urls(visited_urls: set[str], broken_urls: set[str], start_time: float) -> None: | ||
| # Write all html URLs to file | ||
| raw_pages = len(visited_urls) | ||
| with open("gurubase.io_url_list_raw.txt", "w", encoding="utf-8") as f: | ||
| for url in sorted(visited_urls): | ||
| f.write(f"{url}\n") # Output to file | ||
|
|
||
| visited_urls -= set(URL_BLACKLIST) | ||
| dedup_urls = remove_duplicates(visited_urls) | ||
|
|
||
| # Write de-duplicated URLs to file and terminal | ||
| with open("gurubase.io_url_list.txt", "w", encoding="utf-8") as f: | ||
| for url in sorted(dedup_urls): | ||
| print(url) # Output to terminal # noqa: T201 | ||
| f.write(f"{url}\n") # Output to file | ||
|
|
||
| # Write broken URLs to file | ||
| with open("gurubase.io_broken_urls_list.txt", "w", encoding="utf-8") as f: | ||
| for url in sorted(broken_urls): | ||
| f.write(f"{url}\n") | ||
|
|
||
| duration_mins = (time.time() - start_time) / 60 | ||
| pages_per_min = raw_pages / duration_mins | ||
|
|
||
| logging.info("\nCrawling Statistics:") | ||
| msg = f"{raw_pages} pages crawled in {duration_mins:.2f} minutes ({pages_per_min:.2f} pages/min)" | ||
| logging.info(msg) | ||
| msg = f"De-duplicated pages: {len(dedup_urls)}" | ||
| logging.info(msg) | ||
| msg = f"Broken URLs found: {len(broken_urls)}" | ||
| logging.info(msg) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| main() # Call the main function | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,14 @@ | ||
| #!/bin/bash | ||
|
|
||
| # Outputs URLs of ArduPilot README.md pages on GitHub | ||
|
|
||
| # This script is used to crawl the ArduPilot GitHub repository for README.md files | ||
| # and output the URLs to a file called github_urllist.txt | ||
|
|
||
| # SPDX-FileCopyrightText: 2024-2025 Amilcar do Carmo Lucas <amilcar.lucas@iav.de> | ||
|
|
||
| # SPDX-License-Identifier: GPL-3.0-or-later | ||
|
|
||
| cd ../ardupilot | ||
| find . -type f -name "*.md" | sed 's|^\.|https://github.com/ArduPilot/ardupilot/blob/master|' > ../ardupilot_methodic_configurator/github_urllist.txt | ||
| cd - |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.