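"""Generate sitemap.xml for a GitHub Pages site.

Fetches the user's public repositories from the GitHub API, crawls each
repo's GitHub Pages URL in parallel, collects in-site page and asset URLs,
and writes them to sitemap.xml in the current directory.

Usage:
    pip install requests beautifulsoup4
    python sitemap.py
"""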

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed

USERNAME = "ptprashanttripathi"
BASE_URL = f"https://{USERNAME}.github.io/"
GITHUB_API_URL = f"https://api.github.com/users/{USERNAME}/repos"
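
# Shared across crawler threads. In CPython, set.add()/set.update() are
# atomic under the GIL; a race between the membership check in crawl_repo()
# and the add() can at worst fetch a page twice, which is harmless here.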
visited = set()
all_urls = set()

MAX_WORKERS = 20
TIMEOUT = 10

# ✅ Allowed extensions
ALLOWED_EXTENSIONS = (
    ".html",
    ".htm",
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".svg",
    ".webp",
    ".ico",
    ".mp4",
    ".webm",
    ".mov",
    ".pdf",
)
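

# str.endswith() accepts a tuple, so is_allowed() below can test every
# extension in a single call.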
def is_allowed(path):
    """
    Check whether a URL path should be included: directory roots and
    subfolders, extensionless paths, and allowed file extensions all pass.
    """
    # The domain root and subfolder indexes are always allowed
    if path == "/" or path.endswith("/"):
        return True
    # Extensionless paths (e.g. "/repo/about") are allowed
    if "." not in path.split("/")[-1]:
        return True
    # Otherwise, include only files with an allowed extension
    return path.endswith(ALLOWED_EXTENSIONS)
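

# For example:
#   is_allowed("/")               -> True   (domain root)
#   is_allowed("/repo/")          -> True   (subfolder)
#   is_allowed("/repo/about")     -> True   (no extension)
#   is_allowed("/repo/pic.png")   -> True   (allowed extension)
#   is_allowed("/repo/app.js")    -> False  (extension not allowed)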


def fetch_repos():
    """Fetch all repository names for the user."""
    try:
        # The API's default page size is 30 repos; request up to 100.
        # (Users with more than 100 repos would need full pagination.)
        response = requests.get(
            GITHUB_API_URL, params={"per_page": 100}, timeout=TIMEOUT
        )
        response.raise_for_status()
        repos = response.json()
        return [repo["name"] for repo in repos]
    except Exception as e:
        print(f"Error fetching repos: {e}")
        return []
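

# Note: unauthenticated GitHub API requests are rate-limited (60 per hour);
# for larger or more frequent crawls, an Authorization token header could
# be added to the request above.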


def fetch_page_urls(url):
    """Fetch a page and return the set of in-site URLs it links to."""
    urls_set = set()
    try:
        response = requests.get(url, timeout=TIMEOUT)
        if response.status_code != 200:
            return urls_set
        soup = BeautifulSoup(response.content, "html.parser")
        # Collect <a href> links plus <img>/<video> sources and the
        # <source> children of <video>/<audio> elements
        candidates = [tag["href"] for tag in soup.find_all("a", href=True)]
        candidates += [
            tag["src"]
            for tag in soup.find_all(["img", "video", "source"], src=True)
        ]
        for link in candidates:
            # Resolve relative links against the current page, not the site root
            full_url = urljoin(url, link)
            if full_url.startswith(BASE_URL) and is_allowed(urlparse(full_url).path):
                urls_set.add(full_url)
    except Exception as e:
        print(f"Error fetching {url}: {e}")
    return urls_set


def crawl_repo(repo_name):
    """Crawl a single GitHub Pages repo starting from its base URL."""
    repo_url = urljoin(BASE_URL, repo_name + "/")
    to_visit = [repo_url]
    local_visited = set()
    while to_visit:
        current_url = to_visit.pop()
        if current_url in local_visited or current_url in visited:
            continue
        local_visited.add(current_url)
        visited.add(current_url)
        page_urls = fetch_page_urls(current_url)
        all_urls.update(page_urls)
        # Only crawl deeper into HTML pages or subfolders; media files are
        # recorded in the sitemap but not fetched
        for link in page_urls:
            if link in local_visited or link in visited:
                continue
            path = urlparse(link).path
            if path.endswith(("/", ".html", ".htm")) or "." not in path.split("/")[-1]:
                to_visit.append(link)


def create_sitemap(urls):
    """Generate a sitemap.xml from collected URLs."""
    urlset = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
    for url in sorted(urls):
        url_elem = ET.SubElement(urlset, "url")
        loc_elem = ET.SubElement(url_elem, "loc")
        loc_elem.text = url
    tree = ET.ElementTree(urlset)
    ET.indent(tree, space="\t", level=0)  # ET.indent() requires Python 3.9+
    with open("sitemap.xml", "wb") as f:
        tree.write(f, encoding="utf-8", xml_declaration=True)
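

# The sitemap protocol also allows optional per-URL children such as
# <lastmod>, <changefreq>, and <priority>. A <lastmod> element could be
# added alongside <loc> in the loop above, e.g. (hypothetical date):
#     lastmod_elem = ET.SubElement(url_elem, "lastmod")
#     lastmod_elem.text = "2024-01-01"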


if __name__ == "__main__":
    print("Fetching repositories...")
    repos = fetch_repos()
    if not repos:
        print("No repositories found or error fetching repos.")
        raise SystemExit(1)
    print(f"Found repos: {repos}")

    # Crawl each repo in parallel
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(crawl_repo, repo) for repo in repos]
        for future in as_completed(futures):
            future.result()  # re-raise any exception from a worker thread

    print(f"Total URLs collected: {len(all_urls)}")

    # Create sitemap
    create_sitemap(all_urls)
    print("Sitemap generated: sitemap.xml")