-
-
Notifications
You must be signed in to change notification settings - Fork 504
Expand file tree
/
Copy pathlinkchecker.py
More file actions
146 lines (121 loc) · 4.33 KB
/
linkchecker.py
File metadata and controls
146 lines (121 loc) · 4.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv
from datetime import datetime
def get_all_links(url):
    """
    Fetch *url* and extract all unique absolute HTTP/HTTPS links.

    Relative hrefs are resolved against *url* with urljoin. Returns a set
    of link strings; on any fetch/parse failure an empty set is returned
    and the error is reported to stdout instead of raised.
    """
    print(f"Fetching page: {url}")
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, timeout=10, headers=headers)
        # Fail fast on 4xx/5xx so we don't scrape links off an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        links = set()
        # Find all 'a' tags with an 'href' attribute
        for tag in soup.find_all("a", href=True):
            full_url = urljoin(url, tag["href"])
            # Keep only real web links (drops mailto:, javascript:, tel:, ...)
            if full_url.startswith(("http://", "https://")):
                links.add(full_url)
        print(f"Found {len(links)} links.")
        return links
    except Exception as e:
        # Broad catch is deliberate: this is a best-effort CLI boundary.
        print(f"Error fetching page: {e}")
        return set()
def check_link(url):
    """
    Probe a single URL and return its HTTP status code.

    A HEAD request is tried first (cheap: headers only, no body); if the
    server rejects it with 405 or 403, a full GET is issued instead.
    Network failures are returned as short strings rather than raised.
    """
    request_headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # Cheap probe first: HEAD avoids downloading the page body.
        result = requests.head(url, timeout=10, allow_redirects=True, headers=request_headers)
        if result.status_code not in (405, 403):
            return result.status_code
        # Some servers disallow or block HEAD; retry with a normal GET.
        result = requests.get(url, timeout=10, headers=request_headers)
        return result.status_code
    except requests.exceptions.ConnectionError:
        # Server unreachable / DNS failure / connection refused.
        return "Connection Error"
    except requests.exceptions.Timeout:
        return "Timeout"
    except Exception as e:
        return f"Error: {e}"
def get_status_label(status):
    """
    Convert an HTTP status code or error string into a user-friendly label.

    Integer codes are bucketed: 2xx -> OK, 3xx -> redirect, 404/403 get
    specific labels, 5xx -> server error. Anything else (other 4xx codes,
    error strings produced by check_link) falls through to str(status).
    """
    if isinstance(status, int):
        if status < 300:
            # Bug fix: the original returned " ✔ OK" with a leading space,
            # which broke the left-aligned {label:<15} column in the report.
            return "✔ OK"
        elif status < 400:
            return "⚠️Redirect"
        elif status == 404:
            return "❌Not Found"
        elif status == 403:
            return "🔒Forbidden"
        elif status >= 500:
            return "Server Error"
    return str(status)
def export_to_csv(results, filename):
    """
    Export checked links and their statuses to a CSV file.

    Args:
        results: iterable of (url, status) pairs, where status is an int
            HTTP code or an error string from check_link.
        filename: path of the CSV file to create (overwritten if present).
    """
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["URL", "Status", "Result"])
        for url, status in results:
            # Numeric codes below 400 count as working; 4xx/5xx and
            # error strings count as broken.
            label = "Working" if isinstance(status, int) and status < 400 else "Broken"
            writer.writerow([url, status, label])
    # Bug fix: the original printed a literal "(unknown)" placeholder
    # instead of the actual output path.
    print(f"Results saved to: {filename}")
def check_all_links(website_url):
    """
    Orchestrate the full link-checking run for *website_url*.

    Scrapes the page for links, probes each one, prints a per-link status
    report and a summary, then offers to save the results to a timestamped
    CSV file (interactive prompt on stdin).
    """
    links = get_all_links(website_url)
    if not links:
        print("No links found.")
        return
    broken = []
    working = []
    all_results = []
    print(f"{'STATUS':<15} URL")
    print("-" * 60)
    # Iterate through all found links and check their status.
    # (Fix: the original used enumerate() but never used the index.)
    for link in links:
        status = check_link(link)
        label = get_status_label(status)
        all_results.append((link, status))
        print(f"{label:<15} {link}")
        # Categorize links as working or broken
        if isinstance(status, int) and status < 400:
            working.append((link, status))
        else:
            broken.append((link, status))
    # Summary
    print("\nSUMMARY")
    print("-" * 20)
    print(f"Total Links: {len(links)}")
    print(f"Working: {len(working)}")
    print(f"Broken: {len(broken)}")
    # Broken links detail
    if broken:
        print("\nBROKEN LINKS:")
        for url, status in broken:
            print(f"[{status}] {url}")
    # Ask to export
    save = input("\nSave results to CSV? (y/n): ").strip().lower()
    if save == "y":
        filename = f"link_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        export_to_csv(all_results, filename)
# --- Script Entry Point ---
if __name__ == "__main__":
    print("--- Link Checker Tool ---")
    website = input("Enter website URL: ").strip()
    # Ensure the URL has a real scheme. Bug fix: the original checked
    # startswith("http"), which let bare hosts like "httpbin.org" through
    # without a scheme and caused requests to fail downstream.
    if not website.startswith(("http://", "https://")):
        website = "https://" + website
    check_all_links(website)