forked from Robbie1977/VFB2-draft
-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathcheck_images.py
More file actions
94 lines (77 loc) · 3.19 KB
/
check_images.py
File metadata and controls
94 lines (77 loc) · 3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python3
"""Check external image URLs in content (optionally themes) and report failures."""
from __future__ import annotations
import argparse
import pathlib
import re
import sys
import ssl
import socket
import urllib.request
import urllib.error
# Matches absolute http(s) URLs that end in a known image extension.  The URL
# body stops at whitespace, quotes or ')' so Markdown/HTML/CSS wrappers are not
# swallowed.  The trailing (?!\w) requires the extension to end at a word
# boundary: without it "https://site.pngs.com/page" would be cut down to the
# bogus URL "https://site.png" and "a.svgz" to "a.svg".  Anything after the
# extension (e.g. a query string) is not part of the match.
IMAGE_PATTERN = re.compile(
    r'https?://[^\s"\')]+\.(?:png|jpg|jpeg|gif|svg)(?!\w)',
    re.IGNORECASE,
)
# File suffixes (lower-case, leading dot) scanned when the caller gives no filter.
DEFAULT_EXTENSIONS = ['.md', '.html', '.yml', '.yaml', '.toml', '.json', '.js', '.ts', '.scss', '.css']
def collect_image_urls(base_dir: pathlib.Path, ext_filter=None) -> list[tuple[str, str]]:
    """Return every image URL referenced by text files under *base_dir*.

    Args:
        base_dir: Directory that is walked recursively.
        ext_filter: Iterable of file suffixes (with leading dot) to scan,
            compared case-insensitively. ``DEFAULT_EXTENSIONS`` is used when
            it is omitted or empty.

    Returns:
        ``(file path, url)`` pairs, one per ``IMAGE_PATTERN`` match, ordered by
        file path and then by position inside the file. A URL that occurs
        several times is returned several times.
    """
    # Suffixes read from disk are lower-cased below, so the filter has to be
    # lower-cased too; a set also makes the per-file membership test O(1).
    wanted = frozenset(ext.lower() for ext in (ext_filter or DEFAULT_EXTENSIONS))
    matches = []
    # rglob order depends on the filesystem; sort so reports are reproducible.
    for path in sorted(base_dir.rglob('*')):
        if not path.is_file() or path.suffix.lower() not in wanted:
            continue
        try:
            # Undecodable bytes are dropped rather than treated as an error.
            text = path.read_text(encoding='utf-8', errors='ignore')
        except OSError as exc:
            # One unreadable file must not abort the whole scan.
            print(f'warning: cannot read {path}: {exc}', file=sys.stderr)
            continue
        matches.extend((str(path), m.group(0)) for m in IMAGE_PATTERN.finditer(text))
    return matches
def check_url(url: str, timeout: float = 8.0):
    """Probe *url* and report whether it is reachable.

    A ``HEAD`` request is sent first. If it fails with an HTTP error, the URL
    is requested once more with ``GET`` (presumably because some servers
    reject ``HEAD``).

    Args:
        url: Absolute URL to check.
        timeout: Timeout in seconds applied to each request.

    Returns:
        A ``(status, error)`` tuple:

        * ``(status, None)`` when HEAD or the GET retry succeeded, or when HEAD
          failed with a 3xx status that carries a ``Location`` header;
        * ``(head_status, message)`` when HEAD failed with an HTTP error and
          the GET retry failed too;
        * ``(None, message)`` for any other failure of the HEAD request
          (malformed URL, DNS, TLS, timeout, ...).
    """
    # The timeout is handed to urlopen() directly. The process-wide socket
    # default is deliberately left alone so that calling this function does not
    # change the behaviour of unrelated sockets in the same process.
    ctx = ssl.create_default_context()
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; image-link-checker/1.0)'}

    def _fetch_status(method):
        # Issue one request and return the response status; the body is not read.
        req = urllib.request.Request(url, method=method, headers=headers)
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as response:
            return response.status

    try:
        return _fetch_status('HEAD'), None
    except urllib.error.HTTPError as he:
        # A redirect that urllib surfaced as an error still proves the URL is live.
        if 300 <= he.code < 400 and he.headers.get('Location'):
            return he.code, None
        try:
            return _fetch_status('GET'), None
        except Exception as inner_e:
            # Broad on purpose: the checker must report a failure, never crash.
            # The HEAD status is kept; the message explains why GET failed.
            return he.code, str(inner_e)
    except Exception as e:
        # Broad on purpose, see above. No HTTP status is available here.
        return None, str(e)
def main() -> int:
    """Scan the site sources for external image URLs and verify each one.

    Command-line options are read from ``sys.argv``: ``--base`` (directory to
    scan, default ``content``), ``--include-themes`` (also scan ``themes``) and
    ``--timeout`` (seconds per HTTP request).

    Returns:
        Exit status: ``0`` when no URLs were found or every URL answered with a
        status below 400, ``1`` when at least one URL failed.
    """
    parser = argparse.ArgumentParser(description='Check external image URLs in a static site repo')
    parser.add_argument('--base', default='content', help='Base directory to scan (default: content)')
    parser.add_argument('--include-themes', action='store_true', help='Also include themes directory')
    parser.add_argument('--timeout', type=float, default=8.0, help='HTTP timeout in seconds')
    args = parser.parse_args()

    dirs = [pathlib.Path(args.base)]
    if args.include_themes:
        dirs.append(pathlib.Path('themes'))

    references = []
    for d in dirs:
        if not d.exists():
            # Keep going, but make a mistyped --base visible instead of
            # silently reporting that nothing was found.
            print(f'warning: directory not found, skipped: {d}', file=sys.stderr)
            continue
        references.extend(collect_image_urls(d))
    if not references:
        print('No external image URLs found')
        return 0

    # Group the referencing files by URL so that each URL is requested exactly
    # once, no matter how many files mention it. Insertion order is preserved.
    paths_by_url = {}
    for path, url in references:
        paths = paths_by_url.setdefault(url, [])
        if path not in paths:
            paths.append(path)
    total = len(paths_by_url)
    print(f'Found {len(references)} references, {total} unique URLs')

    failures = []
    for i, (url, paths) in enumerate(paths_by_url.items(), 1):
        status, err = check_url(url, timeout=args.timeout)
        # status is None when no HTTP response was obtained at all.
        if status is None or status >= 400:
            failures.append((url, paths, status, err))
        if i % 20 == 0:
            print(f'Checked {i}/{total}')

    if failures:
        print(f'FAILED {len(failures)} URLs')
        for url, paths, status, err in failures:
            # One line per referencing file, so every file to fix is listed.
            for path in paths:
                print(f'{path} {status or "ERR"} {err or ""} {url}')
        return 1
    print('All URLs OK')
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())