|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Find image files in static/ that are not referenced in the repository.""" |
| 3 | + |
| 4 | +from __future__ import annotations |
| 5 | + |
| 6 | +import argparse |
| 7 | +import os |
| 8 | +import re |
| 9 | +from pathlib import Path |
| 10 | +from typing import Iterable |
| 11 | + |
| 12 | + |
# File extensions (lower-case, with the leading dot) that identify an image
# file. Callers lower-case a path's suffix before testing membership.
IMAGE_SUFFIXES: set[str] = {
    f".{extension}"
    for extension in (
        "avif",
        "bmp",
        "gif",
        "ico",
        "jpeg",
        "jpg",
        "png",
        "svg",
        "tif",
        "tiff",
        "webp",
    )
}
| 26 | + |
# File extensions (lower-case, with the leading dot) of the text files that
# are scanned for image references.
TEXT_SUFFIXES: set[str] = {
    f".{extension}"
    for extension in (
        "css",
        "html",
        "js",
        "json",
        "jsx",
        "md",
        "mdx",
        "scss",
        "ts",
        "tsx",
        "yaml",
        "yml",
    )
}
| 41 | + |
# Directory names that are pruned while walking a reference source.
EXCLUDED_DIRS: set[str] = {
    # Dot-prefixed directories. The walker also skips every directory whose
    # name starts with ".", so these entries are listed for explicitness.
    ".codex",
    ".docusaurus",
    ".git",
    ".tickets",
    # Non-hidden directories that must be named explicitly to be skipped.
    "build",
    "node_modules",
    # Same name as the default --static-dir; presumably excluded so the asset
    # tree itself is not scanned for references (confirm with the author).
    "static",
}
| 51 | + |
# Paths, relative to the repository root, that are always scanned for image
# references. Each entry may name a directory (walked recursively) or a single
# file; entries that do not exist are skipped by the scanner.
DEFAULT_REFERENCE_SOURCES: tuple[str, ...] = (
    *("user-guide", "developer-guide", "experiments", "src"),
    *("docusaurus.config.js", "sidebars.js"),
)
| 60 | + |
# Shared pattern fragments, so the three expressions below cannot drift apart.
# A literal dot followed by one of the recognised image extensions.
_IMAGE_EXT_FRAGMENT = r"\.(?:avif|bmp|gif|ico|jpe?g|png|svg|tiff?|webp)"
# Shortest run of characters allowed inside an unquoted path: anything except
# whitespace, quotes/backticks and the closing delimiters ) ] >.
_BARE_PATH_FRAGMENT = r"""[^\s"'`)\]>]+?"""

# Any quoted (single, double or backtick) string on one line that ends in an
# image extension; the text between the quotes is captured as "path".
IMAGE_TOKEN_RE = re.compile(
    r"""(?P<quote>['"`])(?P<path>[^'"`\n]+?""" + _IMAGE_EXT_FRAGMENT + r""")(?P=quote)""",
    re.IGNORECASE,
)
# Web-style reference starting with /img/, quoted or not.
IMG_WEB_PATH_RE = re.compile(
    r"/img/" + _BARE_PATH_FRAGMENT + _IMAGE_EXT_FRAGMENT,
    re.IGNORECASE,
)
# Reference through the static directory: static/img/..., /static/img/... or
# @site/static/img/...
STATIC_PATH_RE = re.compile(
    r"(?:@site/)?/?static/img/" + _BARE_PATH_FRAGMENT + _IMAGE_EXT_FRAGMENT,
    re.IGNORECASE,
)
| 73 | + |
| 74 | + |
def strip_url_suffix(path: str) -> str:
    """Return *path* without its query string and fragment.

    Everything from the first ``?`` onward is dropped, then everything from
    the first ``#`` of what remains.
    """
    before_query, _, _ = path.partition("?")
    before_fragment, _, _ = before_query.partition("#")
    return before_fragment
| 77 | + |
| 78 | + |
def normalize_reference_to_static_path(reference: str) -> Path | None:
    """Map an image reference to a path relative to the static directory.

    The reference is trimmed and stripped of any query string or fragment.
    Prefixes are matched case-insensitively; the rest of the path keeps its
    original case.

    Returns:
        * ``img/<rest>`` for references starting with ``/img/`` or ``img/``;
        * ``<rest>`` for references starting with ``@site/static/``,
          ``/static/`` or ``static/``;
        * ``img/<reference>`` for any other relative reference that contains a
          ``/`` and ends in an image suffix;
        * ``None`` for external URLs, ``data:``/``mailto:`` URIs, fragments,
          other absolute paths, and anything else.
    """
    ref = strip_url_suffix(reference.strip())
    ref_lower = ref.lower()

    # (prefix, directory the remainder is re-rooted under). ``None`` means the
    # remainder is already relative to the static directory. Order matters:
    # the first matching prefix wins.
    prefix_rules: tuple[tuple[str, Path | None], ...] = (
        ("/img/", Path("img")),
        ("img/", Path("img")),
        ("@site/static/", None),
        ("/static/", None),
        ("static/", None),
    )
    for prefix, base in prefix_rules:
        if ref_lower.startswith(prefix):
            remainder = ref[len(prefix) :]
            return Path(remainder) if base is None else base / remainder

    # Anything external or absolute that was not handled above cannot point
    # into the static directory.
    non_static_prefixes = ("http://", "https://", "data:", "mailto:", "#", "/")
    if ref_lower.startswith(non_static_prefixes):
        return None

    # Dynamic React requires like /static/img/${imageUrl} often pass "misc/foo.svg".
    looks_like_nested_image = "/" in ref and Path(ref).suffix.lower() in IMAGE_SUFFIXES
    return Path("img") / ref if looks_like_nested_image else None
| 101 | + |
| 102 | + |
def extract_reference_tokens(text: str) -> set[str]:
    """Collect every candidate image reference found in *text*.

    Three patterns are applied: ``/img/...`` web paths, ``static/img/...``
    paths, and quoted strings ending in an image extension. Each hit has its
    query string and fragment removed before being added to the result.
    """
    raw_hits = [hit.group(0) for hit in IMG_WEB_PATH_RE.finditer(text)]
    raw_hits += [hit.group(0) for hit in STATIC_PATH_RE.finditer(text)]
    # For quoted strings only the text between the quotes is of interest.
    raw_hits += [hit.group("path") for hit in IMAGE_TOKEN_RE.finditer(text)]
    return {strip_url_suffix(raw) for raw in raw_hits}
| 109 | + |
| 110 | + |
def iter_repo_text_files(repo_root: Path, sources: Iterable[str]) -> Iterable[Path]:
    """Yield the text files to scan for image references.

    Each entry of *sources* is joined onto *repo_root* and resolved. A file
    is yielded if its suffix is in ``TEXT_SUFFIXES``. A directory is walked
    recursively, skipping hidden files, hidden directories and directories
    named in ``EXCLUDED_DIRS``. Entries that do not exist are ignored.
    """
    for entry in sources:
        location = (repo_root / entry).resolve()

        if location.is_file():
            if location.suffix.lower() in TEXT_SUFFIXES:
                yield location
        elif location.exists():
            for current_dir, subdirs, filenames in os.walk(location):
                # Prune in place so os.walk does not descend into these.
                subdirs[:] = [
                    subdir
                    for subdir in subdirs
                    if not (subdir in EXCLUDED_DIRS or subdir.startswith("."))
                ]
                base = Path(current_dir)
                visible = (base / name for name in filenames if not name.startswith("."))
                yield from (
                    candidate for candidate in visible if candidate.suffix.lower() in TEXT_SUFFIXES
                )
| 132 | + |
| 133 | + |
def iter_static_images(static_dir: Path) -> Iterable[Path]:
    """Yield every non-hidden image file found recursively under *static_dir*.

    A path qualifies when it is a regular file, its name does not start with
    ``.``, and its lower-cased suffix is in ``IMAGE_SUFFIXES``.
    """

    def is_visible_image(candidate: Path) -> bool:
        return (
            candidate.is_file()
            and not candidate.name.startswith(".")
            and candidate.suffix.lower() in IMAGE_SUFFIXES
        )

    yield from filter(is_visible_image, static_dir.rglob("*"))
| 143 | + |
| 144 | + |
def find_used_static_images(
    repo_root: Path, static_dir: Path, sources: Iterable[str]
) -> tuple[set[Path], set[str]]:
    """Scan the reference sources and resolve image references against *static_dir*.

    Args:
        repo_root: Repository root that the entries of *sources* are relative to.
        static_dir: Directory containing the static assets.
        sources: Files or directories to scan for references.

    Returns:
        A ``(used, missing_refs)`` pair. ``used`` holds the resolved paths of
        referenced files that exist. ``missing_refs`` holds the reference
        tokens that map to a path under ``img/`` but do not exist on disk.

    Source files that cannot be read (for example a dangling symlink or a
    permission error) are skipped with a warning on stderr rather than
    aborting the whole scan.
    """
    import sys  # Local import: only needed for the unreadable-file warning.

    used: set[Path] = set()
    missing_refs: set[str] = set()

    for source_file in iter_repo_text_files(repo_root, sources):
        try:
            # Undecodable bytes are dropped: the scan is best-effort.
            text = source_file.read_text(encoding="utf-8", errors="ignore")
        except OSError as exc:
            print(f"warning: skipping unreadable file {source_file}: {exc}", file=sys.stderr)
            continue

        for token in extract_reference_tokens(text):
            static_relative = normalize_reference_to_static_path(token)
            if static_relative is None:
                continue

            # Resolve so the result compares equal to resolved image paths.
            target = (static_dir / static_relative).resolve()
            if target.exists():
                used.add(target)
            elif static_relative.parts and static_relative.parts[0] == "img":
                # Only references that clearly point into img/ are reported as
                # missing; other non-matching tokens are most likely noise.
                missing_refs.add(token)

    return used, missing_refs
| 166 | + |
| 167 | + |
def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: report images under static/ that are never referenced.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Raises:
        SystemExit: With an error message when the static directory does not
            exist or is not a directory, and with code 1 when
            ``--fail-on-unused`` is given and unused images were found.
    """
    parser = argparse.ArgumentParser(description="Find images under static/ that are not referenced.")
    parser.add_argument("--repo-root", default=os.getcwd(), help="Repository root (default: cwd)")
    parser.add_argument(
        "--static-dir",
        default="static",
        help="Path to static directory, relative to --repo-root if not absolute (default: static)",
    )
    parser.add_argument(
        "--show-missing-refs",
        action="store_true",
        help="Also print image references that point to missing files in static/",
    )
    parser.add_argument(
        "--fail-on-unused",
        action="store_true",
        help="Exit with code 1 when unused images are found.",
    )
    parser.add_argument(
        "--source",
        action="append",
        default=[],
        help=(
            "Additional source path to scan for references. "
            "Can be provided multiple times and is relative to --repo-root when not absolute."
        ),
    )
    args = parser.parse_args(argv)

    repo_root = Path(args.repo_root).resolve()
    static_dir = Path(args.static_dir)
    if not static_dir.is_absolute():
        static_dir = repo_root / static_dir
    static_dir = static_dir.resolve()

    # A regular file would otherwise be accepted and silently yield no images.
    if not static_dir.is_dir():
        raise SystemExit(f"static directory not found: {static_dir}")

    all_images = {path.resolve() for path in iter_static_images(static_dir)}
    sources = [*DEFAULT_REFERENCE_SOURCES, *args.source]
    used_images, missing_refs = find_used_static_images(repo_root, static_dir, sources)
    unused_images = sorted(all_images - used_images)

    print(f"Scanned {len(all_images)} image files under {static_dir}")
    print(f"Referenced images: {len(used_images)}")
    print(f"Unused images: {len(unused_images)}")
    print("")

    for path in unused_images:
        # The static directory may lie outside the repository root (absolute
        # --static-dir, or a symlink resolving elsewhere); show the absolute
        # path in that case instead of failing with ValueError.
        try:
            shown = path.relative_to(repo_root)
        except ValueError:
            shown = path
        print(shown.as_posix())

    if args.show_missing_refs and missing_refs:
        print("")
        print(f"Missing image references ({len(missing_refs)}):")
        for ref in sorted(missing_refs):
            print(ref)

    if args.fail_on_unused and unused_images:
        raise SystemExit(1)


if __name__ == "__main__":
    main()
0 commit comments