|
| 1 | +#!/usr/bin/env python3 |
| 2 | +from __future__ import annotations |
| 3 | + |
| 4 | +import re |
| 5 | +import sys |
| 6 | +from pathlib import Path |
| 7 | +from urllib.parse import unquote |
| 8 | + |
# Repository root: the directory one level above the one containing this script.
ROOT_DIR = Path(__file__).resolve().parents[1]
# Opening code fence: up to three leading whitespace characters followed by a
# run of three or more backticks or tildes (the run is captured as group 1).
FENCE_RE = re.compile(r"^\s{0,3}(`{3,}|~{3,})")
# Reference definition line of the form "[label]: destination ...". Group 1 is
# the label; group 2 is everything after the colon with outer whitespace trimmed.
REFERENCE_RE = re.compile(r"^\s{0,3}\[([^\]]+)\]:\s*(.+?)\s*$")
# Leading URI scheme such as "https:" or "mailto:".
URI_SCHEME_RE = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.-]*:")
| 13 | + |
| 14 | + |
def normalize_reference_label(label: str) -> str:
    """Return the canonical form of a reference label used for matching.

    Two labels refer to the same definition when they are equal after case
    folding, trimming, and collapsing every internal run of whitespace to a
    single space. ``str.casefold`` is used instead of ``str.lower`` so that
    labels differing only by Unicode case mapping (for example "Straße" and
    "STRASSE") compare equal, as CommonMark label matching requires.

    Args:
        label: Raw text found between the square brackets.

    Returns:
        The normalized label; an empty string for blank input.
    """
    # split() without arguments already discards leading/trailing whitespace
    # and treats any whitespace run (including newlines) as one separator.
    return " ".join(label.casefold().split())
| 17 | + |
| 18 | + |
def strip_fenced_code_blocks(text: str) -> str:
    """Return *text* with fenced code blocks removed.

    Both the fence lines and every line between them are dropped; all other
    lines are kept verbatim (including their line endings). A fence that is
    never closed removes everything up to the end of the text.

    Fence recognition:

    * An opening fence is a line matched by ``FENCE_RE``. For backtick
      fences the remainder of the line (the info string) must not contain a
      backtick; otherwise the line is ordinary text that merely starts with
      an inline code span such as ```` ```x``` ````.
    * A closing fence consists solely of the opening fence character,
      repeated at least as many times as in the opening fence, optionally
      surrounded by whitespace. A line such as ```` ```python ```` inside a
      block is therefore content, not a terminator.

    Args:
        text: Markdown source.

    Returns:
        The source with fenced code blocks stripped out.
    """
    kept: list[str] = []
    fence_char = ""  # Empty string means "currently outside any fence".
    fence_len = 0

    for line in text.splitlines(keepends=True):
        if not fence_char:
            match = FENCE_RE.match(line)
            if match:
                marker = match.group(1)
                info = line[match.end():]
                # The regex run is greedy, so any backtick left in `info`
                # means this is inline code, not a fence opener.
                if marker[0] != "`" or "`" not in info:
                    fence_char, fence_len = marker[0], len(marker)
                    continue
            kept.append(line)
            continue

        # Inside a fence: look for the terminator. Leading indentation is
        # deliberately not limited so fences nested in lists still close.
        candidate = line.strip()
        if len(candidate) >= fence_len and candidate == fence_char * len(candidate):
            fence_char, fence_len = "", 0

    return "".join(kept)
| 44 | + |
| 45 | + |
def parse_reference_destination(raw: str) -> str:
    """Extract the destination from the text that follows ``[label]:``.

    An angle-bracketed destination (``<...>``) is returned without the
    brackets and may contain spaces. Otherwise the destination is the first
    whitespace-delimited token, which discards any trailing title. If the
    text starts with ``<`` but has no ``>``, the first token is returned
    as-is.

    Args:
        raw: Remainder of a reference definition line after the colon.

    Returns:
        The destination, or an empty string when there is none.
    """
    candidate = raw.strip()
    if candidate.startswith("<"):
        closing = candidate.find(">")
        if closing >= 0:
            return candidate[1:closing].strip()
    tokens = candidate.split()
    return tokens[0] if tokens else ""
| 55 | + |
| 56 | + |
def parse_reference_definitions(text: str) -> dict[str, str]:
    """Collect link reference definitions (``[label]: destination``) from *text*.

    Each line is tested against ``REFERENCE_RE``. Labels are normalized with
    ``normalize_reference_label`` and destinations extracted with
    ``parse_reference_destination``; definitions whose destination is empty
    are ignored. When the same label is defined more than once, the first
    definition is kept, matching CommonMark's precedence rule.

    Args:
        text: Markdown source, normally with fenced code already removed.

    Returns:
        Mapping of normalized label to destination.
    """
    definitions: dict[str, str] = {}
    for line in text.splitlines():
        match = REFERENCE_RE.match(line)
        if not match:
            continue
        destination = parse_reference_destination(match.group(2))
        if not destination:
            continue
        label = normalize_reference_label(match.group(1))
        # setdefault keeps the earliest definition instead of overwriting it.
        definitions.setdefault(label, destination)
    return definitions
| 68 | + |
| 69 | + |
def read_bracket_content(text: str, start: int) -> tuple[int | None, str]:
    """Read a balanced ``[...]`` group that begins at ``text[start]``.

    Nested square brackets are tracked, and a backslash causes the character
    after it to be skipped so escaped brackets do not affect nesting.

    Args:
        text: String being scanned.
        start: Index expected to hold the opening ``[``.

    Returns:
        ``(index_of_closing_bracket, inner_text)`` on success, or
        ``(None, "")`` when ``text[start]`` is not ``[`` or the group is
        never closed.
    """
    length = len(text)
    if not (start < length and text[start] == "["):
        return None, ""

    nesting = 1
    pos = start + 1
    while pos < length:
        current = text[pos]
        if current == "\\":
            # Skip the backslash together with the character it escapes.
            pos += 2
            continue
        if current == "[":
            nesting += 1
        elif current == "]":
            nesting -= 1
            if not nesting:
                return pos, text[start + 1 : pos]
        pos += 1
    return None, ""
| 88 | + |
| 89 | + |
def parse_optional_title_and_close(text: str, cursor: int) -> int | None:
    """Consume an optional link title and the ``)`` that ends an inline link.

    Starting at *cursor*, whitespace is skipped. What follows must be either
    the closing ``)`` or a title delimited by double quotes, single quotes,
    or parentheses (nested parentheses are balanced). Inside a title a
    backslash skips the next character. After a title, optional whitespace
    and then ``)`` are required.

    Args:
        text: String being scanned.
        cursor: Index just past the link destination.

    Returns:
        The index immediately after the closing ``)``, or ``None`` if the
        text at *cursor* does not form a valid link ending.
    """
    size = len(text)

    def skip_spaces(index: int) -> int:
        while index < size and text[index].isspace():
            index += 1
        return index

    pos = skip_spaces(cursor)
    if pos >= size:
        return None

    opener = text[pos]
    if opener == ")":
        return pos + 1

    if opener == '"' or opener == "'":
        # Quoted title: advance to just past the matching, unescaped quote.
        pos += 1
        while pos < size:
            current = text[pos]
            if current == "\\":
                pos += 2
            elif current == opener:
                pos += 1
                break
            else:
                pos += 1
    elif opener == "(":
        # Parenthesized title: balance nested parentheses.
        pos += 1
        open_parens = 1
        while pos < size and open_parens > 0:
            current = text[pos]
            if current == "\\":
                pos += 2
                continue
            if current == "(":
                open_parens += 1
            elif current == ")":
                open_parens -= 1
            pos += 1
    else:
        return None

    pos = skip_spaces(pos)
    return pos + 1 if pos < size and text[pos] == ")" else None
| 129 | + |
| 130 | + |
def parse_inline_destination(text: str, cursor: int) -> tuple[str | None, int | None]:
    """Parse the destination of an inline link, starting just after its ``(``.

    Leading whitespace is skipped. A destination written as ``<...>`` runs to
    the next ``>``. A bare destination runs until whitespace or an unbalanced
    ``)`` at parenthesis depth zero; a backslash skips the following
    character. The remainder (optional title and closing parenthesis) is
    handed to ``parse_optional_title_and_close``.

    Args:
        text: String being scanned.
        cursor: Index of the first character after the opening ``(``.

    Returns:
        ``(destination, index_after_closing_paren)``. Both are ``None`` when
        no destination can be read; the index alone is ``None`` when the
        link is not properly terminated.
    """
    size = len(text)
    pos = cursor
    while pos < size and text[pos].isspace():
        pos += 1
    if pos >= size:
        return None, None

    if text[pos] == "<":
        closing = text.find(">", pos + 1)
        if closing < 0:
            return None, None
        bracketed = text[pos + 1 : closing].strip()
        return bracketed, parse_optional_title_and_close(text, closing + 1)

    begin = pos
    nesting = 0
    while pos < size:
        current = text[pos]
        if current == "\\":
            pos += 2
            continue
        if current == ")":
            if nesting == 0:
                # This parenthesis closes the link itself.
                break
            nesting -= 1
        elif current == "(":
            nesting += 1
        elif nesting == 0 and current.isspace():
            # Whitespace outside parentheses ends the destination.
            break
        pos += 1

    bare = text[begin:pos].strip()
    return bare, parse_optional_title_and_close(text, pos)
| 169 | + |
| 170 | + |
def _skip_code_span(body: str, start: int) -> int:
    """Return the index just past the inline code span opening at *start*.

    ``body[start]`` must be a backtick. A span is closed by a backtick run of
    exactly the same length as the opening run. If no such run exists, the
    opening backticks are literal text and the index just past them is
    returned so scanning can continue.
    """
    size = len(body)
    run_end = start
    while run_end < size and body[run_end] == "`":
        run_end += 1
    width = run_end - start

    search = run_end
    while True:
        close = body.find("`" * width, search)
        if close == -1:
            return run_end
        # `search` never points into the middle of a backtick run, so `close`
        # is the start of a run; measure the whole run to compare lengths.
        close_end = close + width
        while close_end < size and body[close_end] == "`":
            close_end += 1
        if close_end - close == width:
            return close_end
        search = close_end


def _scan_links(body: str) -> list[tuple[str, str]]:
    """Return the ``(kind, target)`` links found in *body*, in scan order."""
    links: list[tuple[str, str]] = []
    size = len(body)
    cursor = 0
    while cursor < size:
        char = body[cursor]

        if char == "\\":
            # Backslash escape: the next character is literal, so an escaped
            # "[" or "`" must not start a link or code span.
            cursor += 2
            continue

        if char == "`":
            cursor = _skip_code_span(body, cursor)
            continue

        if char != "[":
            # Covers the "!" of an image too; its "[" is handled next turn.
            cursor += 1
            continue

        label_end, label = read_bracket_content(body, cursor)
        if label_end is None:
            cursor += 1
            continue

        # The link tail must follow the label immediately. Allowing whitespace
        # here would turn "- [ ] [docs](x.md)" into a bogus reference link.
        after_label = label_end + 1
        following = body[after_label] if after_label < size else ""

        if following == "(":
            destination, close_index = parse_inline_destination(body, after_label + 1)
            if destination is not None and close_index is not None:
                links.append(("inline", destination))
                # Link text can itself hold links, e.g. "[![img](a.png)](b)".
                links.extend(_scan_links(label))
                cursor = close_index
                continue
        elif following == "[":
            ref_end, ref = read_bracket_content(body, after_label)
            if ref_end is not None:
                # Collapsed form "[label][]" refers to the label itself.
                links.append(("reference", ref if ref.strip() else label))
                links.extend(_scan_links(label))
                cursor = ref_end + 1
                continue

        # Not a link: step inside the brackets so nested links are still seen.
        cursor += 1

    return links


def extract_links(text: str) -> tuple[list[tuple[str, str]], dict[str, str]]:
    """Find the links and reference definitions in a markdown document.

    Fenced code blocks are removed first and inline code spans are skipped.
    Inline links and images (``[text](dest)``, ``![alt](dest)``) and
    full/collapsed reference links (``[text][label]``, ``[text][]``) are
    reported; bare shortcut references (``[label]``) are not. Links nested
    inside a link's text or inside plain brackets are reported as well. An
    unmatched backtick is treated as literal text and does not stop the scan.

    Args:
        text: Markdown source.

    Returns:
        A pair ``(links, definitions)``. ``links`` holds ``("inline",
        destination)`` and ``("reference", label)`` tuples; ``definitions``
        maps normalized reference labels to destinations.
    """
    body = strip_fenced_code_blocks(text)
    definitions = parse_reference_definitions(body)
    return _scan_links(body), definitions
| 225 | + |
| 226 | + |
def should_skip_destination(destination: str) -> bool:
    """Tell whether *destination* is outside the scope of local file checks.

    A destination is skipped when it is empty, starts with ``#``, starts
    with ``//``, or begins with a URI scheme as matched by
    ``URI_SCHEME_RE``.

    Args:
        destination: Link target with surrounding whitespace removed.

    Returns:
        ``True`` if the destination should not be checked on disk.
    """
    if not destination or destination.startswith(("#", "//")):
        return True
    return URI_SCHEME_RE.match(destination) is not None
| 237 | + |
| 238 | + |
def resolve_candidate_path(file_path: Path, destination: str) -> Path:
    """Map a link destination to the filesystem path it should point at.

    Processing steps, in order: trim whitespace, unwrap a surrounding
    ``<...>``, drop the ``#fragment`` and then the ``?query``, percent-decode,
    and undo the backslash escapes ``\\(``, ``\\)`` and ``\\ ``. A destination
    starting with ``/`` is resolved against ``ROOT_DIR``; anything else is
    resolved against the directory of *file_path*.

    Args:
        file_path: Markdown file that contains the link.
        destination: Link target as written in the document.

    Returns:
        The candidate path (not checked for existence).
    """
    target = destination.strip()
    if target.startswith("<") and target.endswith(">"):
        target = target[1:-1].strip()

    without_fragment, _, _ = target.partition("#")
    path_part, _, _ = without_fragment.partition("?")
    target = unquote(path_part.strip())

    for escaped, plain in (("\\(", "("), ("\\)", ")"), ("\\ ", " ")):
        target = target.replace(escaped, plain)

    if target.startswith("/"):
        return ROOT_DIR / target.lstrip("/")
    return file_path.parent / target
| 249 | + |
| 250 | + |
def check_file(file_path: Path) -> list[str]:
    """Check every link in one markdown file and describe the broken ones.

    The file is read as UTF-8 with undecodable bytes replaced. Reference
    links are resolved through the file's own definitions; a reference
    without a definition is reported. Destinations for which
    ``should_skip_destination`` is true are ignored; the rest must exist on
    disk once mapped by ``resolve_candidate_path``.

    Args:
        file_path: Markdown file to check; must be located under ``ROOT_DIR``.

    Returns:
        One message per problem, each prefixed with the file's path relative
        to ``ROOT_DIR`` in POSIX form.
    """
    content = file_path.read_text(encoding="utf-8", errors="replace")
    found_links, reference_map = extract_links(content)
    display_name = file_path.relative_to(ROOT_DIR).as_posix()
    problems: list[str] = []

    for kind, value in found_links:
        if kind == "reference":
            resolved = reference_map.get(normalize_reference_label(value))
            if resolved is None:
                problems.append(f"{display_name}: unresolved reference [{value}]")
                continue
        else:
            resolved = value

        resolved = resolved.strip()
        if should_skip_destination(resolved):
            continue
        if not resolve_candidate_path(file_path, resolved).exists():
            problems.append(f"{display_name}: {resolved}")

    return problems
| 276 | + |
| 277 | + |
def main() -> int:
    """Check all ``*.md`` files under ``ROOT_DIR`` and print a report.

    Files are visited in sorted path order and their failures are gathered
    with ``check_file``.

    Returns:
        ``1`` if any broken link was reported, ``0`` otherwise.
    """
    failures: list[str] = []
    for markdown_path in sorted(ROOT_DIR.rglob("*.md")):
        failures += check_file(markdown_path)

    if not failures:
        print("No broken markdown links found.")
        return 0

    print("Broken markdown links found:")
    for entry in failures:
        print(f" - {entry}")
    return 1
| 293 | + |
| 294 | + |
# Run the check only when executed as a script; the value returned by main()
# (1 when broken links were reported, 0 otherwise) becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
0 commit comments