Skip to content

Commit ac68463

Browse files
authored
Merge pull request #23138 from cockroachdb/edueng-613-validate-diagram-anchors
Add diagram anchor integrity check (EDUENG-613)
2 parents 4f0109a + 929b0c7 commit ac68463

2 files changed

Lines changed: 490 additions & 0 deletions

File tree

Lines changed: 327 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,327 @@
1+
#!/usr/bin/env python3
2+
"""
3+
validate_diagram_anchors.py (EDUENG-613)
4+
5+
For doc files that contain remote_include tags pulling from
6+
cockroachdb/generated-diagrams grammar_svg, fetches each referenced diagram
7+
HTML and verifies that every sql-grammar.html#ANCHOR link inside it resolves
8+
against stmt_block.html on the same branch.
9+
10+
This is the exact failure that blocked production builds on 2026-01-29:
11+
show_statement_hints.html referenced sql-grammar.html#opt_with_show_hints_options
12+
but that anchor did not yet exist in stmt_block.html on release-26.1.
13+
14+
Usage:
15+
# Check specific files (e.g. changed files in a PR):
16+
python .github/scripts/validate_diagram_anchors.py file1.md file2.md ...
17+
18+
# Full scan:
19+
python .github/scripts/validate_diagram_anchors.py
20+
21+
Exit codes:
22+
0 all checks passed
23+
1 one or more broken anchors found
24+
2 fatal error (versions.csv not found)
25+
26+
Environment:
27+
GITHUB_TOKEN Optional. Raises GitHub API rate limit from 60 to 5000 req/hr.
28+
GITHUB_ACTIONS Set automatically in CI. Enables pr-comment.md output.
29+
"""
30+
31+
import base64
32+
import csv
33+
import json
34+
import os
35+
import re
36+
import sys
37+
import urllib.error
38+
import urllib.parse
39+
import urllib.request
40+
from html.parser import HTMLParser
41+
from pathlib import Path
42+
from typing import Optional
43+
44+
# Repository that hosts the generated SQL railroad diagrams.
GENERATED_DIAGRAMS_REPO = "cockroachdb/generated-diagrams"
GITHUB_API_BASE = "https://api.github.com"
# CSV mapping docs major versions to CRDB branch names (read by load_versions_csv).
VERSIONS_CSV = Path("src/current/_data/versions.csv")
# Root scanned for markdown files when no explicit file arguments are given.
DOCS_ROOT = Path("src/current")

# Matches Jekyll tags such as:
# {% remote_include https://raw.githubusercontent.com/cockroachdb/generated-diagrams/
# {{ page.release_info.crdb_branch_name }}/grammar_svg/show_statement_hints.html %}
# Group 1 captures the diagram filename (e.g. "show_statement_hints.html").
REMOTE_INCLUDE_RE = re.compile(
    r"\{%-?\s*remote_include\s+"
    r"https://raw\.githubusercontent\.com/cockroachdb/generated-diagrams/"
    r"\{\{[^}]*crdb_branch_name[^}]*\}\}/grammar_svg/"
    r"([\w.-]+\.html)"
    r"\s*-?%\}"
)

# Matches anchor links inside a fetched diagram, e.g.
# href="sql-grammar.html#opt_with_show_hints_options"; group 1 is the anchor.
ANCHOR_REF_RE = re.compile(r'href=["\']sql-grammar\.html#([^"\']+)["\']')
61+
62+
# ---------------------------------------------------------------------------
63+
# HTTP
64+
# ---------------------------------------------------------------------------
65+
66+
def _fetch_github_content(repo: str, path: str, ref: str) -> Optional[str]:
    """Fetch a file from GitHub using the Contents API.

    Uses the REST API endpoint so that GITHUB_TOKEN properly raises rate
    limits and authenticates against private repos. Falls back to the
    download_url for files larger than 1 MB (the API returns the field but
    omits the base64 payload in that case).

    Returns the decoded file text, or None when the file does not exist
    (404) or the fetch fails for a non-HTTP reason. Non-404 HTTP errors
    (e.g. rate limiting) propagate to the caller.
    """
    url = "{}/repos/{}/contents/{}?ref={}".format(
        GITHUB_API_BASE,
        repo,
        urllib.parse.quote(path, safe="/"),
        urllib.parse.quote(ref, safe=""),
    )

    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    token = os.environ.get("GITHUB_TOKEN")
    if token:
        headers["Authorization"] = f"Bearer {token}"
    request = urllib.request.Request(url, headers=headers)

    try:
        with urllib.request.urlopen(request, timeout=20) as response:
            payload = json.loads(response.read().decode())

        # Normal case: inline base64 payload.
        if payload.get("encoding") == "base64" and payload.get("content"):
            raw = base64.b64decode(payload["content"].encode())
            return raw.decode("utf-8", errors="replace")

        # Large file (>1 MB): fall back to the raw download_url.
        fallback_url = payload.get("download_url")
        if not fallback_url:
            return None
        with urllib.request.urlopen(fallback_url, timeout=20) as response:
            return response.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as exc:
        if exc.code == 404:
            return None
        raise
    except Exception as exc:
        print(f" Warning: fetch {repo}/{path}@{ref} failed: {exc}", file=sys.stderr)
        return None
112+
113+
114+
# ---------------------------------------------------------------------------
115+
# Cached lookups
116+
# ---------------------------------------------------------------------------
117+
118+
# Per-run cache of branch -> set of stmt_block.html anchor ids, or None when
# the fetch failed; ensures each branch is downloaded at most once.
_stmt_block_cache: dict[str, Optional[set]] = {}
119+
120+
121+
class _IDCollector(HTMLParser):
122+
"""Collects all id= attribute values from an HTML document."""
123+
124+
def __init__(self) -> None:
125+
super().__init__()
126+
self.ids: set[str] = set()
127+
128+
def handle_starttag(
129+
self, tag: str, attrs: list[tuple[str, Optional[str]]]
130+
) -> None:
131+
for name, value in attrs:
132+
if name == "id" and value:
133+
self.ids.add(value)
134+
135+
136+
def get_stmt_block_anchors(branch: str) -> Optional[set]:
    """Return all id= values in stmt_block.html for the given branch.

    Results (including failed fetches, stored as None) are memoized in
    _stmt_block_cache so each branch is fetched at most once per run.
    """
    try:
        return _stmt_block_cache[branch]
    except KeyError:
        pass

    content = _fetch_github_content(
        GENERATED_DIAGRAMS_REPO, "grammar_svg/stmt_block.html", branch
    )
    anchors: Optional[set]
    if content is None:
        anchors = None
    else:
        collector = _IDCollector()
        collector.feed(content)
        anchors = collector.ids
    _stmt_block_cache[branch] = anchors
    return anchors
149+
150+
151+
# ---------------------------------------------------------------------------
152+
# Parsing helpers
153+
# ---------------------------------------------------------------------------
154+
155+
def load_versions_csv() -> dict[str, str]:
    """Return {major_version: crdb_branch_name} for all valid rows.

    Rows with a blank major_version or a blank/"N/A" crdb_branch_name are
    skipped. Exits with status 2 if versions.csv is missing, since the
    version-to-branch mapping is required for every check.
    """
    if not VERSIONS_CSV.exists():
        print(f"Error: {VERSIONS_CSV} not found. Run from the repo root.", file=sys.stderr)
        sys.exit(2)
    result: dict[str, str] = {}
    with open(VERSIONS_CSV, newline="") as f:
        for row in csv.DictReader(f):
            # DictReader maps missing trailing fields to None (restval), so
            # row.get(key, "") can still return None on short rows; guard
            # with `or ""` to avoid AttributeError on .strip().
            v = (row.get("major_version") or "").strip()
            b = (row.get("crdb_branch_name") or "").strip()
            if v and b and b != "N/A":
                result[v] = b
    return result
168+
169+
170+
def version_from_path(path: Path) -> Optional[str]:
    """Return the docs version component of *path* (e.g. "v26.1"), or None."""
    version_re = re.compile(r"^v\d+\.\d+$")
    candidates = (part for part in path.parts if version_re.match(part))
    return next(candidates, None)
175+
176+
177+
def scan_files(files: list[Path]) -> dict[tuple[str, str], list[Path]]:
    """
    Scan markdown files for SQL diagram remote_include tags.
    Returns {(version, diagram_filename): [source_paths]}.
    """
    found: dict[tuple[str, str], list[Path]] = {}
    markdown_suffixes = (".md", ".markdown")
    for source in files:
        # Only existing markdown files inside a versioned docs directory
        # can contribute diagram includes.
        if source.suffix not in markdown_suffixes or not source.exists():
            continue
        version = version_from_path(source)
        if version is None:
            continue
        try:
            text = source.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue
        for diagram in REMOTE_INCLUDE_RE.findall(text):
            found.setdefault((version, diagram), []).append(source)
    return found
197+
198+
199+
# ---------------------------------------------------------------------------
200+
# Core logic
201+
# ---------------------------------------------------------------------------
202+
203+
def run_checks(
    diagram_includes: dict[tuple[str, str], list[Path]],
    version_to_branch: dict[str, str],
) -> list[dict]:
    """Check every referenced diagram's anchors against stmt_block.html.

    Returns one failure record per missing anchor, each containing the
    diagram, branch, anchor, source files, and a human-readable message.
    Diagrams or branches that cannot be fetched are reported and skipped.
    """
    failures: list[dict] = []

    # Group by branch to share stmt_block.html fetches.
    by_branch: dict[str, list[tuple[str, str, list[Path]]]] = {}
    for (version, diagram), sources in diagram_includes.items():
        branch = version_to_branch.get(version)
        if not branch:
            continue
        by_branch.setdefault(branch, []).append((version, diagram, sources))

    for branch in sorted(by_branch):
        print(f" Branch {branch}:")
        print(f" Fetching stmt_block.html ...", end=" ", flush=True)
        known_anchors = get_stmt_block_anchors(branch)
        if known_anchors is None:
            print("NOT FOUND — skipping this branch")
            continue
        print(f"{len(known_anchors)} anchors")

        for version, diagram, sources in sorted(by_branch[branch]):
            content = _fetch_github_content(
                GENERATED_DIAGRAMS_REPO, f"grammar_svg/{diagram}", branch
            )
            if content is None:
                print(f" {diagram}: NOT FOUND in generated-diagrams (skipping)")
                continue

            refs = ANCHOR_REF_RE.findall(content)
            missing = [a for a in refs if a not in known_anchors]
            if not missing:
                print(f" {diagram}: OK ({len(refs)} anchor ref(s))")
                continue

            print(f" {diagram}: {len(missing)} MISSING anchor(s)")
            failures.extend(
                {
                    "diagram": diagram,
                    "branch": branch,
                    "anchor": anchor,
                    "source_files": [str(f) for f in sources],
                    "message": (
                        f"Diagram {diagram!r} on {branch!r} links to "
                        f"sql-grammar.html#{anchor}, "
                        f"but that anchor is absent from stmt_block.html."
                    ),
                }
                for anchor in missing
            )

    return failures
254+
255+
256+
# ---------------------------------------------------------------------------
257+
# Output
258+
# ---------------------------------------------------------------------------
259+
260+
def format_comment(failures: list[dict]) -> str:
    """Render the check outcome as Markdown for a PR comment / step summary."""
    if not failures:
        return (
            "## Diagram Anchor Check: Passed\n\n"
            "All `sql-grammar.html#anchor` references in SQL diagram files "
            "resolve correctly against `stmt_block.html`."
        )

    out: list[str] = []
    out.append("## Diagram Anchor Check: Failed")
    out.append("")
    out.append(
        f"Found **{len(failures)}** broken anchor(s) that will cause docs build failures."
    )
    out.append("")
    out.append(
        "> **Context**: [EDUENG-613](https://cockroachlabs.atlassian.net/browse/EDUENG-613) — "
        "same failure mode as 2026-01-29 (`opt_with_show_hints_options` missing from `stmt_block.html`)."
    )
    out.append("")
    for failure in failures:
        out.append(
            f"- **`{failure['diagram']}`** on `{failure['branch']}` "
            f"→ missing anchor `#{failure['anchor']}`"
        )
        out.extend(
            f" - referenced by `{src}`" for src in failure["source_files"]
        )

    return "\n".join(out)
286+
287+
288+
# ---------------------------------------------------------------------------
289+
# Entry point
290+
# ---------------------------------------------------------------------------
291+
292+
def main() -> None:
    """CLI entry point: scan docs, run anchor checks, report, set exit code.

    Exit codes: 0 all checks passed, 1 broken anchors found, 2 fatal
    (versions.csv missing, raised inside load_versions_csv).
    """
    version_to_branch = load_versions_csv()

    # Explicit file arguments (e.g. a PR's changed files) or a full scan.
    cli_args = sys.argv[1:]
    if cli_args:
        files = [Path(arg) for arg in cli_args]
    else:
        files = list(DOCS_ROOT.rglob("*.md"))

    print(f"Scanning {len(files)} file(s) for SQL diagram remote_include tags...")
    diagram_includes = scan_files(files)
    print(f"Found {len(diagram_includes)} unique (version, diagram) pair(s).\n")

    failures = run_checks(diagram_includes, version_to_branch)

    # In CI, surface the result as a step summary and a PR-comment artifact.
    comment = format_comment(failures)
    if os.environ.get("GITHUB_ACTIONS"):
        summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
        if summary_path:
            Path(summary_path).write_text(comment, encoding="utf-8")
        Path("pr-comment.md").write_text(comment, encoding="utf-8")

    if not failures:
        print("\nAll diagram anchor checks passed.")
        sys.exit(0)

    print("\n--- Issues ---", file=sys.stderr)
    for failure in failures:
        print(f" {failure['message']}", file=sys.stderr)
        for src in failure["source_files"]:
            print(f" referenced by: {src}", file=sys.stderr)
    print(f"\nTotal: {len(failures)} broken anchor(s).", file=sys.stderr)
    sys.exit(1)
324+
325+
326+
# Script entry point; main() handles all exit codes via sys.exit.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)