Skip to content

Commit 4c90fa9

Browse files
author
ebembi-crdb
committed
Address review: use GitHub Contents API and html.parser for anchor extraction
- Replace the raw.githubusercontent.com fetch with the GitHub Contents API (GET /repos/.../contents/...?ref=) so that GITHUB_TOKEN properly raises the rate limit and works for private repos; add a fallback via download_url for files larger than 1 MB.
- Replace the fragile id= regex with the stdlib html.parser (_IDCollector) for robust attribute extraction across edge-case HTML.
- URL-encode the path and ref components in the new fetch helper.
1 parent 148e88f commit 4c90fa9

1 file changed

Lines changed: 66 additions & 12 deletions

File tree

.github/scripts/validate_diagram_anchors.py

Lines changed: 66 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,21 +24,25 @@
2424
2 fatal error (versions.csv not found)
2525
2626
Environment:
27-
GITHUB_TOKEN Optional. Raises API rate limit from 60 to 5000 req/hr.
27+
GITHUB_TOKEN Optional. Raises GitHub API rate limit from 60 to 5000 req/hr.
2828
GITHUB_ACTIONS Set automatically in CI. Enables pr-comment.md output.
2929
"""
3030

31+
import base64
3132
import csv
33+
import json
3234
import os
3335
import re
3436
import sys
3537
import urllib.error
38+
import urllib.parse
3639
import urllib.request
40+
from html.parser import HTMLParser
3741
from pathlib import Path
3842
from typing import Optional
3943

4044
GENERATED_DIAGRAMS_REPO = "cockroachdb/generated-diagrams"
41-
RAW_BASE = "https://raw.githubusercontent.com"
45+
GITHUB_API_BASE = "https://api.github.com"
4246
VERSIONS_CSV = Path("src/current/_data/versions.csv")
4347
DOCS_ROOT = Path("src/current")
4448

@@ -59,20 +63,51 @@
5963
# HTTP
6064
# ---------------------------------------------------------------------------
6165

62-
def _fetch_raw(url: str) -> Optional[str]:
66+
def _fetch_github_content(repo: str, path: str, ref: str) -> Optional[str]:
    """Fetch a repo file's text via the GitHub Contents API.

    Uses GET /repos/{repo}/contents/{path}?ref={ref} so that GITHUB_TOKEN
    both raises the API rate limit and authenticates against private
    repos. For files larger than 1 MB the API returns the metadata but
    omits the inline base64 payload, so we fall back to the
    ``download_url`` field it provides.

    Args:
        repo: "owner/name" repository slug.
        path: File path within the repo (slashes are preserved when
            URL-encoding).
        ref: Branch, tag, or commit SHA.

    Returns:
        The file contents decoded as UTF-8 (undecodable bytes replaced),
        or None when the file does not exist (HTTP 404) or a non-HTTP
        error occurred (a warning is printed to stderr).

    Raises:
        urllib.error.HTTPError: for HTTP errors other than 404 (e.g.
            403 rate limiting), so the failure is loud rather than
            silently skipped.
    """
    encoded_ref = urllib.parse.quote(ref, safe="")
    encoded_path = urllib.parse.quote(path, safe="/")
    url = (
        f"{GITHUB_API_BASE}/repos/{repo}/contents/{encoded_path}"
        f"?ref={encoded_ref}"
    )

    # Build the headers once so the same auth/versioning headers are sent
    # on the large-file fallback request too.
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    token = os.environ.get("GITHUB_TOKEN")
    if token:
        headers["Authorization"] = f"Bearer {token}"

    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=20) as resp:
            data = json.loads(resp.read().decode("utf-8"))

        # Normal case: inline base64 payload.
        if data.get("encoding") == "base64" and data.get("content"):
            return base64.b64decode(data["content"].encode()).decode(
                "utf-8", errors="replace"
            )

        # Large file (>1 MB): the inline payload is omitted; fall back to
        # the raw download_url. BUGFIX: send the auth headers here as
        # well — a bare unauthenticated request cannot read large files
        # from private repos, defeating the point of the token.
        download_url = data.get("download_url")
        if download_url:
            dl_req = urllib.request.Request(download_url, headers=headers)
            with urllib.request.urlopen(dl_req, timeout=20) as resp:
                return resp.read().decode("utf-8", errors="replace")

        return None
    except urllib.error.HTTPError as exc:
        if exc.code == 404:
            return None  # Missing file is an expected, non-fatal case.
        raise
    except Exception as exc:  # URLError, timeout, JSON decode errors, ...
        print(f"  Warning: fetch {repo}/{path}@{ref} failed: {exc}", file=sys.stderr)
        return None
77112

78113

@@ -83,15 +118,33 @@ def _fetch_raw(url: str) -> Optional[str]:
83118
_stmt_block_cache: dict[str, Optional[set]] = {}
84119

85120

121+
class _IDCollector(HTMLParser):
122+
"""Collects all id= attribute values from an HTML document."""
123+
124+
def __init__(self) -> None:
125+
super().__init__()
126+
self.ids: set[str] = set()
127+
128+
def handle_starttag(
129+
self, tag: str, attrs: list[tuple[str, Optional[str]]]
130+
) -> None:
131+
for name, value in attrs:
132+
if name == "id" and value:
133+
self.ids.add(value)
134+
135+
86136
def get_stmt_block_anchors(branch: str) -> Optional[set]:
    """Return the set of id= anchors in stmt_block.html for *branch*.

    Results are memoized in _stmt_block_cache — including the
    "not found" outcome, stored as None — so each branch is fetched
    at most once per run.
    """
    if branch in _stmt_block_cache:
        return _stmt_block_cache[branch]

    html_text = _fetch_github_content(
        GENERATED_DIAGRAMS_REPO, "grammar_svg/stmt_block.html", branch
    )
    anchors: Optional[set] = None
    if html_text is not None:
        parser = _IDCollector()
        parser.feed(html_text)
        anchors = parser.ids
    _stmt_block_cache[branch] = anchors
    return anchors
96149

97150

@@ -170,8 +223,9 @@ def run_checks(
170223
print(f"{len(known_anchors)} anchors")
171224

172225
for version, diagram, source_files in sorted(pairs):
173-
url = f"{RAW_BASE}/{GENERATED_DIAGRAMS_REPO}/{branch}/grammar_svg/{diagram}"
174-
content = _fetch_raw(url)
226+
content = _fetch_github_content(
227+
GENERATED_DIAGRAMS_REPO, f"grammar_svg/{diagram}", branch
228+
)
175229
if content is None:
176230
print(f" {diagram}: NOT FOUND in generated-diagrams (skipping)")
177231
continue

0 commit comments

Comments
 (0)