    2   fatal error (versions.csv not found)

Environment:
-    GITHUB_TOKEN     Optional. Raises API rate limit from 60 to 5000 req/hr.
+    GITHUB_TOKEN     Optional. Raises GitHub API rate limit from 60 to 5000 req/hr.
    GITHUB_ACTIONS   Set automatically in CI. Enables pr-comment.md output.
"""

+import base64
import csv
+import json
import os
import re
import sys
import urllib.error
+import urllib.parse
import urllib.request
+from html.parser import HTMLParser
from pathlib import Path
from typing import Optional

GENERATED_DIAGRAMS_REPO = "cockroachdb/generated-diagrams"
-RAW_BASE = "https://raw.githubusercontent.com"
+GITHUB_API_BASE = "https://api.github.com"
VERSIONS_CSV = Path("src/current/_data/versions.csv")
DOCS_ROOT = Path("src/current")

# HTTP
# ---------------------------------------------------------------------------

-def _fetch_raw(url: str) -> Optional[str]:
+def _fetch_github_content(repo: str, path: str, ref: str) -> Optional[str]:
+    """Fetch a file from GitHub using the Contents API.
+
+    Uses the REST API endpoint so that GITHUB_TOKEN properly raises rate
+    limits and authenticates against private repos. Falls back to the
+    download_url for files larger than 1 MB (the API returns the field but
+    omits the base64 payload in that case).
+    """
+    encoded_ref = urllib.parse.quote(ref, safe="")
+    encoded_path = urllib.parse.quote(path, safe="/")
+    url = (
+        f"{GITHUB_API_BASE}/repos/{repo}/contents/{encoded_path}"
+        f"?ref={encoded_ref}"
+    )
+
    req = urllib.request.Request(url)
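+    # Ask for the JSON representation and pin the GitHub REST API version.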
82+ req .add_header ("Accept" , "application/vnd.github+json" )
83+ req .add_header ("X-GitHub-Api-Version" , "2022-11-28" )
6484 token = os .environ .get ("GITHUB_TOKEN" )
6585 if token :
6686 req .add_header ("Authorization" , f"Bearer { token } " )
87+
6788 try :
6889 with urllib .request .urlopen (req , timeout = 20 ) as resp :
69- return resp .read ().decode ("utf-8" , errors = "replace" )
90+ data = json .loads (resp .read ().decode ())
91+
92+ # Normal case: inline base64 payload
93+ if data .get ("encoding" ) == "base64" and data .get ("content" ):
94+ return base64 .b64decode (data ["content" ].encode ()).decode (
95+ "utf-8" , errors = "replace"
96+ )
97+
98+ # Large file (>1 MB): fall back to the raw download_url
99+ download_url = data .get ("download_url" )
100+ if download_url :
101+ with urllib .request .urlopen (download_url , timeout = 20 ) as resp :
102+ return resp .read ().decode ("utf-8" , errors = "replace" )
103+
104+ return None
70105 except urllib .error .HTTPError as exc :
71106 if exc .code == 404 :
72107 return None
73108 raise
74109 except Exception as exc :
75- print (f" Warning: fetch { url } failed: { exc } " , file = sys .stderr )
110+ print (f" Warning: fetch { repo } / { path } @ { ref } failed: { exc } " , file = sys .stderr )
76111 return None
77112
78113
@@ -83,15 +118,33 @@ def _fetch_raw(url: str) -> Optional[str]:
_stmt_block_cache: dict[str, Optional[set]] = {}

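+# Parse ids with HTMLParser instead of a regex so only real id= attributes
+# on start tags are collected.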
+class _IDCollector(HTMLParser):
+    """Collects all id= attribute values from an HTML document."""
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.ids: set[str] = set()
+
+    def handle_starttag(
+        self, tag: str, attrs: list[tuple[str, Optional[str]]]
+    ) -> None:
+        for name, value in attrs:
+            if name == "id" and value:
+                self.ids.add(value)
+
+
def get_stmt_block_anchors(branch: str) -> Optional[set]:
    """Return all id= values in stmt_block.html for the given branch."""
    if branch not in _stmt_block_cache:
-        url = f"{RAW_BASE}/{GENERATED_DIAGRAMS_REPO}/{branch}/grammar_svg/stmt_block.html"
-        content = _fetch_raw(url)
-        _stmt_block_cache[branch] = (
-            set(re.findall(r'\bid=["\']([^"\']+)["\']', content))
-            if content is not None else None
+        content = _fetch_github_content(
+            GENERATED_DIAGRAMS_REPO, "grammar_svg/stmt_block.html", branch
        )
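+        # Cache None when the file cannot be fetched so the branch is not
+        # retried on every lookup.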
+        if content is None:
+            _stmt_block_cache[branch] = None
+        else:
+            collector = _IDCollector()
+            collector.feed(content)
+            _stmt_block_cache[branch] = collector.ids
    return _stmt_block_cache[branch]

@@ -170,8 +223,9 @@ def run_checks(
    print(f"{len(known_anchors)} anchors")

    for version, diagram, source_files in sorted(pairs):
-        url = f"{RAW_BASE}/{GENERATED_DIAGRAMS_REPO}/{branch}/grammar_svg/{diagram}"
-        content = _fetch_raw(url)
+        content = _fetch_github_content(
+            GENERATED_DIAGRAMS_REPO, f"grammar_svg/{diagram}", branch
+        )
        if content is None:
            print(f" {diagram}: NOT FOUND in generated-diagrams (skipping)")
            continue