Skip to content

Commit d823340

Browse files
apartsin and claude committed
Fix 48 broken ToC links (zero-padded section numbers), add audit scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 2a0be8a commit d823340

4 files changed

Lines changed: 1541 additions & 10 deletions

File tree

scripts/audit_inline_svgs.py

Lines changed: 282 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,282 @@
1+
"""Audit all inline SVG diagrams in HTML section files."""
2+
3+
import glob
4+
import os
5+
import re
6+
import sys
7+
from collections import Counter, defaultdict
8+
from html import unescape
9+
10+
11+
# Repository root: two directory levels above this script's absolute path.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Glob patterns, anchored at ROOT, for the section files the audit scans.
PATTERNS = [
    os.path.join(ROOT, *segments)
    for segments in (
        ("part-*", "module-*", "section-*.html"),
        ("appendices", "appendix-*", "section-*.html"),
    )
]
17+
18+
19+
def find_html_files():
    """Return all paths matching PATTERNS.

    Matches are sorted within each pattern, and the patterns are visited in
    the order they appear in PATTERNS.
    """
    return [
        path
        for pattern in PATTERNS
        for path in sorted(glob.glob(pattern))
    ]
24+
25+
26+
def extract_inline_svgs(html):
    """Return a list of ``(start_index, svg_string)`` for inline ``<svg>`` elements.

    Every ``<svg`` opening tag is considered, including ones nested inside
    another SVG. An opening tag is skipped when the 200 characters before it
    end inside an unclosed ``<img ...`` tag (e.g. an SVG data URI in ``src``).

    For each remaining opening tag the matching ``</svg>`` is located by
    tracking nesting depth. If the tags are unbalanced, the span up to the
    first ``</svg>`` after the opening tag is used instead; if there is no
    closing tag at all, the SVG is not reported.

    Args:
        html: Full HTML document text.

    Returns:
        List of ``(start_index, svg_string)`` tuples in document order.
    """
    open_re = re.compile(r"<svg[\s>]", re.IGNORECASE)
    close_re = re.compile(r"</svg>", re.IGNORECASE)
    # One pattern for both tag kinds lets us walk the document once per SVG
    # using ``pos`` instead of repeatedly re-slicing ``html``.
    token_re = re.compile(r"<svg[\s>]|</svg>", re.IGNORECASE)
    img_tail_re = re.compile(r"<img[^>]*$", re.IGNORECASE)

    results = []
    for opening in open_re.finditer(html):
        start = opening.start()

        # Skip "<svg" text that sits inside an <img ...> tag's attributes.
        if img_tail_re.search(html[max(0, start - 200):start]):
            continue

        # Walk open/close tokens from this tag until depth returns to zero.
        end = None
        depth = 0
        for token in token_re.finditer(html, start):
            if token.group(0)[1] != "/":
                depth += 1
                continue
            depth -= 1
            if depth == 0:
                end = token.end()
                break

        # Unbalanced nesting: fall back to the first closing tag, regardless
        # of where in the file the scan ran out of tokens.
        if end is None:
            first_close = close_re.search(html, start)
            if first_close is not None:
                end = first_close.end()

        if end is not None:
            results.append((start, html[start:end]))
    return results
61+
62+
63+
def get_surrounding_figure(html, svg_start):
    """Return ``(figure_id, caption)`` for the ``<figure>`` enclosing an SVG.

    The 2000 characters before ``svg_start`` are scanned for ``<figure>`` /
    ``</figure>`` tags. The SVG is treated as being inside a figure only when
    the nearest such tag is an opening tag; its ``id`` attribute (if any) is
    returned. A figure that was already closed before the SVG is ignored.

    The caption is looked up first after the SVG, up to the next figure
    boundary (opening or closing ``<figure>`` tag), so long SVGs still find
    their caption and a neighbouring figure's caption is never picked up.
    When no figure tag follows at all, the search is limited to 5000
    characters. If nothing is found, the nearest ``<figcaption>`` between the
    last figure boundary and the SVG is used (caption placed above the SVG).

    Args:
        html: Full HTML document text.
        svg_start: Index in ``html`` where the SVG's opening tag begins.

    Returns:
        Tuple ``(figure_id, caption)``; either value is ``""`` when missing.
        The caption has markup removed, entities decoded and whitespace
        collapsed.
    """
    figure_tag_re = re.compile(r"<(/?)figure[^>]*>", re.IGNORECASE)
    caption_re = re.compile(
        r"<figcaption[^>]*>(.*?)</figcaption>", re.IGNORECASE | re.DOTALL
    )

    def clean(raw):
        # Drop tags, decode entities, then normalise whitespace. Stripping
        # last also removes spaces produced by entities such as &nbsp;.
        text = unescape(re.sub(r"<[^>]+>", "", raw))
        return re.sub(r"\s+", " ", text).strip()

    back_start = max(0, svg_start - 2000)
    figure_tags = list(figure_tag_re.finditer(html, back_start, svg_start))

    figure_id = ""
    caption_floor = back_start  # earliest index a "caption above" may start at
    if figure_tags:
        nearest = figure_tags[-1]
        caption_floor = nearest.end()
        if not nearest.group(1):
            # Nearest boundary is an opening tag, so the SVG lies inside it.
            # Require whitespace before "id" so data-id="..." is not matched.
            id_m = re.search(r'\sid\s*=\s*["\']([^"\']+)["\']', nearest.group(0))
            if id_m:
                figure_id = id_m.group(1)

    # Search forward, but never past the next figure boundary.
    next_tag = figure_tag_re.search(html, svg_start)
    forward_end = next_tag.start() if next_tag else svg_start + 5000

    caption = ""
    cap_m = caption_re.search(html, svg_start, forward_end)
    if cap_m:
        caption = clean(cap_m.group(1))

    if not caption:
        # Caption above the SVG: take the one closest to it.
        earlier = list(caption_re.finditer(html, caption_floor, svg_start))
        if earlier:
            caption = clean(earlier[-1].group(1))

    return figure_id, caption
90+
91+
92+
def analyze_svg(svg_str):
    """Summarise an SVG's dimensions, element counts and text labels.

    Dimensions are read from the opening ``<svg ...>`` tag only, so
    ``width``/``height`` of child elements and attributes such as
    ``stroke-width`` are not mistaken for the SVG's own size. If the string
    does not start with an ``<svg`` tag, its first 500 characters are
    searched instead.

    Args:
        svg_str: Markup of one SVG element.

    Returns:
        Tuple ``(dims, elements, text_contents)``:
          * ``dims``: ``"W x H"``, optionally followed by ``", viewBox=..."``,
            or ``"(no dimensions)"`` when nothing was found.
          * ``elements``: dict mapping tag name to its occurrence count, for
            tags that occur at least once.
          * ``text_contents``: up to five non-empty labels shorter than 200
            characters taken from ``<text>`` elements, or from ``<tspan>``
            elements when no ``<text>`` label qualified.
    """
    opening = re.match(r"\s*<svg\b[^>]*>", svg_str, re.IGNORECASE)
    head = opening.group(0) if opening else svg_str[:500]

    def attr(name):
        # The look-behind rejects suffix matches like stroke-width / data-height.
        found = re.search(
            rf'(?<![\w:.-]){name}\s*=\s*["\']([^"\']+)["\']', head, re.IGNORECASE
        )
        return found.group(1) if found else ""

    def clean(raw):
        # Drop nested markup, decode entities, then normalise whitespace.
        text = unescape(re.sub(r"<[^>]+>", "", raw))
        return re.sub(r"\s+", " ", text).strip()

    def labels_for(tag):
        # Require whitespace or ">" right after the tag name so that
        # <textPath> or self-closing <text/> are not treated as openers.
        pattern = rf"<{tag}(?:\s[^>]*)?>(.*?)</{tag}>"
        labels = []
        for found in re.finditer(pattern, svg_str, re.IGNORECASE | re.DOTALL):
            label = clean(found.group(1))
            if label and len(label) < 200:
                labels.append(label)
                if len(labels) >= 5:
                    break
        return labels

    # Dimensions
    width = attr("width")
    height = attr("height")
    viewbox = attr("viewBox")

    dims = ""
    if width and height:
        dims = f"{width} x {height}"
    if viewbox:
        dims = (dims + ", " if dims else "") + f"viewBox={viewbox}"
    if not dims:
        dims = "(no dimensions)"

    # Element counts
    elements = {}
    for tag in ("text", "rect", "circle", "ellipse", "path", "line",
                "polygon", "polyline", "g", "use"):
        count = len(re.findall(rf"<{tag}[\s>/]", svg_str, re.IGNORECASE))
        if count:
            elements[tag] = count

    # Text labels: prefer <text>, fall back to <tspan> when none qualified.
    text_contents = labels_for("text") or labels_for("tspan")

    return dims, elements, text_contents
146+
147+
148+
def chapter_key(filepath):
    """Return the chapter or appendix directory name for *filepath*.

    The path is taken relative to ROOT with forward slashes. Its second
    segment is returned, e.g. ``module-04-transformer-architecture`` for
    ``part-1-foundations/module-04-transformer-architecture/section-4.1.html``
    or ``appendix-l-langchain`` for
    ``appendices/appendix-l-langchain/section-l.1.html``. A path with a
    single segment yields that segment.
    """
    segments = os.path.relpath(filepath, ROOT).replace("\\", "/").split("/")
    # Both layouts keep the grouping directory at index 1.
    return segments[1] if len(segments) >= 2 else segments[0]
162+
163+
164+
def main():
    """Scan all section files, print an inline-SVG inventory and save it.

    The report lists every inline SVG grouped by chapter/appendix (file,
    figure id, caption, dimensions, size, element counts, text labels),
    followed by summary statistics and the 30 most frequent caption keywords.
    Everything printed through ``emit`` is also written to
    ``scripts/audit_inline_svgs_report.txt`` under ROOT.
    """
    # Force UTF-8 output with replacement for unencodable characters.
    # reconfigure() is used instead of re-wrapping sys.stdout.buffer so this
    # also works when stdout has been replaced by an object without .buffer.
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(encoding="utf-8", errors="replace")

    output_lines = []

    def emit(line=""):
        # Echo to the console and keep a copy for the saved report.
        output_lines.append(line)
        print(line)

    html_files = find_html_files()
    emit(f"Scanning {len(html_files)} HTML files for inline SVGs...")
    emit("=" * 80)

    # Collect all SVG records grouped by chapter
    by_chapter = defaultdict(list)
    total_svgs = 0

    for fpath in html_files:
        with open(fpath, "r", encoding="utf-8", errors="replace") as f:
            html = f.read()

        svgs = extract_inline_svgs(html)
        if not svgs:
            continue

        rel_path = os.path.relpath(fpath, ROOT).replace("\\", "/")
        chap = chapter_key(fpath)

        for svg_start, svg_str in svgs:
            total_svgs += 1
            fig_id, caption = get_surrounding_figure(html, svg_start)
            dims, elements, text_contents = analyze_svg(svg_str)

            by_chapter[chap].append({
                "file": rel_path,
                "figure_id": fig_id,
                "caption": caption,
                "dims": dims,
                "elements": elements,
                "texts": text_contents,
                "svg_len": len(svg_str),
            })

    # Output grouped by chapter
    emit("")
    emit("INLINE SVG INVENTORY BY CHAPTER")
    emit("=" * 80)

    caption_words = Counter()

    for chap in sorted(by_chapter):
        records = by_chapter[chap]
        emit("")
        emit(f" {chap} ({len(records)} SVG(s))")
        emit(f" {'-' * 70}")

        for rec in records:
            emit(f" File: {rec['file']}")
            if rec["figure_id"]:
                emit(f" Figure ID: {rec['figure_id']}")
            if rec["caption"]:
                emit(f" Caption: {rec['caption'][:120]}")
                # Collect caption words for the keyword summary.
                words = re.findall(r"[a-zA-Z]{3,}", rec["caption"].lower())
                caption_words.update(words)
            emit(f" Dimensions: {rec['dims']}")
            emit(f" SVG size: {rec['svg_len']:,} chars")

            # Element summary, most frequent first
            if rec["elements"]:
                parts = [
                    f"{tag}:{cnt}"
                    for tag, cnt in sorted(rec["elements"].items(), key=lambda x: -x[1])
                ]
                emit(f" Elements: {', '.join(parts)}")

            # Text labels
            if rec["texts"]:
                labels = rec["texts"][:5]
                emit(f" Text labels: {labels}")

            emit("")

    # Stats
    emit("=" * 80)
    emit("SUMMARY STATISTICS")
    emit("=" * 80)
    emit(f"Total inline SVGs found: {total_svgs}")
    emit(f"Chapters/appendices with SVGs: {len(by_chapter)}")
    emit("")

    emit("SVGs per chapter:")
    for chap in sorted(by_chapter):
        emit(f" {chap}: {len(by_chapter[chap])}")

    emit("")
    emit("Top 30 caption keywords (concept themes):")
    # Filter out common stop words
    stop = {
        "the", "and", "for", "with", "from", "that", "this", "are", "was",
        "how", "its", "can", "each", "into", "has", "between", "over",
        "figure", "fig", "diagram", "shows", "show", "illustrates",
        "through", "across", "where", "while", "using", "used", "which",
        "during", "after", "before", "about", "than", "more", "most",
        "all", "both", "their", "they", "not", "but", "when", "then",
        "also", "will", "been", "have", "does", "being", "other",
    }
    # Filter the full ranking before truncating; pre-truncating to a fixed
    # number of raw words could leave fewer than 30 keywords after filtering.
    top_words = [
        (w, c) for w, c in caption_words.most_common() if w not in stop
    ][:30]
    for word, count in top_words:
        emit(f" {word}: {count}")

    # Save report
    report_path = os.path.join(ROOT, "scripts", "audit_inline_svgs_report.txt")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("\n".join(output_lines) + "\n")
    print(f"\nReport saved to: {report_path}")
279+
280+
281+
if __name__ == "__main__":
282+
main()

0 commit comments

Comments
 (0)