
Commit 61f070a

tomd tidying and review feedback
1 parent 89e8c2e commit 61f070a

31 files changed: 1433 additions & 537 deletions

tomd/CLAUDE.md

Lines changed: 4 additions & 4 deletions
@@ -99,18 +99,18 @@ Auto-resolution via `--llm` flag is deferred to v2. For v1, the tool produces a
 ## File Map
 
 - `main.py` - CLI entry point. Argparse, glob expansion, output path logic, main(). No conversion logic.
-- `lib/__init__.py` - Package marker for shared library.
+- `lib/__init__.py` - Shared text utilities and constants for PDF and HTML converters: `ascii_escape`, `strip_format_chars`, `format_front_matter`, `ALLOWED_LINK_SCHEMES`, and shared regex patterns (`EMAIL_RE`, `DATE_RE`, `DOC_NUM_RE`, `SECTION_NUM_PREFIX_RE`).
 - `lib/similarity.py` - Dual-algorithm string similarity (SequenceMatcher + Jaccard). Per-algorithm thresholds, 200-char circuit breaker. Format-agnostic.
 - `lib/toc.py` - Table of Contents detection. Matches section texts against known headings using fuzzy similarity. Bridges small gaps. Format-agnostic - no dependency on PDF types.
 - `lib/pdf/__init__.py` - Exports `convert_pdf()`. Orchestrates the full pipeline in order. Includes monospace propagation, wording classification, and page 0 color extraction via space-color proxy.
 - `lib/pdf/wording.py` - Wording section detection via multi-signal HSV color + drawing decoration analysis. Detects ins/del markup. Confidence levels with prompts file for ambiguous cases.
-- `lib/pdf/types.py` - Data classes (`Block`, `Span`, `Line`, `Section`, `PageEdgeItem`), enums (`Confidence`, `SectionKind`), named constants, precompiled regex, `is_readable()`.
+- `lib/pdf/types.py` - Data classes (`Block`, `Span`, `Line`, `Section`, `PageEdgeItem`), enums (`Confidence`, `SectionKind`), named constants (all public, no underscore prefix), precompiled regex, `is_readable()`.
 - `lib/pdf/extract.py` - Dual extraction: `extract_mupdf()` (dict API) and `extract_spatial()` (rawdict + four spatial rules). Link collection and attachment. Calls `classify_monospace` during span construction.
 - `lib/pdf/mono.py` - Triple-signal monospace detection. Font name decomposition (strip modifiers, split camelCase, check keywords), glyph width uniformity, glyph spacing uniformity.
-- `lib/pdf/cleanup.py` - Header/footer detection (top-3/bottom-3 edge items), repeating strip, span whitespace (NBSP, multi-space on non-mono), dehyphenation, cross-page join, zero-width char strip.
+- `lib/pdf/cleanup.py` - Header/footer detection (edge items per page), repeating strip, span whitespace (NBSP, multi-space on non-mono), dehyphenation, cross-page join, hidden region detection.
 - `lib/pdf/spans.py` - Span normalization. Snaps bold/italic style boundaries to word edges. Monospace exempt.
 - `lib/pdf/table.py` - Table detection from MuPDF block/line positions. Detects columnar layout (x-gap between lines), extracts as high-confidence TABLE sections, excludes table regions from spatial path.
-- `lib/pdf/structure.py` - Dual-path comparison, metadata extraction, heading intelligence (multi-signal), position-based list detection (x-coordinates), paragraph merging, code block detection, language label detection, nesting validation.
+- `lib/pdf/structure.py` - Dual-path comparison, metadata extraction, heading intelligence (multi-signal, `heading_confidence` public), position-based list detection (x-coordinates), paragraph merging, code block detection, language label detection, nesting validation.
 - `lib/pdf/emit.py` - Markdown generation (headings, paragraphs, code blocks, tables, nested lists) with span-level formatting (inline code, bold, italic, links). Prompts file generation for uncertain regions.
 
 ## Header/Footer Stripping

tomd/lib-review.md

Lines changed: 198 additions & 0 deletions
Large diffs are not rendered by default.

tomd/lib/__init__.py

Lines changed: 72 additions & 2 deletions
@@ -1,6 +1,7 @@
-"""tomd shared library - format-agnostic modules and converter packages."""
+"""Shared text utilities and constants for PDF and HTML converters."""
 
-import html as _html_mod
+import re
+import unicodedata
 
 _NAMED_ENTITIES = {
     0xC0: "À", 0xC1: "Á", 0xC2: "Â", 0xC3: "Ã",
@@ -39,3 +40,72 @@ def ascii_escape(text: str) -> str:
         else:
             out.append(f"&#{cp};")
     return "".join(out)
+
+
+FORMAT_CHARS = frozenset(
+    chr(c) for c in range(0x110000)
+    if unicodedata.category(chr(c)) == 'Cf'
+)
+
+
+def strip_format_chars(text: str) -> str:
+    """Remove Unicode format characters (category Cf)."""
+    return "".join(c for c in text if c not in FORMAT_CHARS)
+
+
+FRONT_MATTER_ORDER = ("title", "document", "date", "audience", "reply-to")
+
+
+def _yaml_escape(s: str) -> str:
+    """Escape a string for safe inclusion in double-quoted YAML."""
+    return s.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
+
+
+def _yaml_value(key: str, val) -> str:
+    """Format a single YAML value, quoting where needed."""
+    if isinstance(val, list):
+        items = [f' - "{_yaml_escape(str(v))}"' for v in val]
+        return f"{key}:\n" + "\n".join(items)
+    val = str(val) if not isinstance(val, str) else val
+    if any(c in val for c in ':{}[]#&*?|>!%@`"\'\n\\'):
+        return f'{key}: "{_yaml_escape(val)}"'
+    return f"{key}: {val}"
+
+
+def format_front_matter(metadata: dict) -> str:
+    """Format metadata dict as YAML front matter.
+
+    Field order: title, document, date, audience, reply-to.
+    Title and values containing YAML-special characters are double-quoted
+    with backslash-escaping for embedded quotes, backslashes, and newlines.
+    Reply-to is a YAML list of double-quoted strings.
+    Returns empty string if metadata is empty.
+    """
+    if not metadata:
+        return ""
+    lines = ["---"]
+    for key in FRONT_MATTER_ORDER:
+        if key in metadata:
+            lines.append(_yaml_value(key, metadata[key]))
+    for key, val in metadata.items():
+        if key not in FRONT_MATTER_ORDER:
+            lines.append(_yaml_value(key, val))
+    lines.append("---")
+    return "\n".join(lines)
+
+
+ALLOWED_LINK_SCHEMES = frozenset({"http", "https", "mailto"})
+
+EMAIL_RE = re.compile(r"[\w.+-]+@[\w.-]+\.\w+")
+
+DATE_RE = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b")
+
+DOC_NUM_RE = re.compile(
+    r"\b([DPN]\d{3,5}R\d+)\b"
+    r"|\b([DPN]\d{3,5})\b"
+    r"|\b(N\d{3,5})\b"
+    r"|\b(SD-\d+)\b",
+    re.IGNORECASE,
+)
+
+SECTION_NUM_PREFIX_RE = re.compile(r"^\d+(?:\.\d+)*\.?\s+")
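
A minimal usage sketch of the new shared helpers (the metadata values below are hypothetical, and it assumes the repo root is on sys.path so `tomd.lib` imports as a package):

    from tomd.lib import DOC_NUM_RE, format_front_matter, strip_format_chars

    # Hypothetical metadata dict, shaped like the converters' extract_metadata() output.
    meta = {
        "title": 'A "quoted" title: colons need escaping',
        "document": "P1234R5",
        "date": "2025-01-15",
        "reply-to": ["Jane Doe <jane@example.com>"],
    }
    print(format_front_matter(meta))
    # ---
    # title: "A \"quoted\" title: colons need escaping"
    # document: P1234R5
    # date: 2025-01-15
    # reply-to:
    #  - "Jane Doe <jane@example.com>"
    # ---

    # Format characters (category Cf), e.g. zero-width space, are dropped.
    assert strip_format_chars("zero\u200bwidth") == "zerowidth"

    # DOC_NUM_RE's first alternative prefers the revisioned form when present.
    assert DOC_NUM_RE.search("see P1234R5 and SD-8").group(0) == "P1234R5"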

tomd/lib/html/__init__.py

Lines changed: 5 additions & 19 deletions
@@ -1,18 +1,20 @@
 """HTML to Markdown converter for WG21 papers."""
 
 import logging
+import os
 from pathlib import Path
 
-from .. import ascii_escape
+from .. import ascii_escape, format_front_matter
 from . import extract as _extract
 from . import render as _render
 
 _log = logging.getLogger(__name__)
 
 
-def convert_html(path: Path) -> tuple[str, str | None]:
+def convert_html(path: Path | os.PathLike[str]) -> tuple[str, str | None]:
     """Convert an HTML file to Markdown.
 
+    Reads the file as UTF-8 (with replacement for decode errors).
     Returns (markdown_text, prompts_text_or_none).
     HTML conversion produces a prompts file only when sections
     cannot be converted cleanly.
@@ -31,23 +33,7 @@ def convert_html(path: Path) -> tuple[str, str | None]:
 
     parts = []
     if metadata:
-        fm_lines = ["---"]
-        order = ["title", "document", "date", "audience", "reply-to"]
-        for key in order:
-            if key in metadata:
-                val = metadata[key]
-                if isinstance(val, list):
-                    items = [f' - "{v}"' for v in val]
-                    fm_lines.append(f"{key}:\n" + "\n".join(items))
-                elif key == "title":
-                    fm_lines.append(f'{key}: "{val}"')
-                else:
-                    fm_lines.append(f"{key}: {val}")
-        for key, val in metadata.items():
-            if key not in order:
-                fm_lines.append(f"{key}: {val}")
-        fm_lines.append("---")
-        parts.append("\n".join(fm_lines))
+        parts.append(format_front_matter(metadata))
 
     if body_md.strip():
         parts.append(body_md.strip())
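
For reference, a quick sketch of the widened entry point (file name hypothetical; `pathlib.Path` satisfies `os.PathLike[str]`, as does e.g. `os.DirEntry` from `os.scandir()`):

    from pathlib import Path
    from tomd.lib.html import convert_html  # assumes the repo root is on sys.path

    md, prompts = convert_html(Path("p1234r5.html"))
    if prompts is not None:
        print("sections needing manual review:\n" + prompts)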

tomd/lib/html/extract.py

Lines changed: 31 additions & 22 deletions
@@ -1,15 +1,13 @@
-"""HTML parsing, generator detection, and metadata extraction."""
+"""HTML parsing, generator detection, metadata extraction, and boilerplate stripping."""
 
 import logging
 import re
 
 from bs4 import BeautifulSoup, Tag
 
-_log = logging.getLogger(__name__)
+from .. import EMAIL_RE, DATE_RE, DOC_NUM_RE
 
-_EMAIL_RE = re.compile(r"[\w.+-]+@[\w.-]+\.\w+")
-_DOC_NUM_RE = re.compile(r"[DPN]\d{3,5}(?:R\d+)?", re.IGNORECASE)
-_DATE_RE = re.compile(r"\d{4}-\d{2}-\d{2}")
+_log = logging.getLogger(__name__)
 
 
 def parse_html(text: str) -> BeautifulSoup:
@@ -18,7 +16,11 @@ def parse_html(text: str) -> BeautifulSoup:
 
 
 def detect_generator(soup: BeautifulSoup) -> str:
-    """Identify which tool generated this HTML paper."""
+    """Identify which tool generated this HTML paper.
+
+    Returns one of: "mpark", "bikeshed", "hackmd", "hand-written", "unknown".
+    Checks meta generator tag first, then structural heuristics.
+    """
     for meta in soup.find_all("meta"):
         name = (meta.get("name") or "").lower()
         content = meta.get("content") or ""
@@ -42,7 +44,11 @@ def detect_generator(soup: BeautifulSoup) -> str:
 
 
 def extract_metadata(soup: BeautifulSoup, generator: str) -> dict:
-    """Extract WG21 metadata fields from the HTML."""
+    """Extract WG21 metadata fields from the HTML.
+
+    Returns a dict with possible keys: title, document, date,
+    audience, reply-to.
+    """
     if generator == "mpark":
         return _extract_mpark_metadata(soup)
     if generator == "bikeshed":
@@ -76,13 +82,13 @@ def _extract_mpark_metadata(soup: BeautifulSoup) -> dict:
 
         if "document" in label:
             text = value_cell.get_text(strip=True)
-            m = _DOC_NUM_RE.search(text)
+            m = DOC_NUM_RE.search(text)
             if m:
                 metadata["document"] = m.group(0).upper()
 
         elif label == "date":
             text = value_cell.get_text(strip=True)
-            m = _DATE_RE.search(text)
+            m = DATE_RE.search(text)
             if m:
                 metadata["date"] = m.group(0)
 
@@ -110,7 +116,7 @@ def _parse_mpark_authors(cell: Tag) -> list[str]:
         line = line.strip().strip("<>").strip()
         if not line:
             continue
-        email_match = _EMAIL_RE.search(line)
+        email_match = EMAIL_RE.search(line)
         if email_match:
             email = email_match.group(0)
             name_part = line[:email_match.start()].strip().strip("<>").strip()
@@ -125,7 +131,7 @@ def _parse_mpark_authors(cell: Tag) -> list[str]:
             authors.append(f"<{email}>")
         else:
             cleaned = re.sub(r"[<>]", "", line).strip()
-            if cleaned and not _DOC_NUM_RE.match(cleaned):
+            if cleaned and not DOC_NUM_RE.match(cleaned):
                 if pending_name:
                     authors.append(pending_name)
                 pending_name = cleaned
@@ -140,8 +146,8 @@ def _extract_bikeshed_metadata(soup: BeautifulSoup) -> dict:
 
     h1 = soup.find("h1", class_="p-name")
     if h1:
-        text = h1.get_text(strip=True)
-        m = _DOC_NUM_RE.match(text)
+        text = h1.get_text(" ", strip=True)
+        m = DOC_NUM_RE.match(text)
         if m:
             doc = m.group(0).upper()
             title = text[m.end():].strip()
@@ -154,7 +160,7 @@ def _extract_bikeshed_metadata(soup: BeautifulSoup) -> dict:
     time_tag = soup.find("time", class_="dt-updated")
     if time_tag:
         dt = time_tag.get("datetime") or time_tag.get_text(strip=True)
-        m = _DATE_RE.search(dt)
+        m = DATE_RE.search(dt)
         if m:
             metadata["date"] = m.group(0)
 
@@ -195,13 +201,13 @@ def _extract_handwritten_metadata(soup: BeautifulSoup) -> dict:
         if not line:
             continue
         if "document" in line.lower() and "number" in line.lower():
-            m = _DOC_NUM_RE.search(line)
+            m = DOC_NUM_RE.search(line)
             if m:
                 metadata["document"] = m.group(0).upper()
         elif line.lower().startswith("audience"):
             metadata["audience"] = line.split(":", 1)[-1].strip()
-        elif _DATE_RE.search(line):
-            metadata["date"] = _DATE_RE.search(line).group(0)
+        elif DATE_RE.search(line):
+            metadata["date"] = DATE_RE.search(line).group(0)
 
     for a in addr.find_all("a"):
         href = a.get("href", "")
@@ -225,11 +231,11 @@ def _extract_handwritten_metadata(soup: BeautifulSoup) -> dict:
         label = th.get_text(strip=True).rstrip(":").lower()
         value = td.get_text(strip=True)
         if "document" in label:
-            m = _DOC_NUM_RE.search(value)
+            m = DOC_NUM_RE.search(value)
             if m:
                 metadata["document"] = m.group(0).upper()
         elif "date" in label:
-            m = _DATE_RE.search(value)
+            m = DATE_RE.search(value)
             if m:
                 metadata["date"] = m.group(0)
         elif "audience" in label:
@@ -262,11 +268,11 @@ def _extract_generic_metadata(soup: BeautifulSoup) -> dict:
         label = cells[0].get_text(strip=True).rstrip(":").lower()
        value = cells[-1].get_text(strip=True)
         if "document" in label or "doc" in label:
-            m = _DOC_NUM_RE.search(value)
+            m = DOC_NUM_RE.search(value)
             if m:
                 metadata["document"] = m.group(0).upper()
         elif "date" in label:
-            m = _DATE_RE.search(value)
+            m = DATE_RE.search(value)
             if m:
                 metadata["date"] = m.group(0)
         elif "audience" in label:
@@ -276,7 +282,10 @@ def _extract_generic_metadata(soup: BeautifulSoup) -> dict:
 
 
 def strip_boilerplate(soup: BeautifulSoup, generator: str) -> list[str]:
-    """Remove non-content elements. Returns list of problem descriptions."""
+    """Remove non-content elements from `soup` in-place.
+
+    Returns list of problem descriptions.
+    """
     problems = []
 
     for tag in soup.find_all(["style", "script", "link"]):
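
A short sketch of how these pieces compose (the HTML snippet is hypothetical, and the detection heuristics beyond the meta generator check are not shown in these hunks):

    from tomd.lib.html import extract  # assumes the repo root is on sys.path

    html = (
        '<html><head><meta name="generator" content="Bikeshed 3.0"></head>'
        '<body><h1 class="p-name">P1234R5 Example Title</h1>'
        '<time class="dt-updated" datetime="2025-01-15">2025-01-15</time>'
        '</body></html>'
    )
    soup = extract.parse_html(html)
    gen = extract.detect_generator(soup)             # expected: "bikeshed"
    meta = extract.extract_metadata(soup, gen)       # expected keys: document, title, date
    problems = extract.strip_boilerplate(soup, gen)  # mutates soup; returns problem notes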
