
Commit 61f070a

tomd tidying and review feedback
1 parent 89e8c2e commit 61f070a

31 files changed: 1433 additions & 537 deletions

tomd/CLAUDE.md

Lines changed: 4 additions & 4 deletions
@@ -99,18 +99,18 @@ Auto-resolution via `--llm` flag is deferred to v2. For v1, the tool produces a
 ## File Map
 
 - `main.py` - CLI entry point. Argparse, glob expansion, output path logic, main(). No conversion logic.
-- `lib/__init__.py` - Package marker for shared library.
+- `lib/__init__.py` - Shared text utilities and constants for PDF and HTML converters: `ascii_escape`, `strip_format_chars`, `format_front_matter`, `ALLOWED_LINK_SCHEMES`, and shared regex patterns (`EMAIL_RE`, `DATE_RE`, `DOC_NUM_RE`, `SECTION_NUM_PREFIX_RE`).
 - `lib/similarity.py` - Dual-algorithm string similarity (SequenceMatcher + Jaccard). Per-algorithm thresholds, 200-char circuit breaker. Format-agnostic.
 - `lib/toc.py` - Table of Contents detection. Matches section texts against known headings using fuzzy similarity. Bridges small gaps. Format-agnostic - no dependency on PDF types.
 - `lib/pdf/__init__.py` - Exports `convert_pdf()`. Orchestrates the full pipeline in order. Includes monospace propagation, wording classification, and page 0 color extraction via space-color proxy.
 - `lib/pdf/wording.py` - Wording section detection via multi-signal HSV color + drawing decoration analysis. Detects ins/del markup. Confidence levels with prompts file for ambiguous cases.
-- `lib/pdf/types.py` - Data classes (`Block`, `Span`, `Line`, `Section`, `PageEdgeItem`), enums (`Confidence`, `SectionKind`), named constants, precompiled regex, `is_readable()`.
+- `lib/pdf/types.py` - Data classes (`Block`, `Span`, `Line`, `Section`, `PageEdgeItem`), enums (`Confidence`, `SectionKind`), named constants (all public, no underscore prefix), precompiled regex, `is_readable()`.
 - `lib/pdf/extract.py` - Dual extraction: `extract_mupdf()` (dict API) and `extract_spatial()` (rawdict + four spatial rules). Link collection and attachment. Calls `classify_monospace` during span construction.
 - `lib/pdf/mono.py` - Triple-signal monospace detection. Font name decomposition (strip modifiers, split camelCase, check keywords), glyph width uniformity, glyph spacing uniformity.
-- `lib/pdf/cleanup.py` - Header/footer detection (top-3/bottom-3 edge items), repeating strip, span whitespace (NBSP, multi-space on non-mono), dehyphenation, cross-page join, zero-width char strip.
+- `lib/pdf/cleanup.py` - Header/footer detection (edge items per page), repeating strip, span whitespace (NBSP, multi-space on non-mono), dehyphenation, cross-page join, hidden region detection.
 - `lib/pdf/spans.py` - Span normalization. Snaps bold/italic style boundaries to word edges. Monospace exempt.
 - `lib/pdf/table.py` - Table detection from MuPDF block/line positions. Detects columnar layout (x-gap between lines), extracts as high-confidence TABLE sections, excludes table regions from spatial path.
-- `lib/pdf/structure.py` - Dual-path comparison, metadata extraction, heading intelligence (multi-signal), position-based list detection (x-coordinates), paragraph merging, code block detection, language label detection, nesting validation.
+- `lib/pdf/structure.py` - Dual-path comparison, metadata extraction, heading intelligence (multi-signal, `heading_confidence` public), position-based list detection (x-coordinates), paragraph merging, code block detection, language label detection, nesting validation.
 - `lib/pdf/emit.py` - Markdown generation (headings, paragraphs, code blocks, tables, nested lists) with span-level formatting (inline code, bold, italic, links). Prompts file generation for uncertain regions.
 
 ## Header/Footer Stripping

tomd/lib-review.md

Lines changed: 198 additions & 0 deletions
Large diffs are not rendered by default.

tomd/lib/__init__.py

Lines changed: 72 additions & 2 deletions
@@ -1,6 +1,7 @@
-"""tomd shared library - format-agnostic modules and converter packages."""
+"""Shared text utilities and constants for PDF and HTML converters."""
 
-import html as _html_mod
+import re
+import unicodedata
 
 _NAMED_ENTITIES = {
     0xC0: "À", 0xC1: "Á", 0xC2: "Â", 0xC3: "Ã",
@@ -39,3 +40,72 @@ def ascii_escape(text: str) -> str:
         else:
             out.append(f"&#{cp};")
     return "".join(out)
+
+
+FORMAT_CHARS = frozenset(
+    chr(c) for c in range(0x110000)
+    if unicodedata.category(chr(c)) == 'Cf'
+)
+
+
+def strip_format_chars(text: str) -> str:
+    """Remove Unicode format characters (category Cf)."""
+    return "".join(c for c in text if c not in FORMAT_CHARS)
+
+
+FRONT_MATTER_ORDER = ("title", "document", "date", "audience", "reply-to")
+
+
+def _yaml_escape(s: str) -> str:
+    """Escape a string for safe inclusion in double-quoted YAML."""
+    return s.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
+
+
+def _yaml_value(key: str, val) -> str:
+    """Format a single YAML value, quoting where needed."""
+    if isinstance(val, list):
+        items = [f' - "{_yaml_escape(str(v))}"' for v in val]
+        return f"{key}:\n" + "\n".join(items)
+    val = str(val) if not isinstance(val, str) else val
+    if any(c in val for c in ':{}[]#&*?|>!%@`"\'\n\\'):
+        return f'{key}: "{_yaml_escape(val)}"'
+    return f"{key}: {val}"
+
+
+def format_front_matter(metadata: dict) -> str:
+    """Format metadata dict as YAML front matter.
+
+    Field order: title, document, date, audience, reply-to.
+    Title and values containing YAML-special characters are double-quoted
+    with backslash-escaping for embedded quotes, backslashes, and newlines.
+    Reply-to is a YAML list of double-quoted strings.
+    Returns empty string if metadata is empty.
+    """
+    if not metadata:
+        return ""
+    lines = ["---"]
+    for key in FRONT_MATTER_ORDER:
+        if key in metadata:
+            lines.append(_yaml_value(key, metadata[key]))
+    for key, val in metadata.items():
+        if key not in FRONT_MATTER_ORDER:
+            lines.append(_yaml_value(key, val))
+    lines.append("---")
+    return "\n".join(lines)
+
+
+ALLOWED_LINK_SCHEMES = frozenset({"http", "https", "mailto"})
+
+EMAIL_RE = re.compile(r"[\w.+-]+@[\w.-]+\.\w+")
+
+DATE_RE = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b")
+
+DOC_NUM_RE = re.compile(
+    r"\b([DPN]\d{3,5}R\d+)\b"
+    r"|\b([DPN]\d{3,5})\b"
+    r"|\b(N\d{3,5})\b"
+    r"|\b(SD-\d+)\b",
+    re.IGNORECASE,
+)
+
+SECTION_NUM_PREFIX_RE = re.compile(r"^\d+(?:\.\d+)*\.?\s+")
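
A minimal usage sketch of the new shared helpers (the metadata values below are hypothetical, and it assumes the repo root is on sys.path so `tomd.lib` imports as a package):

    from tomd.lib import DOC_NUM_RE, format_front_matter, strip_format_chars

    # Hypothetical metadata dict, shaped like the converters' extract_metadata() output.
    meta = {
        "title": 'A "quoted" title: colons need escaping',
        "document": "P1234R5",
        "date": "2025-01-15",
        "reply-to": ["Jane Doe <jane@example.com>"],
    }
    print(format_front_matter(meta))
    # ---
    # title: "A \"quoted\" title: colons need escaping"
    # document: P1234R5
    # date: 2025-01-15
    # reply-to:
    #  - "Jane Doe <jane@example.com>"
    # ---

    # Format characters (category Cf), e.g. zero-width space, are dropped.
    assert strip_format_chars("zero\u200bwidth") == "zerowidth"

    # DOC_NUM_RE's first alternative prefers the revisioned form when present.
    assert DOC_NUM_RE.search("see P1234R5 and SD-8").group(0) == "P1234R5"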

tomd/lib/html/__init__.py

Lines changed: 5 additions & 19 deletions
@@ -1,18 +1,20 @@
 """HTML to Markdown converter for WG21 papers."""
 
 import logging
+import os
 from pathlib import Path
 
-from .. import ascii_escape
+from .. import ascii_escape, format_front_matter
 from . import extract as _extract
 from . import render as _render
 
 _log = logging.getLogger(__name__)
 
 
-def convert_html(path: Path) -> tuple[str, str | None]:
+def convert_html(path: Path | os.PathLike[str]) -> tuple[str, str | None]:
     """Convert an HTML file to Markdown.
 
+    Reads the file as UTF-8 (with replacement for decode errors).
     Returns (markdown_text, prompts_text_or_none).
     HTML conversion produces a prompts file only when sections
     cannot be converted cleanly.
@@ -31,23 +33,7 @@ def convert_html(path: Path) -> tuple[str, str | None]:
 
     parts = []
     if metadata:
-        fm_lines = ["---"]
-        order = ["title", "document", "date", "audience", "reply-to"]
-        for key in order:
-            if key in metadata:
-                val = metadata[key]
-                if isinstance(val, list):
-                    items = [f' - "{v}"' for v in val]
-                    fm_lines.append(f"{key}:\n" + "\n".join(items))
-                elif key == "title":
-                    fm_lines.append(f'{key}: "{val}"')
-                else:
-                    fm_lines.append(f"{key}: {val}")
-        for key, val in metadata.items():
-            if key not in order:
-                fm_lines.append(f"{key}: {val}")
-        fm_lines.append("---")
-        parts.append("\n".join(fm_lines))
+        parts.append(format_front_matter(metadata))
 
     if body_md.strip():
         parts.append(body_md.strip())
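
For reference, a quick sketch of the widened entry point (file name hypothetical; `pathlib.Path` satisfies `os.PathLike[str]`, as does e.g. `os.DirEntry` from `os.scandir()`):

    from pathlib import Path
    from tomd.lib.html import convert_html  # assumes the repo root is on sys.path

    md, prompts = convert_html(Path("p1234r5.html"))
    if prompts is not None:
        print("sections needing manual review:\n" + prompts)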

tomd/lib/html/extract.py

Lines changed: 31 additions & 22 deletions
@@ -1,15 +1,13 @@
-"""HTML parsing, generator detection, and metadata extraction."""
+"""HTML parsing, generator detection, metadata extraction, and boilerplate stripping."""
 
 import logging
 import re
 
 from bs4 import BeautifulSoup, Tag
 
-_log = logging.getLogger(__name__)
+from .. import EMAIL_RE, DATE_RE, DOC_NUM_RE
 
-_EMAIL_RE = re.compile(r"[\w.+-]+@[\w.-]+\.\w+")
-_DOC_NUM_RE = re.compile(r"[DPN]\d{3,5}(?:R\d+)?", re.IGNORECASE)
-_DATE_RE = re.compile(r"\d{4}-\d{2}-\d{2}")
+_log = logging.getLogger(__name__)
 
 
 def parse_html(text: str) -> BeautifulSoup:
@@ -18,7 +16,11 @@ def parse_html(text: str) -> BeautifulSoup:
 
 
 def detect_generator(soup: BeautifulSoup) -> str:
-    """Identify which tool generated this HTML paper."""
+    """Identify which tool generated this HTML paper.
+
+    Returns one of: "mpark", "bikeshed", "hackmd", "hand-written", "unknown".
+    Checks meta generator tag first, then structural heuristics.
+    """
     for meta in soup.find_all("meta"):
         name = (meta.get("name") or "").lower()
         content = meta.get("content") or ""
@@ -42,7 +44,11 @@ def detect_generator(soup: BeautifulSoup) -> str:
 
 
 def extract_metadata(soup: BeautifulSoup, generator: str) -> dict:
-    """Extract WG21 metadata fields from the HTML."""
+    """Extract WG21 metadata fields from the HTML.
+
+    Returns a dict with possible keys: title, document, date,
+    audience, reply-to.
+    """
     if generator == "mpark":
         return _extract_mpark_metadata(soup)
     if generator == "bikeshed":
@@ -76,13 +82,13 @@ def _extract_mpark_metadata(soup: BeautifulSoup) -> dict:
 
         if "document" in label:
             text = value_cell.get_text(strip=True)
-            m = _DOC_NUM_RE.search(text)
+            m = DOC_NUM_RE.search(text)
             if m:
                 metadata["document"] = m.group(0).upper()
 
         elif label == "date":
             text = value_cell.get_text(strip=True)
-            m = _DATE_RE.search(text)
+            m = DATE_RE.search(text)
             if m:
                 metadata["date"] = m.group(0)
 
@@ -110,7 +116,7 @@ def _parse_mpark_authors(cell: Tag) -> list[str]:
         line = line.strip().strip("<>").strip()
         if not line:
             continue
-        email_match = _EMAIL_RE.search(line)
+        email_match = EMAIL_RE.search(line)
         if email_match:
             email = email_match.group(0)
             name_part = line[:email_match.start()].strip().strip("<>").strip()
@@ -125,7 +131,7 @@ def _parse_mpark_authors(cell: Tag) -> list[str]:
             authors.append(f"<{email}>")
         else:
             cleaned = re.sub(r"[<>]", "", line).strip()
-            if cleaned and not _DOC_NUM_RE.match(cleaned):
+            if cleaned and not DOC_NUM_RE.match(cleaned):
                 if pending_name:
                     authors.append(pending_name)
                 pending_name = cleaned
@@ -140,8 +146,8 @@ def _extract_bikeshed_metadata(soup: BeautifulSoup) -> dict:
 
     h1 = soup.find("h1", class_="p-name")
     if h1:
-        text = h1.get_text(strip=True)
-        m = _DOC_NUM_RE.match(text)
+        text = h1.get_text(" ", strip=True)
+        m = DOC_NUM_RE.match(text)
         if m:
             doc = m.group(0).upper()
             title = text[m.end():].strip()
@@ -154,7 +160,7 @@ def _extract_bikeshed_metadata(soup: BeautifulSoup) -> dict:
     time_tag = soup.find("time", class_="dt-updated")
     if time_tag:
         dt = time_tag.get("datetime") or time_tag.get_text(strip=True)
-        m = _DATE_RE.search(dt)
+        m = DATE_RE.search(dt)
         if m:
             metadata["date"] = m.group(0)
 
@@ -195,13 +201,13 @@ def _extract_handwritten_metadata(soup: BeautifulSoup) -> dict:
         if not line:
             continue
         if "document" in line.lower() and "number" in line.lower():
-            m = _DOC_NUM_RE.search(line)
+            m = DOC_NUM_RE.search(line)
             if m:
                 metadata["document"] = m.group(0).upper()
         elif line.lower().startswith("audience"):
             metadata["audience"] = line.split(":", 1)[-1].strip()
-        elif _DATE_RE.search(line):
-            metadata["date"] = _DATE_RE.search(line).group(0)
+        elif DATE_RE.search(line):
+            metadata["date"] = DATE_RE.search(line).group(0)
 
     for a in addr.find_all("a"):
         href = a.get("href", "")
@@ -225,11 +231,11 @@ def _extract_handwritten_metadata(soup: BeautifulSoup) -> dict:
         label = th.get_text(strip=True).rstrip(":").lower()
         value = td.get_text(strip=True)
         if "document" in label:
-            m = _DOC_NUM_RE.search(value)
+            m = DOC_NUM_RE.search(value)
             if m:
                 metadata["document"] = m.group(0).upper()
         elif "date" in label:
-            m = _DATE_RE.search(value)
+            m = DATE_RE.search(value)
             if m:
                 metadata["date"] = m.group(0)
         elif "audience" in label:
@@ -262,11 +268,11 @@ def _extract_generic_metadata(soup: BeautifulSoup) -> dict:
         label = cells[0].get_text(strip=True).rstrip(":").lower()
        value = cells[-1].get_text(strip=True)
         if "document" in label or "doc" in label:
-            m = _DOC_NUM_RE.search(value)
+            m = DOC_NUM_RE.search(value)
             if m:
                 metadata["document"] = m.group(0).upper()
         elif "date" in label:
-            m = _DATE_RE.search(value)
+            m = DATE_RE.search(value)
             if m:
                 metadata["date"] = m.group(0)
         elif "audience" in label:
@@ -276,7 +282,10 @@ def _extract_generic_metadata(soup: BeautifulSoup) -> dict:
 
 
 def strip_boilerplate(soup: BeautifulSoup, generator: str) -> list[str]:
-    """Remove non-content elements. Returns list of problem descriptions."""
+    """Remove non-content elements from `soup` in-place.
+
+    Returns list of problem descriptions.
+    """
     problems = []
 
     for tag in soup.find_all(["style", "script", "link"]):
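
A short sketch of how these pieces compose (the HTML snippet is hypothetical, and the detection heuristics beyond the meta generator check are not shown in these hunks):

    from tomd.lib.html import extract  # assumes the repo root is on sys.path

    html = (
        '<html><head><meta name="generator" content="Bikeshed 3.0"></head>'
        '<body><h1 class="p-name">P1234R5 Example Title</h1>'
        '<time class="dt-updated" datetime="2025-01-15">2025-01-15</time>'
        '</body></html>'
    )
    soup = extract.parse_html(html)
    gen = extract.detect_generator(soup)             # expected: "bikeshed"
    meta = extract.extract_metadata(soup, gen)       # expected keys: document, title, date
    problems = extract.strip_boilerplate(soup, gen)  # mutates soup; returns problem notes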
