cppalliance
diff --git a/‎paperworks/lib/inventory.py‎
Lines changed: 3 additions & 1 deletion b/‎paperworks/lib/inventory.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎paperworks/lib/templates/index.html‎
Lines changed: 6 additions & 2 deletions b/‎paperworks/lib/templates/index.html‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎tomd/CLAUDE.md‎
Lines changed: 124 additions & 0 deletions b/‎tomd/CLAUDE.md‎
Lines changed: 124 additions & 0 deletions
diff --git a/‎tomd/lib/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎tomd/lib/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tomd/lib/__pycache__/__init__.cpython-310.pyc‎
226 Bytes b/‎tomd/lib/__pycache__/__init__.cpython-310.pyc‎
226 Bytes
diff --git a/‎tomd/lib/__pycache__/similarity.cpython-310.pyc‎
1.97 KB b/‎tomd/lib/__pycache__/similarity.cpython-310.pyc‎
1.97 KB
diff --git a/‎tomd/lib/__pycache__/toc.cpython-310.pyc‎
3.26 KB b/‎tomd/lib/__pycache__/toc.cpython-310.pyc‎
3.26 KB
diff --git a/‎tomd/lib/pdf/__init__.py‎
Lines changed: 104 additions & 0 deletions b/‎tomd/lib/pdf/__init__.py‎
Lines changed: 104 additions & 0 deletions
diff --git a/‎tomd/lib/pdf/__pycache__/__init__.cpython-310.pyc‎
3.11 KB b/‎tomd/lib/pdf/__pycache__/__init__.cpython-310.pyc‎
3.11 KB
diff --git a/‎tomd/lib/pdf/__pycache__/cleanup.cpython-310.pyc‎
7.38 KB b/‎tomd/lib/pdf/__pycache__/cleanup.cpython-310.pyc‎
7.38 KB
@@ -128,7 +128,7 @@ def scan_markdown_dirs(watch_dirs):
     Returns dict keyed by normalized doc_number.
     """
     papers = {}
-    for entry in watch_dirs:
+    for dir_idx, entry in enumerate(watch_dirs, 1):
         if not entry.get("enabled", True):
             continue
         dirpath = Path(entry["path"])
@@ -167,6 +167,7 @@ def scan_markdown_dirs(watch_dirs):
                 "brutal_summary": brutal,
                 "md_path": str(md_path),
                 "md_mtime": md_path.stat().st_mtime,
+                "folder_idx": dir_idx,
             }
     return papers
 
@@ -323,6 +324,7 @@ def build_inventory(watch_dirs, output_dir, remote_papers=None):
             "stale_pdf": stale_pdf,
             "stale_remote_meta": stale_remote_meta,
             "warnings": warnings,
+            "folder_idx": md.get("folder_idx") if md else None,
         }
 
     # Group by base, keep only latest revision, attach prior revisions
 
@@ -224,6 +224,7 @@
     <div class="table-wrap" id="table-wrap">
       <table>
         <thead><tr>
+          <th style="width:24px;text-align:center;cursor:pointer" onclick="toggleFolderSort()" title="Sort by folder">#</th>
           <th style="width:95px">Document</th>
           <th style="width:30px"></th>
           <th style="width:34px"></th>
@@ -345,7 +346,7 @@ <h2>Activity Log</h2>
 <div id="toast"></div>
 
 <script>
-let _papers=[], _filter='', _workingSet=new Set(), _dirtySet=new Set(), _queueDepth=0, _expandedIdx=null;
+let _papers=[], _filter='', _workingSet=new Set(), _dirtySet=new Set(), _queueDepth=0, _expandedIdx=null, _folderSort=false;
 
 // -- Tabs --
 function showTab(name) {
@@ -401,15 +402,18 @@ <h2>Activity Log</h2>
 
 // -- Papers --
 
+// Flip the _folderSort flag and redraw the papers table; renderPapers()
+// orders rows by folder_idx while the flag is set.
+function toggleFolderSort(){_folderSort=!_folderSort;renderPapers();}
 function renderPapers() {
   const tbody=document.getElementById('tbody'), empty=document.getElementById('empty'), tw=document.getElementById('table-wrap');
   const ft=_filter.toLowerCase();
   const indexed=_papers.map((p,i)=>({p,i})).filter(({p})=>!ft||[p.doc_number,p.title,p.audience].filter(Boolean).join(' ').toLowerCase().includes(ft));
+  if(_folderSort) indexed.sort((a,b)=>{const fa=a.p.folder_idx||999,fb=b.p.folder_idx||999;return fa!==fb?fa-fb:0;});
   if(!indexed.length){tbody.innerHTML='';empty.style.display='block';tw.style.display='none';return;}
   empty.style.display='none';tw.style.display='block';
   tbody.innerHTML=indexed.map(({p,i:idx})=>{
     const dirty=p.doc_number&&p.pdf_path&&_dirtySet.has(p.doc_number)?'<span class="dirty-dot"></span>':'';
     return `<tr onclick="toggleDetail(${idx})" data-idx="${idx}">
+      <td class="mono muted" style="width:24px;text-align:center;font-size:10px">${esc(''+(p.folder_idx||''))}</td>
       <td class="mono" style="padding-right:2px">${esc(p.doc_number)||'-'}${dirty}</td>
       <td style="width:30px;text-align:center;padding-left:0;padding-right:4px">${p.md_path?`<a class="file-btn file-btn-md" href="/api/file?path=${encodeURIComponent(p.md_path)}" target="_blank" onclick="event.stopPropagation()" title="Open Markdown">MD</a>`:''}</td>
       <td style="width:34px;text-align:center;padding-left:0;padding-right:4px">${p.pdf_path?`<a class="file-btn file-btn-pdf" href="/api/file?path=${encodeURIComponent(p.pdf_path)}" target="_blank" onclick="event.stopPropagation()" title="Open PDF">PDF</a>`:''}</td>
@@ -444,7 +448,7 @@ <h2>Activity Log</h2>
   let acts='';
   if(p.md_path) acts+=`<button class="btn btn-blue${wk}" onclick="event.stopPropagation();renderSingle(${idx})"${dis}>RENDER</button>`;
   if(p.pdf_path&&p.remote) acts+=`<button class="btn btn-blue${wk}" onclick="event.stopPropagation();submitUpload(${idx})"${dis}>UPLOAD</button>`;
-  d.innerHTML=`<td colspan="6"><div class="detail-content"><div class="detail-grid">
+  d.innerHTML=`<td colspan="7"><div class="detail-content"><div class="detail-grid">
     <span class="dl">Document</span><span class="dv">${esc(p.doc_number)||'-'}</span>
     <span class="dl">Title</span><span class="dv">${esc(p.title)||'-'}</span>
     <span class="dl">Authors</span><span class="dv">${esc(p.authors)||'-'}</span>
 
@@ -0,0 +1,124 @@
+# tomd - Agent Rules
+
+## What This Is
+
+tomd is a hybrid PDF-to-Markdown converter. It uses deterministic text extraction and multi-signal classification to produce Markdown, with optional LLM resolution for ambiguous sections. HTML conversion is planned as a second converter.
+
+## Architecture
+
+PDF -> strip headers/footers -> dual extract + links -> compare -> clean up text -> structure confident regions -> emit .md + .prompts.md
+
+## Multi-Signal Confidence (Critical)
+
+Never classify based on a single signal. Every structural decision (heading, paragraph, list, code, table) must consider all available signals and produce a confidence level.
+
+Available signals and their reliability:
+- **Section numbering** (highest) - dotted decimal numbers give unambiguous depth
+- **Font size** (high) - relative to the most common (body) size
+- **Font weight/style** (medium) - bold/italic flags from font metadata
+- **Known section names** (high for WG21) - `Abstract`, `References`, `Wording`, etc.
+- **Line geometry** (medium) - length, indentation, vertical gaps
+- **Dual-path agreement** (high) - MuPDF and spatial rules agree on boundaries
+
+When signals agree, confidence is high. When they disagree, flag for LLM review. Never silently pick one signal over another - the disagreement is the data.
+
+## Preserve All Metadata
+
+Never discard information from the PDF during extraction. Text is the primary output, but font size, font name, font flags, coordinates, and page boundaries are preserved as annotations. Downstream phases use this metadata for confidence scoring and LLM prompt context.
+
+Discard nothing. Use everything.
+
+## Dual Extraction Path
+
+Every PDF page is processed through two independent extraction paths:
+1. **MuPDF path** - `page.get_text("dict")` for MuPDF's block/line/span grouping
+2. **Spatial path** - `page.get_text("rawdict")` with four coordinate rules:
+   - Horizontal close -> same word
+   - Horizontal far -> word break
+   - Vertical close + left reset -> line continuation (same paragraph)
+   - Vertical far -> paragraph break
+
+Both produce the same intermediate format. Agreement = confident. Disagreement = uncertain. Never skip one path. The comparison is the confidence mechanism.
+
+When paths disagree: MuPDF version goes in the output (it's more battle-tested). Both versions go in the LLM prompt for reconciliation. The prompt must require all data verbatim - the LLM fixes structure, never content.
+
+## Heading Rules
+
+- Heading level is derived from section numbering depth: `2.1.3` = depth 3 = `####` (depth + 1 because `#` is reserved for the document title)
+- Font size provides an independent heading level estimate by ranking sizes larger than body
+- All signals are evaluated; confidence depends on agreement count
+- Nesting must be validated: no heading may skip more than one level deeper than its predecessor
+- When signals conflict, section number wins if present; font-size ranking wins otherwise at lower confidence
+- Known unnumbered sections (`Abstract`, `Revision History`, `References`, `Acknowledgements`, `Motivation`, `Wording`, `Proposed Wording`, `Design Decisions`) are top-level (`##`)
+- Title is the first non-metadata text block before any numbered section, must have font size larger than body
+
+## Honest Output
+
+The tool must never silently produce bad Markdown.
+
+- If a region is uncertain, emit the MuPDF version in the output marked with `<!-- tomd:uncertain:L{start}-L{end} -->`
+- The companion prompts file includes BOTH extraction versions, surrounding context, and all raw metadata
+- LLM prompts must require verbatim data preservation - the LLM fixes structure, never content
+- If no prompts file is needed (zero uncertain regions), don't write one.
+- High-confidence output should look like a human wrote the Markdown - proper heading nesting, unwrapped paragraph lines, correct list formatting, blank lines between blocks
+
+## Markdown Quality
+
+The output Markdown must be clean and readable:
+- Paragraphs are single unwrapped lines (no hard wraps from PDF line breaks)
+- One blank line between all block elements (paragraphs, headings, lists, code blocks)
+- Headings use ATX style (`##` not underlines)
+- Lists use the marker from the source when detectable (`-`, `*`, `1.`)
+- No trailing whitespace on lines
+- No redundant blank lines (max one between blocks)
+- Dehyphenate broken words across lines (`imple-` + `mentation` -> `implementation`)
+- Join paragraphs that span page breaks (no terminal punctuation + lowercase continuation = same paragraph)
+- Hyperlinks become `[text](url)` - only http, https, mailto schemes
+- WG21 metadata block becomes YAML front matter
+- Collapse multiple spaces, replace non-breaking spaces, normalize whitespace
+
+## LLM Integration (v2)
+
+Auto-resolution via `--llm` flag is deferred to v2. For v1, the tool produces a companion `.prompts.md` file that the user feeds to any LLM manually. The prompts file is plain Markdown - usable by any LLM, any interface.
+
+## File Map
+
+- `main.py` - CLI entry point. Argparse, glob expansion, output path logic, main(). No conversion logic.
+- `lib/__init__.py` - Package marker; contains only a module docstring.
+- `lib/similarity.py` - Dual-algorithm string similarity (SequenceMatcher + Jaccard). Per-algorithm thresholds, 200-char circuit breaker. Format-agnostic.
+- `lib/toc.py` - Table of Contents detection. Matches section texts against known headings using fuzzy similarity. Finds runs of 3+ consecutive matches. Format-agnostic - no dependency on PDF types.
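+- `lib/pdf/__init__.py` - Exports `convert_pdf()`. Wires the pipeline: extract (both paths) + links -> strip repeating headers/footers -> clean up text -> detect tables -> compare -> structure -> drop TOC -> emit.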
+- `lib/pdf/types.py` - Data classes (`Block`, `Span`, `Section`), confidence enum, named constants (thresholds), precompiled regex patterns. Imported by all other pdf modules.
+- `lib/pdf/cleanup.py` - Header/footer detection (top-3/bottom-3, runs before extraction), dehyphenation, cross-page paragraph joining, whitespace normalization (runs after comparison).
+- `lib/pdf/mono.py` - Triple-signal monospace detection. Font name patterns, glyph width uniformity, glyph spacing uniformity. Called during extraction; result stored on Span.
+- `lib/pdf/table.py` - Table detection from MuPDF block/line positions. Detects columnar layout (x-gap between lines), extracts as high-confidence TABLE sections before dual-path comparison.
+- `lib/pdf/extract.py` - `extract_mupdf()` and `extract_spatial()`. Both return `list[Block]`. No structuring logic.
+- `lib/pdf/structure.py` - Dual-path comparison, heading intelligence (multi-signal confidence), paragraph grouping, list detection, WG21 metadata block parsing.
+- `lib/pdf/emit.py` - Markdown generation from structured sections. Prompts file generation for uncertain regions.
+
+## Header/Footer Stripping
+
+Before dual extraction, scan all pages for repeating content at page edges.
+
+- For each page, capture the top 3 and bottom 3 text items by y-coordinate
+- Compare across pages: same text at same y on 50%+ of pages = repeating = strip
+- Page numbers: same y, content is a bare number or "Page N" or "N of M" = strip
+- Running doc numbers: same y, content matches document number pattern = strip
+- Strip these items from page data before extraction runs. They are not content.
+
+## Text Cleanup Rules
+
+- **Dehyphenation**: line ends with `-`, next line starts lowercase -> join word, remove hyphen. Skip known compound prefixes (`self-`, `non-`, `well-`, `cross-`).
+- **Cross-page join**: last block on page N has no terminal punctuation, first block on page N+1 starts lowercase -> same paragraph, join.
+- **Link extraction**: collected during Phase 2 (the "dual extract + links" step of the pipeline) via `page.get_links()`, matched to text by bounding rect -> `[text](url)`. Only http/https/mailto.
+- **Whitespace**: collapse runs, replace non-breaking spaces, strip trailing.
+- **WG21 metadata**: Document Number / Date / Reply-to / Audience at top of page 1 -> YAML front matter.
+
+## tomd-Specific Extensions
+
+These extend general rules in the root CLAUDE.md with project-specific instances.
+
+- `fitz.open()` must always be paired with `doc.close()` in a `finally` block. Never rely on garbage collection.
+- Font metadata thresholds (what counts as "larger than body," "horizontal close," etc.) must be named constants, not magic numbers scattered in code.
+- The four spatial rules are the foundation. Changes to their thresholds affect everything downstream. Test thoroughly.
+- Regex patterns for section numbers, known section names, list markers, and metadata fields must be precompiled at module level and defined in one place.
@@ -0,0 +1 @@
+"""tomd shared library - format-agnostic modules and converter packages."""
@@ -0,0 +1,104 @@
+"""PDF to Markdown converter - pipeline entry point."""
+
+import logging
+from pathlib import Path
+
+from .cleanup import _get_edge_items, detect_repeating, strip_repeating, cleanup_text
+from .extract import extract_mupdf, extract_spatial, collect_links, attach_links
+from .structure import compare_extractions, structure_sections
+from .table import detect_tables, exclude_table_regions
+from .emit import emit_markdown, emit_prompts
+from .types import SectionKind, is_readable
+from ..toc import find_toc_indices
+
+_log = logging.getLogger(__name__)
+
+
+def convert_pdf(path: Path) -> tuple[str, str | None]:
+    """Convert a PDF file to Markdown.
+
+    Runs both extraction paths on every page, strips repeating
+    header/footer items, cleans up text, pulls out tables, compares the
+    two extractions into sections, structures them, drops detected
+    table-of-contents sections, and emits the result.
+
+    Args:
+        path: Location of the PDF; coerced with ``Path(path)``.
+
+    Returns:
+        ``(markdown_text, prompts_text_or_none)``. ``("", None)`` is
+        returned when the document has no pages or when ``is_readable``
+        rejects the extracted MuPDF text.
+    """
+    # Function-scope import. NOTE(review): presumably so that importing
+    # this package does not itself require PyMuPDF - confirm.
+    import fitz
+
+    path = Path(path)
+    doc = fitz.open(str(path))
+    # Everything that touches `doc` lives inside this try; the finally
+    # clause closes the document however the block exits.
+    try:
+        page_count = doc.page_count
+        if page_count == 0:
+            return "", None
+
+        # Accumulators across all pages. The two block lists are flat;
+        # all_edge_items receives one entry per page.
+        all_mupdf_blocks = []
+        all_spatial_blocks = []
+        all_edge_items = []
+
+        for pg_num in range(page_count):
+            page = doc[pg_num]
+            page_height = page.rect.height
+
+            # Both extraction paths run on every page.
+            mupdf_blocks = extract_mupdf(page, pg_num)
+            spatial_blocks = extract_spatial(page, pg_num)
+
+            # Header/footer candidates are taken from the MuPDF blocks only.
+            edge_items = _get_edge_items(mupdf_blocks, pg_num, page_height)
+            all_edge_items.append(edge_items)
+
+            # The same link list is attached to both extraction results.
+            links = collect_links(page)
+            attach_links(mupdf_blocks, links)
+            attach_links(spatial_blocks, links)
+
+            all_mupdf_blocks.extend(mupdf_blocks)
+            all_spatial_blocks.extend(spatial_blocks)
+    finally:
+        doc.close()
+
+    # Readability is judged on the MuPDF text alone; an unreadable
+    # document yields empty output rather than an exception.
+    mupdf_text = "\n".join(b.text for b in all_mupdf_blocks)
+    if not is_readable(mupdf_text):
+        _log.warning("Extracted text is not readable (encrypted/scanned PDF?)")
+        return "", None
+
+    # Repeating patterns are detected once from the per-page edge items
+    # and then stripped from both block lists.
+    repeating = detect_repeating(all_edge_items, page_count)
+    if repeating:
+        _log.info("Stripping %d repeating header/footer patterns", len(repeating))
+        all_mupdf_blocks = strip_repeating(all_mupdf_blocks, repeating)
+        all_spatial_blocks = strip_repeating(all_spatial_blocks, repeating)
+
+    all_mupdf_blocks = cleanup_text(all_mupdf_blocks)
+    all_spatial_blocks = cleanup_text(all_spatial_blocks)
+
+    # detect_tables returns the table sections plus a replacement MuPDF
+    # block list (presumably with the table blocks removed - confirm).
+    # The spatial blocks are filtered against the same table sections.
+    table_sections, all_mupdf_blocks = detect_tables(all_mupdf_blocks)
+    if table_sections:
+        _log.info("Detected %d table(s)", len(table_sections))
+        all_spatial_blocks = exclude_table_regions(
+            all_spatial_blocks, table_sections)
+
+    sections = compare_extractions(all_mupdf_blocks, all_spatial_blocks)
+
+    # Put each table section back into the section list. It goes before
+    # the first section that is on a later page, or on the same page with
+    # a larger bbox[1] on its first line (presumably the top y coordinate,
+    # i.e. lower on the page - confirm). With no such section it is
+    # appended at the end.
+    for ts in table_sections:
+        inserted = False
+        for i, sec in enumerate(sections):
+            if sec.page_num > ts.page_num:
+                sections.insert(i, ts)
+                inserted = True
+                break
+            if (sec.page_num == ts.page_num and sec.lines
+                    and ts.lines
+                    and sec.lines[0].bbox[1] > ts.lines[0].bbox[1]):
+                sections.insert(i, ts)
+                inserted = True
+                break
+        if not inserted:
+            sections.append(ts)
+
+    metadata, sections = structure_sections(sections)
+
+    # TOC detection compares the first line of every section against the
+    # first lines of sections classified as headings; sections at the
+    # returned indices are dropped.
+    texts = [sec.text.split("\n")[0].strip() for sec in sections]
+    heading_texts = {sec.text.split("\n")[0].strip()
+                     for sec in sections if sec.kind == SectionKind.HEADING}
+    toc_indices = find_toc_indices(texts, heading_texts)
+    if toc_indices:
+        sections = [s for i, s in enumerate(sections) if i not in toc_indices]
+
+    md = emit_markdown(metadata, sections)
+    prompts = emit_prompts(metadata, sections)
+
+    return md, prompts
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+"""tomd shared library - format-agnostic modules and converter packages."""`