Skip to content

Commit 5509121

Browse files
committed
Add tomd tool
1 parent c22810d commit 5509121

27 files changed

Lines changed: 2303 additions & 3 deletions

paperworks/lib/inventory.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def scan_markdown_dirs(watch_dirs):
128128
Returns dict keyed by normalized doc_number.
129129
"""
130130
papers = {}
131-
for entry in watch_dirs:
131+
for dir_idx, entry in enumerate(watch_dirs, 1):
132132
if not entry.get("enabled", True):
133133
continue
134134
dirpath = Path(entry["path"])
@@ -167,6 +167,7 @@ def scan_markdown_dirs(watch_dirs):
167167
"brutal_summary": brutal,
168168
"md_path": str(md_path),
169169
"md_mtime": md_path.stat().st_mtime,
170+
"folder_idx": dir_idx,
170171
}
171172
return papers
172173

@@ -323,6 +324,7 @@ def build_inventory(watch_dirs, output_dir, remote_papers=None):
323324
"stale_pdf": stale_pdf,
324325
"stale_remote_meta": stale_remote_meta,
325326
"warnings": warnings,
327+
"folder_idx": md.get("folder_idx") if md else None,
326328
}
327329

328330
# Group by base, keep only latest revision, attach prior revisions

paperworks/lib/templates/index.html

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@
224224
<div class="table-wrap" id="table-wrap">
225225
<table>
226226
<thead><tr>
227+
<th style="width:24px;text-align:center;cursor:pointer" onclick="toggleFolderSort()" title="Sort by folder">#</th>
227228
<th style="width:95px">Document</th>
228229
<th style="width:30px"></th>
229230
<th style="width:34px"></th>
@@ -345,7 +346,7 @@ <h2>Activity Log</h2>
345346
<div id="toast"></div>
346347

347348
<script>
348-
let _papers=[], _filter='', _workingSet=new Set(), _dirtySet=new Set(), _queueDepth=0, _expandedIdx=null;
349+
let _papers=[], _filter='', _workingSet=new Set(), _dirtySet=new Set(), _queueDepth=0, _expandedIdx=null, _folderSort=false;
349350

350351
// -- Tabs --
351352
function showTab(name) {
@@ -401,15 +402,18 @@ <h2>Activity Log</h2>
401402

402403
// -- Papers --
403404

405+
// Flip the folder-sort flag and redraw the papers table so the new order shows.
function toggleFolderSort() {
  _folderSort = !_folderSort;
  renderPapers();
}
404406
function renderPapers() {
405407
const tbody=document.getElementById('tbody'), empty=document.getElementById('empty'), tw=document.getElementById('table-wrap');
406408
const ft=_filter.toLowerCase();
407409
const indexed=_papers.map((p,i)=>({p,i})).filter(({p})=>!ft||[p.doc_number,p.title,p.audience].filter(Boolean).join(' ').toLowerCase().includes(ft));
410+
if(_folderSort) indexed.sort((a,b)=>{const fa=a.p.folder_idx||999,fb=b.p.folder_idx||999;return fa!==fb?fa-fb:0;});
408411
if(!indexed.length){tbody.innerHTML='';empty.style.display='block';tw.style.display='none';return;}
409412
empty.style.display='none';tw.style.display='block';
410413
tbody.innerHTML=indexed.map(({p,i:idx})=>{
411414
const dirty=p.doc_number&&p.pdf_path&&_dirtySet.has(p.doc_number)?'<span class="dirty-dot"></span>':'';
412415
return `<tr onclick="toggleDetail(${idx})" data-idx="${idx}">
416+
<td class="mono muted" style="width:24px;text-align:center;font-size:10px">${esc(''+(p.folder_idx||''))}</td>
413417
<td class="mono" style="padding-right:2px">${esc(p.doc_number)||'-'}${dirty}</td>
414418
<td style="width:30px;text-align:center;padding-left:0;padding-right:4px">${p.md_path?`<a class="file-btn file-btn-md" href="/api/file?path=${encodeURIComponent(p.md_path)}" target="_blank" onclick="event.stopPropagation()" title="Open Markdown">MD</a>`:''}</td>
415419
<td style="width:34px;text-align:center;padding-left:0;padding-right:4px">${p.pdf_path?`<a class="file-btn file-btn-pdf" href="/api/file?path=${encodeURIComponent(p.pdf_path)}" target="_blank" onclick="event.stopPropagation()" title="Open PDF">PDF</a>`:''}</td>
@@ -444,7 +448,7 @@ <h2>Activity Log</h2>
444448
let acts='';
445449
if(p.md_path) acts+=`<button class="btn btn-blue${wk}" onclick="event.stopPropagation();renderSingle(${idx})"${dis}>RENDER</button>`;
446450
if(p.pdf_path&&p.remote) acts+=`<button class="btn btn-blue${wk}" onclick="event.stopPropagation();submitUpload(${idx})"${dis}>UPLOAD</button>`;
447-
d.innerHTML=`<td colspan="6"><div class="detail-content"><div class="detail-grid">
451+
d.innerHTML=`<td colspan="7"><div class="detail-content"><div class="detail-grid">
448452
<span class="dl">Document</span><span class="dv">${esc(p.doc_number)||'-'}</span>
449453
<span class="dl">Title</span><span class="dv">${esc(p.title)||'-'}</span>
450454
<span class="dl">Authors</span><span class="dv">${esc(p.authors)||'-'}</span>

tomd/CLAUDE.md

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
# tomd - Agent Rules
2+
3+
## What This Is
4+
5+
tomd is a hybrid PDF-to-Markdown converter. It uses deterministic text extraction and multi-signal classification to produce Markdown, with optional LLM resolution for ambiguous sections. HTML conversion is planned as a second converter.
6+
7+
## Architecture
8+
9+
PDF -> strip headers/footers -> dual extract + links -> compare -> clean up text -> structure confident regions -> emit .md + .prompts.md
10+
11+
## Multi-Signal Confidence (Critical)
12+
13+
Never classify based on a single signal. Every structural decision (heading, paragraph, list, code, table) must consider all available signals and produce a confidence level.
14+
15+
Available signals and their reliability:
16+
- **Section numbering** (highest) - dotted decimal numbers give unambiguous depth
17+
- **Font size** (high) - relative to the most common (body) size
18+
- **Font weight/style** (medium) - bold/italic flags from font metadata
19+
- **Known section names** (high for WG21) - `Abstract`, `References`, `Wording`, etc.
20+
- **Line geometry** (medium) - length, indentation, vertical gaps
21+
- **Dual-path agreement** (high) - MuPDF and spatial rules agree on boundaries
22+
23+
When signals agree, confidence is high. When they disagree, flag for LLM review. Never silently pick one signal over another - the disagreement is the data.
24+
25+
## Preserve All Metadata
26+
27+
Never discard information from the PDF during extraction. Text is the primary output, but font size, font name, font flags, coordinates, and page boundaries are preserved as annotations. Downstream phases use this metadata for confidence scoring and LLM prompt context.
28+
29+
Discard nothing. Use everything.
30+
31+
## Dual Extraction Path
32+
33+
Every PDF page is processed through two independent extraction paths:
34+
1. **MuPDF path** - `page.get_text("dict")` for MuPDF's block/line/span grouping
35+
2. **Spatial path** - `page.get_text("rawdict")` with four coordinate rules:
36+
- Horizontal close -> same word
37+
- Horizontal far -> word break
38+
- Vertical close + left reset -> line continuation (same paragraph)
39+
- Vertical far -> paragraph break
40+
41+
Both produce the same intermediate format. Agreement = confident. Disagreement = uncertain. Never skip one path. The comparison is the confidence mechanism.
42+
43+
When paths disagree: MuPDF version goes in the output (it's more battle-tested). Both versions go in the LLM prompt for reconciliation. The prompt must require all data verbatim - the LLM fixes structure, never content.
44+
45+
## Heading Rules
46+
47+
- Heading level is derived from section numbering depth: `2.1.3` = depth 3 = `####` (depth + 1 because `#` is reserved for the document title)
48+
- Font size provides an independent heading level estimate by ranking sizes larger than body
49+
- All signals are evaluated; confidence depends on agreement count
50+
- Nesting must be validated: no heading may skip more than one level deeper than its predecessor
51+
- When signals conflict, section number wins if present; font-size ranking wins otherwise at lower confidence
52+
- Known unnumbered sections (`Abstract`, `Revision History`, `References`, `Acknowledgements`, `Motivation`, `Wording`, `Proposed Wording`, `Design Decisions`) are top-level (`##`)
53+
- The title is the first non-metadata text block before any numbered section; it must have a font size larger than the body size
54+
55+
## Honest Output
56+
57+
The tool must never silently produce bad Markdown.
58+
59+
- If a region is uncertain, emit the MuPDF version in the output marked with `<!-- tomd:uncertain:L{start}-L{end} -->`
60+
- The companion prompts file includes BOTH extraction versions, surrounding context, and all raw metadata
61+
- LLM prompts must require verbatim data preservation - the LLM fixes structure, never content
62+
- If no prompts file is needed (zero uncertain regions), don't write one.
63+
- High-confidence output should look like a human wrote the Markdown - proper heading nesting, unwrapped paragraph lines, correct list formatting, blank lines between blocks
64+
65+
## Markdown Quality
66+
67+
The output Markdown must be clean and readable:
68+
- Paragraphs are single unwrapped lines (no hard wraps from PDF line breaks)
69+
- One blank line between all block elements (paragraphs, headings, lists, code blocks)
70+
- Headings use ATX style (`##` not underlines)
71+
- Lists use the marker from the source when detectable (`-`, `*`, `1.`)
72+
- No trailing whitespace on lines
73+
- No redundant blank lines (max one between blocks)
74+
- Dehyphenate broken words across lines (`imple-` + `mentation` -> `implementation`)
75+
- Join paragraphs that span page breaks (no terminal punctuation + lowercase continuation = same paragraph)
76+
- Hyperlinks become `[text](url)` - only http, https, mailto schemes
77+
- WG21 metadata block becomes YAML front matter
78+
- Collapse multiple spaces, replace non-breaking spaces, normalize whitespace
79+
80+
## LLM Integration (v2)
81+
82+
Auto-resolution via `--llm` flag is deferred to v2. For v1, the tool produces a companion `.prompts.md` file that the user feeds to any LLM manually. The prompts file is plain Markdown - usable by any LLM, any interface.
83+
84+
## File Map
85+
86+
- `main.py` - CLI entry point. Argparse, glob expansion, output path logic, main(). No conversion logic.
87+
- `lib/__init__.py` - Empty package marker.
88+
- `lib/similarity.py` - Dual-algorithm string similarity (SequenceMatcher + Jaccard). Per-algorithm thresholds, 200-char circuit breaker. Format-agnostic.
89+
- `lib/toc.py` - Table of Contents detection. Matches section texts against known headings using fuzzy similarity. Finds runs of 3+ consecutive matches. Format-agnostic - no dependency on PDF types.
90+
- `lib/pdf/__init__.py` - Exports `convert_pdf()`. Wires the pipeline: cleanup -> extract -> structure -> emit.
91+
- `lib/pdf/types.py` - Data classes (`Block`, `Span`, `Section`), confidence enum, named constants (thresholds), precompiled regex patterns. Imported by all other pdf modules.
92+
- `lib/pdf/cleanup.py` - Header/footer detection (top-3/bottom-3, runs before extraction), dehyphenation, cross-page paragraph joining, whitespace normalization (runs after comparison).
93+
- `lib/pdf/mono.py` - Triple-signal monospace detection. Font name patterns, glyph width uniformity, glyph spacing uniformity. Called during extraction; result stored on Span.
94+
- `lib/pdf/table.py` - Table detection from MuPDF block/line positions. Detects columnar layout (x-gap between lines), extracts as high-confidence TABLE sections before dual-path comparison.
95+
- `lib/pdf/extract.py` - `extract_mupdf()` and `extract_spatial()`. Both return `list[Block]`. No structuring logic.
96+
- `lib/pdf/structure.py` - Dual-path comparison, heading intelligence (multi-signal confidence), paragraph grouping, list detection, WG21 metadata block parsing.
97+
- `lib/pdf/emit.py` - Markdown generation from structured sections. Prompts file generation for uncertain regions.
98+
99+
## Header/Footer Stripping
100+
101+
Before dual extraction, scan all pages for repeating content at page edges.
102+
103+
- For each page, capture the top 3 and bottom 3 text items by y-coordinate
104+
- Compare across pages: same text at same y on 50%+ of pages = repeating = strip
105+
- Page numbers: same y, content is a bare number or "Page N" or "N of M" = strip
106+
- Running doc numbers: same y, content matches document number pattern = strip
107+
- Strip these items from page data before extraction runs. They are not content.
108+
109+
## Text Cleanup Rules
110+
111+
- **Dehyphenation**: line ends with `-`, next line starts lowercase -> join word, remove hyphen. Skip known compound prefixes (`self-`, `non-`, `well-`, `cross-`).
112+
- **Cross-page join**: last block on page N has no terminal punctuation, first block on page N+1 starts lowercase -> same paragraph, join.
113+
- **Link extraction**: collected during the dual-extraction step (Phase 2, "dual extract + links" in the Architecture pipeline) via `page.get_links()`, matched to text by bounding rect -> `[text](url)`. Only http/https/mailto.
114+
- **Whitespace**: collapse runs, replace non-breaking spaces, strip trailing.
115+
- **WG21 metadata**: Document Number / Date / Reply-to / Audience at top of page 1 -> YAML front matter.
116+
117+
## tomd-Specific Extensions
118+
119+
These extend general rules in the root CLAUDE.md with project-specific instances.
120+
121+
- `fitz.open()` must always be paired with `doc.close()` in a `finally` block. Never rely on garbage collection.
122+
- Font metadata thresholds (what counts as "larger than body," "horizontal close," etc.) must be named constants, not magic numbers scattered in code.
123+
- The four spatial rules are the foundation. Changes to their thresholds affect everything downstream. Test thoroughly.
124+
- Regex patterns for section numbers, known section names, list markers, and metadata fields must be precompiled at module level and defined in one place.

tomd/lib/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""tomd shared library - format-agnostic modules and converter packages."""
226 Bytes
Binary file not shown.
1.97 KB
Binary file not shown.
3.26 KB
Binary file not shown.

tomd/lib/pdf/__init__.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
"""PDF to Markdown converter - pipeline entry point."""
2+
3+
import logging
4+
from pathlib import Path
5+
6+
from .cleanup import _get_edge_items, detect_repeating, strip_repeating, cleanup_text
7+
from .extract import extract_mupdf, extract_spatial, collect_links, attach_links
8+
from .structure import compare_extractions, structure_sections
9+
from .table import detect_tables, exclude_table_regions
10+
from .emit import emit_markdown, emit_prompts
11+
from .types import SectionKind, is_readable
12+
from ..toc import find_toc_indices
13+
14+
_log = logging.getLogger(__name__)
15+
16+
17+
def convert_pdf(path: Path) -> tuple[str, str | None]:
    """Run the PDF-to-Markdown pipeline on a single file.

    Each page is extracted twice (``extract_mupdf`` and ``extract_spatial``)
    and links are attached to both block lists.  After the document is
    closed, repeating edge items are stripped, text is cleaned up, tables are
    pulled out of the MuPDF blocks, the two extractions are compared, the
    sections are structured, sections flagged by ``find_toc_indices`` are
    dropped, and the Markdown and prompts texts are emitted.

    Args:
        path: Location of the PDF to convert.

    Returns:
        A ``(markdown_text, prompts_text_or_none)`` pair.  ``("", None)`` is
        returned when the document has no pages or when the extracted text
        fails the ``is_readable`` check.
    """
    import fitz

    path = Path(path)
    doc = fitz.open(str(path))
    try:
        page_count = doc.page_count
        if page_count == 0:
            return "", None

        mupdf_all = []
        spatial_all = []
        edge_items_per_page = []

        for page_index in range(page_count):
            page = doc[page_index]
            height = page.rect.height

            page_mupdf = extract_mupdf(page, page_index)
            page_spatial = extract_spatial(page, page_index)

            # Edge items are captured from the MuPDF blocks before links are
            # attached to them.
            edge_items_per_page.append(
                _get_edge_items(page_mupdf, page_index, height))

            page_links = collect_links(page)
            attach_links(page_mupdf, page_links)
            attach_links(page_spatial, page_links)

            mupdf_all.extend(page_mupdf)
            spatial_all.extend(page_spatial)
    finally:
        # Always release the document handle, including on early return.
        doc.close()

    if not is_readable("\n".join(block.text for block in mupdf_all)):
        _log.warning("Extracted text is not readable (encrypted/scanned PDF?)")
        return "", None

    repeating = detect_repeating(edge_items_per_page, page_count)
    if repeating:
        _log.info("Stripping %d repeating header/footer patterns", len(repeating))
        mupdf_all = strip_repeating(mupdf_all, repeating)
        spatial_all = strip_repeating(spatial_all, repeating)

    mupdf_all = cleanup_text(mupdf_all)
    spatial_all = cleanup_text(spatial_all)

    table_sections, mupdf_all = detect_tables(mupdf_all)
    if table_sections:
        _log.info("Detected %d table(s)", len(table_sections))
        spatial_all = exclude_table_regions(spatial_all, table_sections)

    sections = compare_extractions(mupdf_all, spatial_all)

    # Splice each table in front of the first section that is on a later
    # page, or on the same page with a larger top coordinate; a table with no
    # such section is appended at the end.
    for table in table_sections:
        for pos, sec in enumerate(sections):
            on_later_page = sec.page_num > table.page_num
            if on_later_page or (
                    sec.page_num == table.page_num
                    and sec.lines
                    and table.lines
                    and sec.lines[0].bbox[1] > table.lines[0].bbox[1]):
                sections.insert(pos, table)
                break
        else:
            sections.append(table)

    metadata, sections = structure_sections(sections)

    # The first line of every section is what gets matched against the
    # heading texts when looking for table-of-contents entries.
    first_lines = [sec.text.split("\n")[0].strip() for sec in sections]
    heading_texts = {
        line
        for line, sec in zip(first_lines, sections)
        if sec.kind == SectionKind.HEADING
    }
    toc_indices = find_toc_indices(first_lines, heading_texts)
    if toc_indices:
        sections = [
            sec for pos, sec in enumerate(sections) if pos not in toc_indices
        ]

    return emit_markdown(metadata, sections), emit_prompts(metadata, sections)
3.11 KB
Binary file not shown.
7.38 KB
Binary file not shown.

0 commit comments

Comments
 (0)