Skip to content

Commit ab65a76

Browse files
committed
Add tomd tool
1 parent c22810d commit ab65a76

22 files changed

Lines changed: 3684 additions & 3 deletions

paperworks/lib/inventory.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def scan_markdown_dirs(watch_dirs):
128128
Returns dict keyed by normalized doc_number.
129129
"""
130130
papers = {}
131-
for entry in watch_dirs:
131+
for dir_idx, entry in enumerate(watch_dirs, 1):
132132
if not entry.get("enabled", True):
133133
continue
134134
dirpath = Path(entry["path"])
@@ -167,6 +167,7 @@ def scan_markdown_dirs(watch_dirs):
167167
"brutal_summary": brutal,
168168
"md_path": str(md_path),
169169
"md_mtime": md_path.stat().st_mtime,
170+
"folder_idx": dir_idx,
170171
}
171172
return papers
172173

@@ -323,6 +324,7 @@ def build_inventory(watch_dirs, output_dir, remote_papers=None):
323324
"stale_pdf": stale_pdf,
324325
"stale_remote_meta": stale_remote_meta,
325326
"warnings": warnings,
327+
"folder_idx": md.get("folder_idx") if md else None,
326328
}
327329

328330
# Group by base, keep only latest revision, attach prior revisions

paperworks/lib/templates/index.html

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@
224224
<div class="table-wrap" id="table-wrap">
225225
<table>
226226
<thead><tr>
227+
<th style="width:24px;text-align:center;cursor:pointer" onclick="toggleFolderSort()" title="Sort by folder">#</th>
227228
<th style="width:95px">Document</th>
228229
<th style="width:30px"></th>
229230
<th style="width:34px"></th>
@@ -345,7 +346,7 @@ <h2>Activity Log</h2>
345346
<div id="toast"></div>
346347

347348
<script>
348-
let _papers=[], _filter='', _workingSet=new Set(), _dirtySet=new Set(), _queueDepth=0, _expandedIdx=null;
349+
let _papers=[], _filter='', _workingSet=new Set(), _dirtySet=new Set(), _queueDepth=0, _expandedIdx=null, _folderSort=false;
349350

350351
// -- Tabs --
351352
function showTab(name) {
@@ -401,15 +402,18 @@ <h2>Activity Log</h2>
401402

402403
// -- Papers --
403404

405+
// Flip the folder-sort flag (wired to the "#" column header's onclick)
// and redraw the papers table so the new ordering takes effect.
function toggleFolderSort() {
  _folderSort = !_folderSort;
  renderPapers();
}
404406
function renderPapers() {
405407
const tbody=document.getElementById('tbody'), empty=document.getElementById('empty'), tw=document.getElementById('table-wrap');
406408
const ft=_filter.toLowerCase();
407409
const indexed=_papers.map((p,i)=>({p,i})).filter(({p})=>!ft||[p.doc_number,p.title,p.audience].filter(Boolean).join(' ').toLowerCase().includes(ft));
410+
if(_folderSort) indexed.sort((a,b)=>{const fa=a.p.folder_idx||999,fb=b.p.folder_idx||999;return fa!==fb?fa-fb:0;});
408411
if(!indexed.length){tbody.innerHTML='';empty.style.display='block';tw.style.display='none';return;}
409412
empty.style.display='none';tw.style.display='block';
410413
tbody.innerHTML=indexed.map(({p,i:idx})=>{
411414
const dirty=p.doc_number&&p.pdf_path&&_dirtySet.has(p.doc_number)?'<span class="dirty-dot"></span>':'';
412415
return `<tr onclick="toggleDetail(${idx})" data-idx="${idx}">
416+
<td class="mono muted" style="width:24px;text-align:center;font-size:10px">${esc(''+(p.folder_idx||''))}</td>
413417
<td class="mono" style="padding-right:2px">${esc(p.doc_number)||'-'}${dirty}</td>
414418
<td style="width:30px;text-align:center;padding-left:0;padding-right:4px">${p.md_path?`<a class="file-btn file-btn-md" href="/api/file?path=${encodeURIComponent(p.md_path)}" target="_blank" onclick="event.stopPropagation()" title="Open Markdown">MD</a>`:''}</td>
415419
<td style="width:34px;text-align:center;padding-left:0;padding-right:4px">${p.pdf_path?`<a class="file-btn file-btn-pdf" href="/api/file?path=${encodeURIComponent(p.pdf_path)}" target="_blank" onclick="event.stopPropagation()" title="Open PDF">PDF</a>`:''}</td>
@@ -444,7 +448,7 @@ <h2>Activity Log</h2>
444448
let acts='';
445449
if(p.md_path) acts+=`<button class="btn btn-blue${wk}" onclick="event.stopPropagation();renderSingle(${idx})"${dis}>RENDER</button>`;
446450
if(p.pdf_path&&p.remote) acts+=`<button class="btn btn-blue${wk}" onclick="event.stopPropagation();submitUpload(${idx})"${dis}>UPLOAD</button>`;
447-
d.innerHTML=`<td colspan="6"><div class="detail-content"><div class="detail-grid">
451+
d.innerHTML=`<td colspan="7"><div class="detail-content"><div class="detail-grid">
448452
<span class="dl">Document</span><span class="dv">${esc(p.doc_number)||'-'}</span>
449453
<span class="dl">Title</span><span class="dv">${esc(p.title)||'-'}</span>
450454
<span class="dl">Authors</span><span class="dv">${esc(p.authors)||'-'}</span>

tomd/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
**/__pycache__/
2+
**/.pytest_cache/
3+
*.pyc

tomd/CLAUDE.md

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
# tomd - Agent Rules
2+
3+
## What This Is
4+
5+
tomd is a hybrid PDF-to-Markdown converter. It uses deterministic text extraction and multi-signal classification to produce Markdown, with optional LLM resolution for ambiguous sections. HTML conversion is planned as a second converter.
6+
7+
## Architecture
8+
9+
Pipeline execution order:
10+
11+
1. Per-page: dual extract (MuPDF + spatial) + edge items + link collection
12+
2. Close document
13+
3. Readability check (early exit if garbage)
14+
4. Header/footer detection and stripping (both paths)
15+
5. Text cleanup: NBSP, whitespace, dehyphenation, cross-page join (both paths)
16+
6. Span normalization: snap bold/italic boundaries to word edges (both paths)
17+
7. Table detection from MuPDF block positions; exclude table regions from spatial
18+
8. Dual-path comparison -> Sections (confident or uncertain per page)
19+
9. Merge table sections into position
20+
10. Structure: metadata extraction, heading/list/paragraph classification, position-based list detection, paragraph merging, code block detection, language label stripping, nesting validation
21+
11. TOC stripping (fuzzy match against headings)
22+
12. Emit .md + optional .prompts.md
23+
24+
## Multi-Signal Confidence (Critical)
25+
26+
Never classify based on a single signal. Every structural decision (heading, paragraph, list, code, table) must consider all available signals and produce a confidence level.
27+
28+
Available signals and their reliability:
29+
- **Section numbering** (highest) - dotted decimal numbers give unambiguous depth
30+
- **Font size** (high) - relative to the most common (body) size
31+
- **Font weight/style** (medium) - bold/italic flags from font metadata
32+
- **Known section names** (high for WG21) - `Abstract`, `References`, `Wording`, etc.
33+
- **Line geometry** (medium) - length, indentation, vertical gaps
34+
- **Dual-path agreement** (high) - MuPDF and spatial rules agree on boundaries
35+
36+
When signals agree, confidence is high. When they disagree, flag for LLM review. Never silently pick one signal over another - the disagreement is the data.
37+
38+
## Preserve All Metadata
39+
40+
Never discard information from the PDF during extraction. Text is the primary output, but font size, font name, font flags, coordinates, and page boundaries are preserved as annotations. Downstream phases use this metadata for confidence scoring and LLM prompt context.
41+
42+
Discard nothing. Use everything.
43+
44+
## Dual Extraction Path
45+
46+
Every PDF page is processed through two independent extraction paths:
47+
1. **MuPDF path** - `page.get_text("dict")` for MuPDF's block/line/span grouping
48+
2. **Spatial path** - `page.get_text("rawdict")` with four coordinate rules:
49+
- Horizontal close -> same word
50+
- Horizontal far -> word break
51+
- Vertical close + left reset -> line continuation (same paragraph)
52+
- Vertical far -> paragraph break
53+
54+
Both produce the same intermediate format. Agreement = confident. Disagreement = uncertain. Never skip one path. The comparison is the confidence mechanism.
55+
56+
When paths disagree: MuPDF version goes in the output (it's more battle-tested). Both versions go in the LLM prompt for reconciliation. The prompt must require all data verbatim - the LLM fixes structure, never content.
57+
58+
## Heading Rules
59+
60+
- Heading level is derived from section numbering depth: `2.1.3` = depth 3 = `####` (depth + 1 because `#` is reserved for the document title)
61+
- Font size provides an independent heading level estimate by ranking sizes larger than body
62+
- All signals are evaluated; confidence depends on agreement count
63+
- Nesting must be validated: no heading may be more than one level deeper than its predecessor (no skipped levels)
64+
- When signals conflict, section number wins if present; font-size ranking wins otherwise at lower confidence
65+
- Known unnumbered sections (`Abstract`, `Revision History`, `References`, `Acknowledgements`, `Motivation`, `Wording`, `Proposed Wording`, `Design Decisions`) are top-level (`##`)
66+
- Title is the first non-metadata text block before any numbered section, and it must have a font size larger than body
67+
68+
## Honest Output
69+
70+
The tool must never silently produce bad Markdown.
71+
72+
- If a region is uncertain, emit the MuPDF version in the output marked with `<!-- tomd:uncertain:L{start}-L{end} -->`
73+
- The companion prompts file includes BOTH extraction versions, surrounding context, and all raw metadata
74+
- LLM prompts must require verbatim data preservation - the LLM fixes structure, never content
75+
- If no prompts file is needed (zero uncertain regions), don't write one.
76+
- High-confidence output should look like a human wrote the Markdown - proper heading nesting, unwrapped paragraph lines, correct list formatting, blank lines between blocks
77+
78+
## Markdown Quality
79+
80+
The output Markdown must be clean and readable:
81+
- Paragraphs are single unwrapped lines (no hard wraps from PDF line breaks)
82+
- One blank line between all block elements (paragraphs, headings, lists, code blocks)
83+
- Headings use ATX style (`##` not underlines)
84+
- Lists use the marker from the source when detectable (`-`, `*`, `1.`)
85+
- No trailing whitespace on lines
86+
- No redundant blank lines (max one between blocks)
87+
- Dehyphenate broken words across lines (`imple-` + `mentation` -> `implementation`)
88+
- Join paragraphs that span page breaks (no terminal punctuation + lowercase continuation = same paragraph)
89+
- Hyperlinks become `[text](url)` - only http, https, mailto schemes
90+
- WG21 metadata block becomes YAML front matter
91+
- Collapse multiple spaces, replace non-breaking spaces, normalize whitespace
92+
93+
## LLM Integration (v2)
94+
95+
Auto-resolution via `--llm` flag is deferred to v2. For v1, the tool produces a companion `.prompts.md` file that the user feeds to any LLM manually. The prompts file is plain Markdown - usable by any LLM, any interface.
96+
97+
## File Map
98+
99+
- `main.py` - CLI entry point. Argparse, glob expansion, output path logic, main(). No conversion logic.
100+
- `lib/__init__.py` - Package marker for shared library.
101+
- `lib/similarity.py` - Dual-algorithm string similarity (SequenceMatcher + Jaccard). Per-algorithm thresholds, 200-char circuit breaker. Format-agnostic.
102+
- `lib/toc.py` - Table of Contents detection. Matches section texts against known headings using fuzzy similarity. Bridges small gaps. Format-agnostic - no dependency on PDF types.
103+
- `lib/pdf/__init__.py` - Exports `convert_pdf()`. Orchestrates the full pipeline in order.
104+
- `lib/pdf/types.py` - Data classes (`Block`, `Span`, `Line`, `Section`, `PageEdgeItem`), enums (`Confidence`, `SectionKind`), named constants, precompiled regex, `is_readable()`.
105+
- `lib/pdf/extract.py` - Dual extraction: `extract_mupdf()` (dict API) and `extract_spatial()` (rawdict + four spatial rules). Link collection and attachment. Calls `classify_monospace` during span construction.
106+
- `lib/pdf/mono.py` - Triple-signal monospace detection. Font name decomposition (strip modifiers, split camelCase, check keywords), glyph width uniformity, glyph spacing uniformity.
107+
- `lib/pdf/cleanup.py` - Header/footer detection (top-3/bottom-3 edge items), repeating strip, span whitespace (NBSP, multi-space on non-mono), dehyphenation, cross-page join, zero-width char strip.
108+
- `lib/pdf/spans.py` - Span normalization. Snaps bold/italic style boundaries to word edges. Monospace exempt.
109+
- `lib/pdf/table.py` - Table detection from MuPDF block/line positions. Detects columnar layout (x-gap between lines), extracts as high-confidence TABLE sections, excludes table regions from spatial path.
110+
- `lib/pdf/structure.py` - Dual-path comparison, metadata extraction, heading intelligence (multi-signal), position-based list detection (x-coordinates), paragraph merging, code block detection, language label detection, nesting validation.
111+
- `lib/pdf/emit.py` - Markdown generation (headings, paragraphs, code blocks, tables, nested lists) with span-level formatting (inline code, bold, italic, links). Prompts file generation for uncertain regions.
112+
113+
## Header/Footer Stripping
114+
115+
After per-page extraction and before dual-path comparison (pipeline step 4), scan all pages for repeating content at page edges.
116+
117+
- For each page, capture the top 3 and bottom 3 text items by y-coordinate
118+
- Compare across pages: same text at same y on 50%+ of pages = repeating = strip
119+
- Page numbers: same y, content is a bare number or "Page N" or "N of M" = strip
120+
- Running doc numbers: same y, content matches document number pattern = strip
121+
- Strip these items from both extraction paths before text cleanup and dual-path comparison run. They are not content.
122+
123+
## Text Cleanup Rules
124+
125+
- **Dehyphenation**: line ends with `-`, next line starts lowercase -> join word, remove hyphen. Skip known compound prefixes (`self-`, `non-`, `well-`, `cross-`).
126+
- **Cross-page join**: last block on page N has no terminal punctuation, first block on page N+1 starts lowercase -> same paragraph, join.
127+
- **Link extraction**: collected during per-page extraction (pipeline step 1) via `page.get_links()`, matched to text by bounding rect -> `[text](url)`. Only http/https/mailto.
128+
- **Whitespace**: collapse runs, replace non-breaking spaces, strip trailing.
129+
- **WG21 metadata**: Document Number / Date / Reply-to / Audience at top of page 1 -> YAML front matter.
130+
131+
## tomd-Specific Extensions
132+
133+
These extend general rules in the root CLAUDE.md with project-specific instances.
134+
135+
- `fitz.open()` must always be paired with `doc.close()` in a `finally` block. Never rely on garbage collection.
136+
- Font metadata thresholds (what counts as "larger than body," "horizontal close," etc.) must be named constants, not magic numbers scattered in code.
137+
- The four spatial rules are the foundation. Changes to their thresholds affect everything downstream. Test thoroughly.
138+
- Regex patterns for section numbers, known section names, list markers, and metadata fields must be precompiled at module level and defined in one place.

0 commit comments

Comments
 (0)