Skip to content

Commit e04fa1e

Browse files
committed
Update paperworks handling of revisions
1 parent b49bc6d commit e04fa1e

4 files changed

Lines changed: 107 additions & 5 deletions

File tree

paperworks/lib/inventory.py

Lines changed: 15 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -237,27 +237,38 @@ def build_inventory(watch_dirs, output_dir, remote_papers=None):
237237
if base:
238238
md_by_base[f"{base}R{rev}"] = v
239239

240-
pdf_by_base = {}
240+
# PDF filenames are revision-ambiguous; index by base only
241+
pdf_by_base_only = {}
241242
for k, v in pdf_papers.items():
242243
_, base, rev = _parse_doc_number(k)
243244
if base:
244-
pdf_by_base[f"{base}R{rev}"] = v
245+
pdf_by_base_only[base] = v
245246

246247
all_keys = set()
247248
all_keys.update(md_by_base.keys())
248-
all_keys.update(pdf_by_base.keys())
249249
all_keys.update(remote_by_base.keys())
250250

251+
# Add PDF-only keys (orphans with no markdown or remote)
252+
bases_covered = {
253+
_parse_doc_number(k)[1] for k in all_keys
254+
if _parse_doc_number(k)[1]
255+
}
256+
for k, v in pdf_papers.items():
257+
_, base, rev = _parse_doc_number(k)
258+
if base and base not in bases_covered:
259+
all_keys.add(f"{base}R{rev}")
260+
251261
records = {}
252262
for key in all_keys:
253263
md = md_by_base.get(key)
254-
pdf = pdf_by_base.get(key)
255264
remote = remote_by_base.get(key)
256265

257266
full, base, rev = _parse_doc_number(key)
258267
if not base:
259268
continue
260269

270+
pdf = pdf_by_base_only.get(base)
271+
261272
# Markdown is source of truth for metadata
262273
title = (md or {}).get("title") or (pdf or {}).get("title") or (remote or {}).get("title", "")
263274
authors = (md or {}).get("authors") or (pdf or {}).get("authors") or (remote or {}).get("author", "")

paperworks/lib/pdf_reader.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515
)
1616

1717
_DOC_FIELD_RE = re.compile(
18-
r"Document\s+Number[:\s]+([DPN]\d{3,5}(?:R\d+)?|N\d{3,5})",
18+
r"Document(?:\s+Number)?[:\s]+([DPN]\d{3,5}(?:R\d+)?|N\d{3,5})",
1919
re.IGNORECASE,
2020
)
2121

paperworks/tests/test_inventory.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
"""Tests for inventory.build_inventory merge logic."""
2+
3+
import unittest
4+
from unittest.mock import patch
5+
6+
7+
def _md(doc, title="Test", rev=0, base="P9999"):
8+
return {
9+
"doc_number": doc, "base": base, "revision": rev,
10+
"title": title, "authors": "A", "date": "2026-01-01",
11+
"audience": "LEWG", "intent": "ask", "brutal_summary": "Test.",
12+
"md_path": f"/src/{doc.lower()}.md", "md_mtime": 1000.0,
13+
"folder_idx": 1,
14+
}
15+
16+
17+
def _pdf(doc, rev=0, base="P9999"):
18+
return {
19+
"doc_number": doc, "base": base, "revision": rev,
20+
"title": "", "authors": "", "date": "", "audience": "",
21+
"brutal_summary": None,
22+
"pdf_path": f"/out/{doc.lower()}.pdf", "pdf_mtime": 2000.0,
23+
}
24+
25+
26+
class TestBuildInventory(unittest.TestCase):
27+
28+
def _build(self, md_papers, pdf_papers, remote_papers=None):
29+
with patch("lib.inventory.scan_markdown_dirs", return_value=md_papers), \
30+
patch("lib.inventory.scan_pdf_dir", return_value=pdf_papers):
31+
from lib.inventory import build_inventory
32+
return build_inventory([], "/out", remote_papers)
33+
34+
def _find(self, papers, base):
35+
return next((p for p in papers if p["base"] == base), None)
36+
37+
def test_pdf_matches_markdown_at_r0(self):
38+
"""Baseline: R0 markdown + R0-keyed PDF merge correctly."""
39+
md = {"D4035R0": _md("D4035R0", rev=0, base="P4035")}
40+
pdf = {"D4035R0": _pdf("D4035R0", rev=0, base="P4035")}
41+
result = self._build(md, pdf)
42+
p = self._find(result, "P4035")
43+
self.assertIsNotNone(p)
44+
self.assertIsNotNone(p["md_path"])
45+
self.assertIsNotNone(p["pdf_path"])
46+
47+
def test_pdf_matches_markdown_at_r2(self):
48+
"""The bug: filename-derived R0 PDF must match R2 markdown."""
49+
md = {"P4003R2": _md("P4003R2", rev=2, base="P4003")}
50+
# PDF keyed as R0 (filename fallback: d4003-io-awaitables.pdf -> D4003 -> R0)
51+
pdf = {"D4003R0": _pdf("D4003R0", rev=0, base="P4003")}
52+
result = self._build(md, pdf)
53+
p = self._find(result, "P4003")
54+
self.assertIsNotNone(p)
55+
self.assertEqual(p["revision"], 2)
56+
self.assertIsNotNone(p["md_path"])
57+
self.assertIsNotNone(p["pdf_path"])
58+
59+
def test_orphan_pdf_still_appears(self):
60+
"""PDF with no markdown or remote is still in the inventory."""
61+
pdf = {"D4099R0": _pdf("D4099R0", rev=0, base="P4099")}
62+
result = self._build({}, pdf)
63+
p = self._find(result, "P4099")
64+
self.assertIsNotNone(p)
65+
self.assertIsNotNone(p["pdf_path"])
66+
self.assertIsNone(p["md_path"])
67+
68+
def test_pdf_plus_remote_at_non_r0(self):
69+
"""PDF matched by base when remote provides the authoritative revision."""
70+
pdf = {"D4007R0": _pdf("D4007R0", rev=0, base="P4007")}
71+
remote = [{"doc_number": "P4007R2", "title": "Open Issues",
72+
"author": "A", "status": "Draft", "date": "2026-04-08",
73+
"form_id": "99", "form_url": "https://isocpp.org/papers/form/99"}]
74+
result = self._build({}, pdf, remote)
75+
p = self._find(result, "P4007")
76+
self.assertIsNotNone(p)
77+
self.assertEqual(p["revision"], 2)
78+
self.assertIsNotNone(p["pdf_path"])
79+
80+
81+
if __name__ == "__main__":
82+
unittest.main()

paperworks/tests/test_pdf_reader.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,3 +25,12 @@ def test_case_insensitive_field(self):
2525
def test_n_paper_structured(self):
2626
text = "Document Number: N4950\nWorking Draft"
2727
assert _extract_doc_number(text) == "N4950"
28+
29+
def test_single_word_document_label(self):
30+
"""Scrivener wg21 style renders 'Document:' not 'Document Number:'."""
31+
text = "Document: P4003R2\nDate: 2026-04-05"
32+
assert _extract_doc_number(text) == "P4003R2"
33+
34+
def test_single_word_document_label_no_revision(self):
35+
text = "Document: D4035\nDate: 2026-03-20"
36+
assert _extract_doc_number(text) == "D4035"

0 commit comments

Comments
 (0)