tomd: review follow-ups

gregjkal · web-flow · commit 829f6d4c79cc · 2026-04-16T12:51:24.000-04:00
tomd: kimi review follow-ups
diff --git a/tomd/lib/pdf/cleanup.py b/tomd/lib/pdf/cleanup.py
@@ -11,6 +11,7 @@
     Y_TOLERANCE, REPEATING_THRESHOLD, EDGE_ITEMS_PER_PAGE,
     TERMINAL_PUNCTUATION,
     PAGE_NUM_RE, COMPOUND_PREFIXES,
+    compute_bbox,
 )
 
 _log = logging.getLogger(__name__)
@@ -167,13 +168,7 @@ def _join_cross_page(blocks: list[Block]) -> list[Block]:
                 and prev_text[-1] not in TERMINAL_PUNCTUATION
                 and cur_text[0].islower()):
             prev.lines.extend(block.lines)
-            bboxes = [ln.bbox for ln in prev.lines]
-            prev.bbox = (
-                min(b[0] for b in bboxes),
-                min(b[1] for b in bboxes),
-                max(b[2] for b in bboxes),
-                max(b[3] for b in bboxes),
-            )
+            prev.bbox = compute_bbox([ln.bbox for ln in prev.lines])
         else:
             result.append(replace(block, lines=list(block.lines)))
 
diff --git a/tomd/lib/pdf/extract.py b/tomd/lib/pdf/extract.py
@@ -8,22 +8,13 @@
     Block, Line, Span,
     WORD_GAP_RATIO, LINE_SPACING_RATIO, PARA_SPACING_RATIO,
     FALLBACK_FONT_SIZE,
+    compute_bbox,
 )
 from .mono import classify_monospace
 
 _log = logging.getLogger(__name__)
 
 
-def _compute_bbox(bboxes: list[tuple]) -> tuple[float, float, float, float]:
-    """Compute the bounding box enclosing all given bbox tuples."""
-    return (
-        min(b[0] for b in bboxes),
-        min(b[1] for b in bboxes),
-        max(b[2] for b in bboxes),
-        max(b[3] for b in bboxes),
-    )
-
-
 def extract_mupdf(page, page_num: int) -> list[Block]:
     """Extract text using MuPDF's built-in block/line/span hierarchy.
 
@@ -145,7 +136,7 @@ def _flush_line():
         _flush_word()
         if not cur_spans:
             return
-        bbox = _compute_bbox([s.bbox for s in cur_spans])
+        bbox = compute_bbox([s.bbox for s in cur_spans])
         cur_lines.append(Line(
             spans=list(cur_spans),
             bbox=bbox,
@@ -157,7 +148,7 @@ def _flush_block():
         _flush_line()
         if not cur_lines:
             return
-        bbox = _compute_bbox([ln.bbox for ln in cur_lines])
+        bbox = compute_bbox([ln.bbox for ln in cur_lines])
         blocks.append(Block(
             lines=list(cur_lines),
             bbox=bbox,
diff --git a/tomd/lib/pdf/types.py b/tomd/lib/pdf/types.py
@@ -208,6 +208,20 @@ class PageEdgeItem:
 
 TERMINAL_PUNCTUATION = frozenset(".?!:")
 
+
+def compute_bbox(bboxes: list[tuple]) -> tuple[float, float, float, float]:
+    """Compute the bounding box enclosing all given bbox tuples.
+
+    Raises ValueError (via min/max) if bboxes is empty. Callers must
+    ensure at least one bbox is present.
+    """
+    return (
+        min(b[0] for b in bboxes),
+        min(b[1] for b in bboxes),
+        max(b[2] for b in bboxes),
+        max(b[3] for b in bboxes),
+    )
+
 FALLBACK_FONT_SIZE = 12.0
 FALLBACK_BODY_SIZE = 11.0
 MIN_UNCERTAIN_WORDS = 10
diff --git a/tomd/lib/pdf/wg21.py b/tomd/lib/pdf/wg21.py
@@ -25,6 +25,9 @@
 
 _PARENS_RE = re.compile(r"[()]")
 
+# Maximum number of continuation blocks consumed after a reply-to label.
+REPLY_TO_CONTINUATION_CAP = 5
+
 
 def _clean(text: str) -> str:
     """Strip zero-width chars and whitespace."""
@@ -160,11 +163,14 @@ def extract_metadata_from_blocks(blocks: list[Block],
         if found_any:
             consumed.add(i)
             if "reply" in " ".join(_clean(ln.text) for ln in block.lines).lower():
+                continuation_count = 0
                 for j, next_block in page0_blocks:
                     if j <= i:
                         continue
                     if j in consumed:
                         continue
+                    if continuation_count >= REPLY_TO_CONTINUATION_CAP:
+                        break
                     next_text = _clean(next_block.lines[0].text) if next_block.lines else ""
                     if not next_text or _LABEL_RE.match(next_text):
                         break
@@ -175,6 +181,7 @@ def extract_metadata_from_blocks(blocks: list[Block],
                             existing = metadata.get("reply-to", [])
                             metadata["reply-to"] = existing + extra_authors
                             consumed.add(j)
+                            continuation_count += 1
                     else:
                         break
 
diff --git a/tomd/tests/test_extract.py b/tomd/tests/test_extract.py
@@ -1,8 +1,9 @@
 """Tests for lib.pdf.extract."""
 
+import pytest
 from unittest.mock import MagicMock
 from lib.pdf.extract import extract_spatial, attach_links
-from lib.pdf.types import Span, Line, Block
+from lib.pdf.types import Span, Line, Block, compute_bbox
 
 
 def _make_page(chars_by_span):
@@ -95,6 +96,19 @@ def test_extract_spatial_sorts_across_blocks_in_y_band():
     assert text.index("L") < text.index("R"), f"got text={text!r}"
 
 
+class TestComputeBbox:
+    def test_single_box(self):
+        assert compute_bbox([(1.0, 2.0, 3.0, 4.0)]) == (1.0, 2.0, 3.0, 4.0)
+
+    def test_multiple_boxes(self):
+        result = compute_bbox([(10, 20, 30, 40), (5, 25, 35, 38)])
+        assert result == (5, 20, 35, 40)
+
+    def test_empty_raises(self):
+        with pytest.raises(ValueError):
+            compute_bbox([])
+
+
 def _make_block_with_span(text, bbox):
     span = Span(text=text, bbox=bbox)
     line = Line(spans=[span])
diff --git a/tomd/tests/test_wg21.py b/tomd/tests/test_wg21.py
@@ -1,7 +1,7 @@
 """Tests for lib.pdf.wg21."""
 
 from lib.pdf.types import Block, Line, Span
-from lib.pdf.wg21 import extract_metadata_from_blocks
+from lib.pdf.wg21 import extract_metadata_from_blocks, REPLY_TO_CONTINUATION_CAP
 
 
 def _meta_block(lines_text, page_num=0, font_size=9.0):
@@ -95,3 +95,24 @@ def test_reply_to_name_then_email_on_next_line():
     meta, consumed = extract_metadata_from_blocks([b])
     assert "reply-to" in meta
     assert any("Bob Jones" in a for a in meta["reply-to"])
+
+
+def test_reply_to_continuation_capped():
+    """Reply-to loop must stop after REPLY_TO_CONTINUATION_CAP blocks,
+    even if later blocks still contain emails."""
+    reply_block = _meta_block(["Reply-to: Alice <alice@x.com>"])
+    # Generate more continuation blocks than the cap allows
+    extra_count = REPLY_TO_CONTINUATION_CAP + 5
+    extras = [
+        _meta_block([f"Person{n} <p{n}@x.com>"])
+        for n in range(extra_count)
+    ]
+    blocks = [reply_block] + extras
+    meta, consumed = extract_metadata_from_blocks(blocks)
+    # The continuation blocks consumed must not exceed the cap
+    # (block 0 is consumed as the reply-to label block itself)
+    continuation_consumed = consumed - {0}
+    assert len(continuation_consumed) == REPLY_TO_CONTINUATION_CAP
+    # The block just past the cap must not be consumed
+    past_cap_idx = 1 + REPLY_TO_CONTINUATION_CAP
+    assert past_cap_idx not in consumed