Skip to content

Commit 829f6d4

Browse files
authored
tomd: review follow-ups
tomd: kimi review follow-ups
1 parent bc623af commit 829f6d4

6 files changed

Lines changed: 63 additions & 21 deletions

File tree

tomd/lib/pdf/cleanup.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
Y_TOLERANCE, REPEATING_THRESHOLD, EDGE_ITEMS_PER_PAGE,
1212
TERMINAL_PUNCTUATION,
1313
PAGE_NUM_RE, COMPOUND_PREFIXES,
14+
compute_bbox,
1415
)
1516

1617
_log = logging.getLogger(__name__)
@@ -167,13 +168,7 @@ def _join_cross_page(blocks: list[Block]) -> list[Block]:
167168
and prev_text[-1] not in TERMINAL_PUNCTUATION
168169
and cur_text[0].islower()):
169170
prev.lines.extend(block.lines)
170-
bboxes = [ln.bbox for ln in prev.lines]
171-
prev.bbox = (
172-
min(b[0] for b in bboxes),
173-
min(b[1] for b in bboxes),
174-
max(b[2] for b in bboxes),
175-
max(b[3] for b in bboxes),
176-
)
171+
prev.bbox = compute_bbox([ln.bbox for ln in prev.lines])
177172
else:
178173
result.append(replace(block, lines=list(block.lines)))
179174

tomd/lib/pdf/extract.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,22 +8,13 @@
88
Block, Line, Span,
99
WORD_GAP_RATIO, LINE_SPACING_RATIO, PARA_SPACING_RATIO,
1010
FALLBACK_FONT_SIZE,
11+
compute_bbox,
1112
)
1213
from .mono import classify_monospace
1314

1415
_log = logging.getLogger(__name__)
1516

1617

17-
def _compute_bbox(bboxes: list[tuple]) -> tuple[float, float, float, float]:
18-
"""Compute the bounding box enclosing all given bbox tuples."""
19-
return (
20-
min(b[0] for b in bboxes),
21-
min(b[1] for b in bboxes),
22-
max(b[2] for b in bboxes),
23-
max(b[3] for b in bboxes),
24-
)
25-
26-
2718
def extract_mupdf(page, page_num: int) -> list[Block]:
2819
"""Extract text using MuPDF's built-in block/line/span hierarchy.
2920
@@ -145,7 +136,7 @@ def _flush_line():
145136
_flush_word()
146137
if not cur_spans:
147138
return
148-
bbox = _compute_bbox([s.bbox for s in cur_spans])
139+
bbox = compute_bbox([s.bbox for s in cur_spans])
149140
cur_lines.append(Line(
150141
spans=list(cur_spans),
151142
bbox=bbox,
@@ -157,7 +148,7 @@ def _flush_block():
157148
_flush_line()
158149
if not cur_lines:
159150
return
160-
bbox = _compute_bbox([ln.bbox for ln in cur_lines])
151+
bbox = compute_bbox([ln.bbox for ln in cur_lines])
161152
blocks.append(Block(
162153
lines=list(cur_lines),
163154
bbox=bbox,

tomd/lib/pdf/types.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,20 @@ class PageEdgeItem:
208208

209209
TERMINAL_PUNCTUATION = frozenset(".?!:")
210210

211+
212+
def compute_bbox(bboxes: list[tuple]) -> tuple[float, float, float, float]:
213+
"""Compute the bounding box enclosing all given bbox tuples.
214+
215+
Raises ValueError (via min/max) if bboxes is empty. Callers must
216+
ensure at least one bbox is present.
217+
"""
218+
return (
219+
min(b[0] for b in bboxes),
220+
min(b[1] for b in bboxes),
221+
max(b[2] for b in bboxes),
222+
max(b[3] for b in bboxes),
223+
)
224+
211225
FALLBACK_FONT_SIZE = 12.0
212226
FALLBACK_BODY_SIZE = 11.0
213227
MIN_UNCERTAIN_WORDS = 10

tomd/lib/pdf/wg21.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@
2525

2626
_PARENS_RE = re.compile(r"[()]")
2727

28+
# Maximum number of continuation blocks consumed after a reply-to label.
29+
REPLY_TO_CONTINUATION_CAP = 5
30+
2831

2932
def _clean(text: str) -> str:
3033
"""Strip zero-width chars and whitespace."""
@@ -160,11 +163,14 @@ def extract_metadata_from_blocks(blocks: list[Block],
160163
if found_any:
161164
consumed.add(i)
162165
if "reply" in " ".join(_clean(ln.text) for ln in block.lines).lower():
166+
continuation_count = 0
163167
for j, next_block in page0_blocks:
164168
if j <= i:
165169
continue
166170
if j in consumed:
167171
continue
172+
if continuation_count >= REPLY_TO_CONTINUATION_CAP:
173+
break
168174
next_text = _clean(next_block.lines[0].text) if next_block.lines else ""
169175
if not next_text or _LABEL_RE.match(next_text):
170176
break
@@ -175,6 +181,7 @@ def extract_metadata_from_blocks(blocks: list[Block],
175181
existing = metadata.get("reply-to", [])
176182
metadata["reply-to"] = existing + extra_authors
177183
consumed.add(j)
184+
continuation_count += 1
178185
else:
179186
break
180187

tomd/tests/test_extract.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
"""Tests for lib.pdf.extract."""
22

3+
import pytest
34
from unittest.mock import MagicMock
45
from lib.pdf.extract import extract_spatial, attach_links
5-
from lib.pdf.types import Span, Line, Block
6+
from lib.pdf.types import Span, Line, Block, compute_bbox
67

78

89
def _make_page(chars_by_span):
@@ -95,6 +96,19 @@ def test_extract_spatial_sorts_across_blocks_in_y_band():
9596
assert text.index("L") < text.index("R"), f"got text={text!r}"
9697

9798

99+
class TestComputeBbox:
100+
def test_single_box(self):
101+
assert compute_bbox([(1.0, 2.0, 3.0, 4.0)]) == (1.0, 2.0, 3.0, 4.0)
102+
103+
def test_multiple_boxes(self):
104+
result = compute_bbox([(10, 20, 30, 40), (5, 25, 35, 38)])
105+
assert result == (5, 20, 35, 40)
106+
107+
def test_empty_raises(self):
108+
with pytest.raises(ValueError):
109+
compute_bbox([])
110+
111+
98112
def _make_block_with_span(text, bbox):
99113
span = Span(text=text, bbox=bbox)
100114
line = Line(spans=[span])

tomd/tests/test_wg21.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Tests for lib.pdf.wg21."""
22

33
from lib.pdf.types import Block, Line, Span
4-
from lib.pdf.wg21 import extract_metadata_from_blocks
4+
from lib.pdf.wg21 import extract_metadata_from_blocks, REPLY_TO_CONTINUATION_CAP
55

66

77
def _meta_block(lines_text, page_num=0, font_size=9.0):
@@ -95,3 +95,24 @@ def test_reply_to_name_then_email_on_next_line():
9595
meta, consumed = extract_metadata_from_blocks([b])
9696
assert "reply-to" in meta
9797
assert any("Bob Jones" in a for a in meta["reply-to"])
98+
99+
100+
def test_reply_to_continuation_capped():
101+
"""Reply-to loop must stop after REPLY_TO_CONTINUATION_CAP blocks,
102+
even if later blocks still contain emails."""
103+
reply_block = _meta_block(["Reply-to: Alice <alice@x.com>"])
104+
# Generate more continuation blocks than the cap allows
105+
extra_count = REPLY_TO_CONTINUATION_CAP + 5
106+
extras = [
107+
_meta_block([f"Person{n} <p{n}@x.com>"])
108+
for n in range(extra_count)
109+
]
110+
blocks = [reply_block] + extras
111+
meta, consumed = extract_metadata_from_blocks(blocks)
112+
# The continuation blocks consumed must not exceed the cap
113+
# (block 0 is consumed as the reply-to label block itself)
114+
continuation_consumed = consumed - {0}
115+
assert len(continuation_consumed) == REPLY_TO_CONTINUATION_CAP
116+
# The block just past the cap must not be consumed
117+
past_cap_idx = 1 + REPLY_TO_CONTINUATION_CAP
118+
assert past_cap_idx not in consumed

0 commit comments

Comments
 (0)