Skip to content

Commit 45912a5

Browse files
bundolee and claude committed
fix: use nearest edges for gap calculation, add positive-control test
Address CodeRabbit review feedback:
- Gap calculation now uses nearest edges (bottomY-topY) instead of outer edges, preventing multi-line footers with <30pt actual spacing from being incorrectly rejected.
- Add testCloseFooterLinesAreGrouped to verify that two footer lines 11pt apart are grouped into a single footer element.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 8b8bf69 commit 45912a5

2 files changed

Lines changed: 44 additions & 2 deletions

File tree

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/HeaderFooterProcessor.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -203,9 +203,9 @@ private static boolean isAdjacentToExistingHeaderOrFooter(
203203
IObject previousElement = pageContents.get(previousIndex);
204204
double gap;
205205
if (isHeaderDetection) {
206-
gap = previousElement.getTopY() - candidate.getBottomY();
206+
gap = previousElement.getBottomY() - candidate.getTopY();
207207
} else {
208-
gap = candidate.getTopY() - previousElement.getBottomY();
208+
gap = candidate.getBottomY() - previousElement.getTopY();
209209
}
210210
return gap <= MAX_HEADER_FOOTER_GAP;
211211
}

java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/HeaderFooterProcessorTest.java

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,4 +156,46 @@ public void testRepeatedBodyTextNotAbsorbedIntoFooter() {
156156
"Page " + page + ": repeated note text should remain in body, not be absorbed into footer");
157157
}
158158
}
159+
160+
/**
161+
* Positive control: two closely spaced footer lines (gap < 30pt) should be
162+
* grouped into a single footer. Ensures the proximity check does not reject
163+
* legitimate multi-line footers.
164+
*/
165+
@Test
166+
public void testCloseFooterLinesAreGrouped() {
167+
initContainers();
168+
List<List<IObject>> contents = new ArrayList<>();
169+
for (int page = 0; page < 3; page++) {
170+
List<IObject> pageContents = new ArrayList<>();
171+
// Body text at top
172+
pageContents.add(new TextLine(new TextChunk(
173+
new BoundingBox(page, 37.0, 500.0, 300.0, 530.0),
174+
"Body text page " + (page + 1), 10, 530.0)));
175+
176+
// Two footer lines close together (11pt gap between nearest edges)
177+
// Line 1: y=[55, 67] Line 2: y=[35, 44] gap = 55-44 = 11pt
178+
pageContents.add(new TextLine(new TextChunk(
179+
new BoundingBox(page, 37.0, 55.0, 280.0, 67.0),
180+
"Copyright 2026", 7.5, 67.0)));
181+
pageContents.add(new TextLine(new TextChunk(
182+
new BoundingBox(page, 37.0, 35.0, 280.0, 44.0),
183+
"Company Footer", 7.5, 44.0)));
184+
185+
contents.add(pageContents);
186+
}
187+
188+
HeaderFooterProcessor.processHeadersAndFooters(contents, false);
189+
190+
for (int page = 0; page < 3; page++) {
191+
List<IObject> pageContent = contents.get(page);
192+
IObject lastElement = pageContent.get(pageContent.size() - 1);
193+
Assertions.assertTrue(lastElement instanceof SemanticHeaderOrFooter,
194+
"Page " + page + ": last element should be footer");
195+
SemanticHeaderOrFooter footer = (SemanticHeaderOrFooter) lastElement;
196+
Assertions.assertEquals(SemanticType.FOOTER, footer.getSemanticType());
197+
Assertions.assertEquals(2, footer.getContents().size(),
198+
"Page " + page + ": footer should contain both close footer lines (gap=11pt < 30pt)");
199+
}
200+
}
159201
}

0 commit comments

Comments
 (0)