Skip to content

Commit 8b8bf69

Browse files
bundolee and claude committed
fix: prevent footer detection from absorbing distant body text (#385)
Body text that repeats across adjacent pages (e.g. a "※" note appearing on consecutive pages) was incorrectly classified as footer content by the cross-page pattern matcher. Add a spatial proximity check so that footer expansion stops when the next candidate element is more than 30pt away from the previously accepted footer element. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent cb0c5b5 commit 8b8bf69

2 files changed

Lines changed: 121 additions & 4 deletions

File tree

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/HeaderFooterProcessor.java

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,14 @@ private static List<IObject> processHeaderOrFooterContent(List<IObject> contents
147147
return newContents;
148148
}
149149

150+
/**
151+
* Maximum vertical gap (in points) allowed between consecutive footer/header elements.
152+
* If a candidate element is farther than this from the previously accepted element,
153+
* it is not included in the header/footer region. This prevents body text that happens
154+
* to repeat across pages from being absorbed into the footer.
* <p>
* The limit is applied by {@code isAdjacentToExistingHeaderOrFooter} for both header and
* footer detection. The comparison there is inclusive: a gap exactly equal to this value
* still counts as adjacent.
155+
*/
156+
private static final double MAX_HEADER_FOOTER_GAP = 30.0;
157+
150158
private static List<Integer> getNumberOfHeaderOrFooterContentsForEachPage(List<List<IObject>> sortedContents, boolean isHeaderDetection) {
151159
List<Integer> numberOfHeaderOrFooterContentsForEachPage = new ArrayList<>(sortedContents.size());
152160
for (int pageNumber = 0; pageNumber < sortedContents.size(); pageNumber++) {
@@ -163,7 +171,13 @@ private static List<Integer> getNumberOfHeaderOrFooterContentsForEachPage(List<L
163171
List<IObject> pageContents = sortedContents.get(pageNumber);
164172
int index = isHeaderDetection ? currentIndex : pageContents.size() - 1 - currentIndex;
165173
if (index >= 0 && index < pageContents.size()) {
166-
contents.add(pageContents.get(index));
174+
IObject candidate = pageContents.get(index);
175+
if (currentIndex > 0 && !isAdjacentToExistingHeaderOrFooter(
176+
pageContents, currentIndex, isHeaderDetection, candidate)) {
177+
contents.add(null);
178+
} else {
179+
contents.add(candidate);
180+
}
167181
} else {
168182
contents.add(null);
169183
}
@@ -180,6 +194,22 @@ private static List<Integer> getNumberOfHeaderOrFooterContentsForEachPage(List<L
180194
return numberOfHeaderOrFooterContentsForEachPage;
181195
}
182196

197+
private static boolean isAdjacentToExistingHeaderOrFooter(
198+
List<IObject> pageContents, int currentIndex, boolean isHeaderDetection, IObject candidate) {
199+
int previousIndex = isHeaderDetection ? currentIndex - 1 : pageContents.size() - currentIndex;
200+
if (previousIndex < 0 || previousIndex >= pageContents.size()) {
201+
return true;
202+
}
203+
IObject previousElement = pageContents.get(previousIndex);
204+
double gap;
205+
if (isHeaderDetection) {
206+
gap = previousElement.getTopY() - candidate.getBottomY();
207+
} else {
208+
gap = candidate.getTopY() - previousElement.getBottomY();
209+
}
210+
return gap <= MAX_HEADER_FOOTER_GAP;
211+
}
212+
183213
private static Set<Integer> getIndexesOfHeaderOrFootersContents(List<IObject> contents) {
184214
Set<Integer> result = new HashSet<>(contents.size());
185215
for (int pageNumber = 0; pageNumber < contents.size() - 1; pageNumber++) {

java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/HeaderFooterProcessorTest.java

Lines changed: 90 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,16 @@
3232

3333
public class HeaderFooterProcessorTest {
3434

35-
@Test
36-
public void testProcessHeadersAndFooters() {
35+
private void initContainers() {
3736
StaticContainers.setIsDataLoader(true);
3837
StaticContainers.setIsIgnoreCharactersWithoutUnicode(false);
3938
StaticResources.setDocument(null);
40-
StaticLayoutContainers.setCurrentContentId(0);
39+
StaticLayoutContainers.clearContainers();
40+
}
41+
42+
@Test
43+
public void testProcessHeadersAndFooters() {
44+
initContainers();
4145
List<List<IObject>> contents = new ArrayList<>();
4246
List<IObject> page1Contents = new ArrayList<>();
4347
page1Contents.add(new TextLine(new TextChunk(new BoundingBox(0, 10.0, 30.0, 20.0, 40.0),
@@ -69,4 +73,87 @@ public void testProcessHeadersAndFooters() {
6973
Assertions.assertTrue(contents.get(1).get(2) instanceof SemanticHeaderOrFooter);
7074
Assertions.assertEquals(SemanticType.FOOTER, ((SemanticHeaderOrFooter) contents.get(1).get(2)).getSemanticType());
7175
}
76+
77+
/**
78+
* Tests that body text repeated on adjacent pages is not absorbed into the footer.
79+
* Reproduces #385: pages 19-20 of CERAGEM PDF have identical note text
80+
* "※ 출수 중 출수 버튼을 터치하면 출수가 정지됩니다." at y=116 above the actual
81+
* footer at y=34. The note was incorrectly classified as footer because it matched
82+
* across pages. Page height is 595 (A4-like).
83+
*/
84+
@Test
85+
public void testRepeatedBodyTextNotAbsorbedIntoFooter() {
86+
initContainers();
87+
// Simulate 4 pages (17-20) with A4-like height (595pt)
88+
// Page bounding box: [0, 0, 420, 595]
89+
// Footer line at y=35 (bottom), body note at y=117 (well above footer)
90+
double pageHeight = 595.0;
91+
double footerY = 35.0;
92+
double bodyNoteY = 117.0;
93+
94+
List<List<IObject>> contents = new ArrayList<>();
95+
for (int page = 0; page < 4; page++) {
96+
List<IObject> pageContents = new ArrayList<>();
97+
// Body heading at top
98+
pageContents.add(new TextLine(new TextChunk(
99+
new BoundingBox(page, 37.0, pageHeight - 60, 300.0, pageHeight - 30),
100+
"Section " + (page + 1), 12, pageHeight - 30)));
101+
// Body paragraph in middle
102+
pageContents.add(new TextLine(new TextChunk(
103+
new BoundingBox(page, 37.0, pageHeight / 2, 300.0, pageHeight / 2 + 30),
104+
"Body content page " + (page + 1), 10, pageHeight / 2 + 30)));
105+
106+
// Repeated body note — same text on pages 2 and 3 (simulating pages 19-20)
107+
if (page == 2 || page == 3) {
108+
pageContents.add(new TextLine(new TextChunk(
109+
new BoundingBox(page, 223.0, bodyNoteY, 360.0, bodyNoteY + 18),
110+
"※ Repeated note text", 6.5, bodyNoteY + 18)));
111+
}
112+
113+
// Actual footer line (repeating pattern across all pages)
114+
String footerText = (page % 2 == 0)
115+
? "CGM BALANCE " + (page + 17)
116+
: (page + 17) + " CERAGEM BALANCE USER MANUAL";
117+
pageContents.add(new TextLine(new TextChunk(
118+
new BoundingBox(page, 37.0, footerY, 280.0, footerY + 9),
119+
footerText, 7.5, footerY + 9)));
120+
121+
contents.add(pageContents);
122+
}
123+
124+
HeaderFooterProcessor.processHeadersAndFooters(contents, false);
125+
126+
// Verify: each page should have footer detected
127+
for (int page = 0; page < 4; page++) {
128+
List<IObject> pageContent = contents.get(page);
129+
IObject lastElement = pageContent.get(pageContent.size() - 1);
130+
Assertions.assertTrue(lastElement instanceof SemanticHeaderOrFooter,
131+
"Page " + page + ": last element should be footer");
132+
SemanticHeaderOrFooter footer = (SemanticHeaderOrFooter) lastElement;
133+
Assertions.assertEquals(SemanticType.FOOTER, footer.getSemanticType());
134+
135+
// Critical: footer should contain only 1 element (the actual footer line),
136+
// NOT the repeated body note
137+
Assertions.assertEquals(1, footer.getContents().size(),
138+
"Page " + page + ": footer should contain only the footer line, " +
139+
"not absorb the repeated body note. Got " + footer.getContents().size() + " elements.");
140+
}
141+
142+
// Verify: the repeated note text on pages 2-3 should still be in body content
143+
for (int page = 2; page <= 3; page++) {
144+
List<IObject> pageContent = contents.get(page);
145+
boolean foundNote = false;
146+
for (IObject obj : pageContent) {
147+
if (!(obj instanceof SemanticHeaderOrFooter) && obj instanceof TextLine) {
148+
TextLine line = (TextLine) obj;
149+
if (line.getValue().contains("Repeated note")) {
150+
foundNote = true;
151+
break;
152+
}
153+
}
154+
}
155+
Assertions.assertTrue(foundNote,
156+
"Page " + page + ": repeated note text should remain in body, not be absorbed into footer");
157+
}
158+
}
72159
}

0 commit comments

Comments
 (0)