Skip to content

Commit cf8829e

Browse files
fix: suppress table caption artifacts across page boundaries
1 parent e3ed261 commit cf8829e

2 files changed

Lines changed: 152 additions & 1 deletion

File tree

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/markdown/MarkdownGenerator.java

Lines changed: 98 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,10 @@ public class MarkdownGenerator implements Closeable {
6565
"^(pass@1|cons@\\d+|rating)(?:\\s+(pass@1|cons@\\d+|rating))+\\s*$",
6666
Pattern.CASE_INSENSITIVE
6767
);
68+
private static final Pattern FOOTNOTE_PLACEHOLDER_PATTERN = Pattern.compile(
69+
"^(?:\\d+https?://\\S+)(?:\\s+\\d+https?://\\S+)*$",
70+
Pattern.CASE_INSENSITIVE
71+
);
6872
private static final Pattern BENCHMARK_PATTERN = Pattern.compile(
6973
"(AIME 2024|MATH-500|CNMO 2024|GPQA(?: Diamond)?|LiveCodeBench|Codeforces|SWE Verified|Aider-Polyglot|MMLU(?:-Redux|-Pro)?|DROP|IF-Eval|SimpleQA|FRAMES|AlpacaEval2\\.0|ArenaHard|CLUEWSC|C-Eval|C-SimpleQA)",
7074
Pattern.CASE_INSENSITIVE
@@ -88,10 +92,15 @@ public class MarkdownGenerator implements Closeable {
8892

8993
public void writeToMarkdown(List<List<IObject>> contents) {
9094
try {
95+
List<Set<Integer>> pageSkipIndices = new java.util.ArrayList<>(contents.size());
96+
for (List<IObject> pageContents : contents) {
97+
pageSkipIndices.add(collectTableArtifactIndices(pageContents));
98+
}
99+
extendCrossPageTableArtifactSkips(contents, pageSkipIndices);
91100
for (int pageNumber = 0; pageNumber < StaticContainers.getDocument().getNumberOfPages(); pageNumber++) {
92101
writePageSeparator(pageNumber);
93102
List<IObject> pageContents = contents.get(pageNumber);
94-
Set<Integer> skipIndices = collectTableArtifactIndices(pageContents);
103+
Set<Integer> skipIndices = pageSkipIndices.get(pageNumber);
95104
for (int contentIndex = 0; contentIndex < pageContents.size(); contentIndex++) {
96105
if (skipIndices.contains(contentIndex)) {
97106
continue;
@@ -360,6 +369,26 @@ protected boolean shouldWriteTableBody() {
360369
return Config.MARKDOWN_TABLE_OUTPUT_FULL.equals(markdownTableOutput);
361370
}
362371

372+
protected void extendCrossPageTableArtifactSkips(List<List<IObject>> contents, List<Set<Integer>> pageSkipIndices) {
373+
if (Config.MARKDOWN_TABLE_OUTPUT_FULL.equals(markdownTableOutput)) {
374+
return;
375+
}
376+
for (int pageNumber = 0; pageNumber < contents.size(); pageNumber++) {
377+
List<IObject> pageContents = contents.get(pageNumber);
378+
Set<Integer> pageSkips = pageSkipIndices.get(pageNumber);
379+
380+
int firstMeaningful = findFirstMeaningfulContentIndex(pageContents, pageSkips);
381+
if (firstMeaningful >= 0 && isTableCaptionText(normalizeContentText(pageContents.get(firstMeaningful))) && pageNumber > 0) {
382+
walkTableArtifactRange(contents.get(pageNumber - 1), pageSkipIndices.get(pageNumber - 1), contents.get(pageNumber - 1).size(), -1);
383+
}
384+
385+
int lastMeaningful = findLastMeaningfulContentIndex(pageContents, pageSkips);
386+
if (lastMeaningful >= 0 && isTableCaptionText(normalizeContentText(pageContents.get(lastMeaningful))) && pageNumber + 1 < contents.size()) {
387+
walkTableArtifactRange(contents.get(pageNumber + 1), pageSkipIndices.get(pageNumber + 1), -1, 1);
388+
}
389+
}
390+
}
391+
363392
protected Set<Integer> collectTableArtifactIndices(List<IObject> pageContents) {
364393
Set<Integer> skip = new HashSet<>();
365394
if (Config.MARKDOWN_TABLE_OUTPUT_FULL.equals(markdownTableOutput)) {
@@ -411,6 +440,11 @@ protected void walkTableArtifactRange(List<IObject> pageContents, Set<Integer> s
411440
continue;
412441
}
413442
if (looksNarrativeText(text)) {
443+
if (direction > 0 && shouldSkipDanglingNarrativeFragment(pageContents, index, text)) {
444+
skip.add(index);
445+
index += direction;
446+
continue;
447+
}
414448
break;
415449
}
416450
break;
@@ -421,11 +455,51 @@ protected boolean isHeadingContent(IObject content) {
421455
return content instanceof SemanticHeading;
422456
}
423457

458+
protected int findFirstMeaningfulContentIndex(List<IObject> pageContents, Set<Integer> skip) {
459+
for (int index = 0; index < pageContents.size(); index++) {
460+
if (skip.contains(index)) {
461+
continue;
462+
}
463+
String text = normalizeContentText(pageContents.get(index));
464+
if (!text.isEmpty() || pageContents.get(index) instanceof TableBorder) {
465+
return index;
466+
}
467+
}
468+
return -1;
469+
}
470+
471+
protected int findLastMeaningfulContentIndex(List<IObject> pageContents, Set<Integer> skip) {
472+
for (int index = pageContents.size() - 1; index >= 0; index--) {
473+
if (skip.contains(index)) {
474+
continue;
475+
}
476+
String text = normalizeContentText(pageContents.get(index));
477+
if (!text.isEmpty() || pageContents.get(index) instanceof TableBorder) {
478+
return index;
479+
}
480+
}
481+
return -1;
482+
}
483+
424484
protected boolean isTableOutputOff() {
425485
return Config.MARKDOWN_TABLE_OUTPUT_OFF.equals(markdownTableOutput);
426486
}
427487

428488
protected String normalizeContentText(IObject content) {
489+
if (content instanceof PDFList) {
490+
StringBuilder builder = new StringBuilder();
491+
for (ListItem item : ((PDFList) content).getListItems()) {
492+
String value = String.valueOf(item).replaceAll("\\s+", " ").trim();
493+
if (value.isEmpty()) {
494+
continue;
495+
}
496+
if (builder.length() > 0) {
497+
builder.append(' ');
498+
}
499+
builder.append(value);
500+
}
501+
return builder.toString();
502+
}
429503
if (!(content instanceof SemanticTextNode)) {
430504
return "";
431505
}
@@ -466,6 +540,9 @@ protected boolean looksTableArtifactText(String text) {
466540
if (NUMERIC_ONLY_PATTERN.matcher(text).matches()) {
467541
return true;
468542
}
543+
if (FOOTNOTE_PLACEHOLDER_PATTERN.matcher(text).matches()) {
544+
return true;
545+
}
469546
if (TABLE_HEADER_TEXT_PATTERN.matcher(text).matches()) {
470547
return true;
471548
}
@@ -507,6 +584,26 @@ protected boolean looksTableArtifactText(String text) {
507584
return false;
508585
}
509586

587+
protected boolean shouldSkipDanglingNarrativeFragment(List<IObject> pageContents, int index, String text) {
588+
if (text.isEmpty() || !Character.isLowerCase(text.charAt(0))) {
589+
return false;
590+
}
591+
for (int nextIndex = index + 1; nextIndex < pageContents.size(); nextIndex++) {
592+
IObject next = pageContents.get(nextIndex);
593+
String nextText = normalizeContentText(next);
594+
if (nextText.isEmpty()) {
595+
continue;
596+
}
597+
if (isHeadingContent(next) || isTableCaptionText(nextText)) {
598+
return true;
599+
}
600+
if (looksNarrativeText(nextText) && !looksTableArtifactText(nextText)) {
601+
return false;
602+
}
603+
}
604+
return false;
605+
}
606+
510607
protected String getLineBreak() {
511608
if (isInsideTable()) {
512609
return MarkdownSyntax.HTML_LINE_BREAK_TAG;

java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/markdown/MarkdownGeneratorTest.java

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,16 @@
1010
import org.junit.jupiter.api.io.TempDir;
1111
import org.opendataloader.pdf.api.Config;
1212
import org.verapdf.wcag.algorithms.entities.IObject;
13+
import org.verapdf.wcag.algorithms.entities.SemanticHeading;
1314
import org.verapdf.wcag.algorithms.entities.SemanticParagraph;
1415
import org.verapdf.wcag.algorithms.entities.content.TextChunk;
1516
import org.verapdf.wcag.algorithms.entities.content.TextLine;
1617
import org.verapdf.wcag.algorithms.entities.geometry.BoundingBox;
1718
import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorder;
1819
import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorderCell;
1920
import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorderRow;
21+
import org.verapdf.wcag.algorithms.entities.lists.ListItem;
22+
import org.verapdf.wcag.algorithms.entities.lists.PDFList;
2023
import org.junit.jupiter.api.Test;
2124
import org.junit.jupiter.params.ParameterizedTest;
2225
import org.junit.jupiter.params.provider.ValueSource;
@@ -152,6 +155,30 @@ void testCaptionOnlySkipsFlattenedTableArtifactsAroundCaption() throws IOExcepti
152155
assertTrue(markdown.contains("This paragraph should remain after the caption because it is narrative."));
153156
}
154157

158+
@Test
159+
void testCaptionOnlySkipsFootnotePlaceholderListsAndDanglingLowercaseFragments() throws IOException {
160+
Config config = new Config();
161+
config.setOutputFolder(tempDir.toString());
162+
config.setMarkdownTableOutput(Config.MARKDOWN_TABLE_OUTPUT_CAPTION_ONLY);
163+
File inputPdf = tempDir.resolve("dangling.pdf").toFile();
164+
Files.writeString(inputPdf.toPath(), "");
165+
166+
try (TestMarkdownGenerator generator = new TestMarkdownGenerator(inputPdf, config)) {
167+
generator.writePage(List.of(
168+
createFootnoteList("1https://example.com", "2https://example.com"),
169+
createCaption("Table 4 | Comparison between models."),
170+
createCaption("performance of the model will improve soon."),
171+
createHeading("3.2. Distilled Model Evaluation", 5)
172+
));
173+
}
174+
175+
String markdown = Files.readString(tempDir.resolve("dangling.md"));
176+
assertFalse(markdown.contains("1https://example.com"));
177+
assertFalse(markdown.contains("performance of the model will improve soon."));
178+
assertTrue(markdown.contains("Table 4 | Comparison between models."));
179+
assertTrue(markdown.contains("##### 3.2. Distilled Model Evaluation"));
180+
}
181+
155182
/**
156183
* Helper method that mirrors the heading prefix generation logic in
157184
* MarkdownGenerator.writeHeading().
@@ -229,6 +256,33 @@ private TableBorderCell createCell(int row, int col, String text) {
229256
return cell;
230257
}
231258

259+
private SemanticHeading createHeading(String text, int level) {
260+
SemanticHeading heading = new SemanticHeading(createCaption(text));
261+
heading.setHeadingLevel(level);
262+
return heading;
263+
}
264+
265+
private PDFList createFootnoteList(String... items) {
266+
PDFList list = new PDFList();
267+
for (String itemText : items) {
268+
ListItem item = new ListItem(new BoundingBox(), null);
269+
item.add(new TextLine(new TextChunk(
270+
new BoundingBox(0, 10.0, 10.0, 20.0, 20.0),
271+
itemText,
272+
"Font1",
273+
10,
274+
700,
275+
0,
276+
20.0,
277+
new double[]{0.0},
278+
null,
279+
0
280+
)));
281+
list.add(item);
282+
}
283+
return list;
284+
}
285+
232286
private static class TestMarkdownGenerator extends MarkdownGenerator {
233287
TestMarkdownGenerator(File inputPdf, Config config) throws IOException {
234288
super(inputPdf, config);

0 commit comments

Comments
 (0)