@@ -65,6 +65,10 @@ public class MarkdownGenerator implements Closeable {
6565 "^(pass@1|cons@\\ d+|rating)(?:\\ s+(pass@1|cons@\\ d+|rating))+\\ s*$" ,
6666 Pattern .CASE_INSENSITIVE
6767 );
68+ private static final Pattern FOOTNOTE_PLACEHOLDER_PATTERN = Pattern .compile (
69+ "^(?:\\ d+https?://\\ S+)(?:\\ s+\\ d+https?://\\ S+)*$" ,
70+ Pattern .CASE_INSENSITIVE
71+ );
6872 private static final Pattern BENCHMARK_PATTERN = Pattern .compile (
6973 "(AIME 2024|MATH-500|CNMO 2024|GPQA(?: Diamond)?|LiveCodeBench|Codeforces|SWE Verified|Aider-Polyglot|MMLU(?:-Redux|-Pro)?|DROP|IF-Eval|SimpleQA|FRAMES|AlpacaEval2\\ .0|ArenaHard|CLUEWSC|C-Eval|C-SimpleQA)" ,
7074 Pattern .CASE_INSENSITIVE
@@ -88,10 +92,15 @@ public class MarkdownGenerator implements Closeable {
8892
8993 public void writeToMarkdown (List <List <IObject >> contents ) {
9094 try {
95+ List <Set <Integer >> pageSkipIndices = new java .util .ArrayList <>(contents .size ());
96+ for (List <IObject > pageContents : contents ) {
97+ pageSkipIndices .add (collectTableArtifactIndices (pageContents ));
98+ }
99+ extendCrossPageTableArtifactSkips (contents , pageSkipIndices );
91100 for (int pageNumber = 0 ; pageNumber < StaticContainers .getDocument ().getNumberOfPages (); pageNumber ++) {
92101 writePageSeparator (pageNumber );
93102 List <IObject > pageContents = contents .get (pageNumber );
94- Set <Integer > skipIndices = collectTableArtifactIndices ( pageContents );
103+ Set <Integer > skipIndices = pageSkipIndices . get ( pageNumber );
95104 for (int contentIndex = 0 ; contentIndex < pageContents .size (); contentIndex ++) {
96105 if (skipIndices .contains (contentIndex )) {
97106 continue ;
@@ -360,6 +369,26 @@ protected boolean shouldWriteTableBody() {
360369 return Config .MARKDOWN_TABLE_OUTPUT_FULL .equals (markdownTableOutput );
361370 }
362371
372+ protected void extendCrossPageTableArtifactSkips (List <List <IObject >> contents , List <Set <Integer >> pageSkipIndices ) {
373+ if (Config .MARKDOWN_TABLE_OUTPUT_FULL .equals (markdownTableOutput )) {
374+ return ;
375+ }
376+ for (int pageNumber = 0 ; pageNumber < contents .size (); pageNumber ++) {
377+ List <IObject > pageContents = contents .get (pageNumber );
378+ Set <Integer > pageSkips = pageSkipIndices .get (pageNumber );
379+
380+ int firstMeaningful = findFirstMeaningfulContentIndex (pageContents , pageSkips );
381+ if (firstMeaningful >= 0 && isTableCaptionText (normalizeContentText (pageContents .get (firstMeaningful ))) && pageNumber > 0 ) {
382+ walkTableArtifactRange (contents .get (pageNumber - 1 ), pageSkipIndices .get (pageNumber - 1 ), contents .get (pageNumber - 1 ).size (), -1 );
383+ }
384+
385+ int lastMeaningful = findLastMeaningfulContentIndex (pageContents , pageSkips );
386+ if (lastMeaningful >= 0 && isTableCaptionText (normalizeContentText (pageContents .get (lastMeaningful ))) && pageNumber + 1 < contents .size ()) {
387+ walkTableArtifactRange (contents .get (pageNumber + 1 ), pageSkipIndices .get (pageNumber + 1 ), -1 , 1 );
388+ }
389+ }
390+ }
391+
363392 protected Set <Integer > collectTableArtifactIndices (List <IObject > pageContents ) {
364393 Set <Integer > skip = new HashSet <>();
365394 if (Config .MARKDOWN_TABLE_OUTPUT_FULL .equals (markdownTableOutput )) {
@@ -411,6 +440,11 @@ protected void walkTableArtifactRange(List<IObject> pageContents, Set<Integer> s
411440 continue ;
412441 }
413442 if (looksNarrativeText (text )) {
443+ if (direction > 0 && shouldSkipDanglingNarrativeFragment (pageContents , index , text )) {
444+ skip .add (index );
445+ index += direction ;
446+ continue ;
447+ }
414448 break ;
415449 }
416450 break ;
@@ -421,11 +455,51 @@ protected boolean isHeadingContent(IObject content) {
421455 return content instanceof SemanticHeading ;
422456 }
423457
458+ protected int findFirstMeaningfulContentIndex (List <IObject > pageContents , Set <Integer > skip ) {
459+ for (int index = 0 ; index < pageContents .size (); index ++) {
460+ if (skip .contains (index )) {
461+ continue ;
462+ }
463+ String text = normalizeContentText (pageContents .get (index ));
464+ if (!text .isEmpty () || pageContents .get (index ) instanceof TableBorder ) {
465+ return index ;
466+ }
467+ }
468+ return -1 ;
469+ }
470+
471+ protected int findLastMeaningfulContentIndex (List <IObject > pageContents , Set <Integer > skip ) {
472+ for (int index = pageContents .size () - 1 ; index >= 0 ; index --) {
473+ if (skip .contains (index )) {
474+ continue ;
475+ }
476+ String text = normalizeContentText (pageContents .get (index ));
477+ if (!text .isEmpty () || pageContents .get (index ) instanceof TableBorder ) {
478+ return index ;
479+ }
480+ }
481+ return -1 ;
482+ }
483+
424484 protected boolean isTableOutputOff () {
425485 return Config .MARKDOWN_TABLE_OUTPUT_OFF .equals (markdownTableOutput );
426486 }
427487
428488 protected String normalizeContentText (IObject content ) {
489+ if (content instanceof PDFList ) {
490+ StringBuilder builder = new StringBuilder ();
491+ for (ListItem item : ((PDFList ) content ).getListItems ()) {
492+ String value = String .valueOf (item ).replaceAll ("\\ s+" , " " ).trim ();
493+ if (value .isEmpty ()) {
494+ continue ;
495+ }
496+ if (builder .length () > 0 ) {
497+ builder .append (' ' );
498+ }
499+ builder .append (value );
500+ }
501+ return builder .toString ();
502+ }
429503 if (!(content instanceof SemanticTextNode )) {
430504 return "" ;
431505 }
@@ -466,6 +540,9 @@ protected boolean looksTableArtifactText(String text) {
466540 if (NUMERIC_ONLY_PATTERN .matcher (text ).matches ()) {
467541 return true ;
468542 }
543+ if (FOOTNOTE_PLACEHOLDER_PATTERN .matcher (text ).matches ()) {
544+ return true ;
545+ }
469546 if (TABLE_HEADER_TEXT_PATTERN .matcher (text ).matches ()) {
470547 return true ;
471548 }
@@ -507,6 +584,26 @@ protected boolean looksTableArtifactText(String text) {
507584 return false ;
508585 }
509586
587+ protected boolean shouldSkipDanglingNarrativeFragment (List <IObject > pageContents , int index , String text ) {
588+ if (text .isEmpty () || !Character .isLowerCase (text .charAt (0 ))) {
589+ return false ;
590+ }
591+ for (int nextIndex = index + 1 ; nextIndex < pageContents .size (); nextIndex ++) {
592+ IObject next = pageContents .get (nextIndex );
593+ String nextText = normalizeContentText (next );
594+ if (nextText .isEmpty ()) {
595+ continue ;
596+ }
597+ if (isHeadingContent (next ) || isTableCaptionText (nextText )) {
598+ return true ;
599+ }
600+ if (looksNarrativeText (nextText ) && !looksTableArtifactText (nextText )) {
601+ return false ;
602+ }
603+ }
604+ return false ;
605+ }
606+
510607 protected String getLineBreak () {
511608 if (isInsideTable ()) {
512609 return MarkdownSyntax .HTML_LINE_BREAK_TAG ;
0 commit comments