2121import org .verapdf .wcag .algorithms .entities .content .TextLine ;
2222import org .verapdf .wcag .algorithms .entities .content .TextChunk ;
2323import org .verapdf .wcag .algorithms .entities .content .TextColumn ;
24+ import org .verapdf .wcag .algorithms .entities .geometry .BoundingBox ;
2425import org .verapdf .wcag .algorithms .entities .lists .ListItem ;
2526import org .verapdf .wcag .algorithms .entities .lists .PDFList ;
2627import org .verapdf .wcag .algorithms .entities .tables .tableBorders .TableBorder ;
@@ -39,6 +40,8 @@ public class AutoTaggingProcessor {
3940 private static final Map <OperatorStreamKey , Integer > structParentsIntegers = new HashMap <>();
4041 // annotation StructParent entries: int key -> single struct element (Link)
4142 private static final Map <Integer , COSObject > annotationStructParents = new HashMap <>();
43+ // Caption elements keyed by their linked content ID (Raman's approach from #377)
44+ private static final Map <Long , SemanticCaption > structElementIdToCaptionMap = new HashMap <>();
4245 private static boolean isPDF2_0 = false ;
4346 private static final int MAX_TOKENS_PER_STREAM = 100_000 ;
4447
@@ -47,6 +50,7 @@ public static synchronized void createTaggedPDF(File inputPDF, String outputFold
4750 structParents .clear ();
4851 structParentsIntegers .clear ();
4952 annotationStructParents .clear ();
53+ structElementIdToCaptionMap .clear ();
5054 imageChunkFigureCounter = 0 ;
5155 if (document .getVersion () == 2.0F ) {
5256 isPDF2_0 = true ;
@@ -156,10 +160,18 @@ private static void createParentTree(COSDocument cosDocument, COSObject structTr
156160 }
157161
158162 private static COSObject addStructElement (COSObject parent , COSDocument cosDocument , String type , Integer pageNumber ) {
163+ return addStructElement (parent , cosDocument , type , pageNumber , false );
164+ }
165+
166+ private static COSObject addStructElement (COSObject parent , COSDocument cosDocument , String type , Integer pageNumber , boolean isFirstKid ) {
159167 COSObject structElement = COSIndirect .construct (COSDictionary .construct (), cosDocument );
160168 COSObject k = parent .getKey (ASAtom .K );
161169 if (k .getType () == COSObjType .COS_ARRAY ) {
162- k .add (structElement );
170+ if (isFirstKid ) {
171+ k .insert (0 , structElement );
172+ } else {
173+ k .add (structElement );
174+ }
163175 } else {
164176 k = COSArray .construct ();
165177 parent .setKey (ASAtom .K , k );
@@ -179,90 +191,71 @@ private static COSObject addStructElement(COSObject parent, COSDocument cosDocum
179191 public static COSObject createStructureTreeElements (List <List <IObject >> contents , COSObject structTreeRoot , COSDocument cosDocument ) {
180192 COSObject seDocument = addStructElement (structTreeRoot , cosDocument , TaggedPDFConstants .DOCUMENT , null );
181193 Map <SemanticHeading , Integer > normalizedLevels = buildNormalizedHeadingLevels (contents );
182- // Flatten all top-level content into a single ordered list for caption association.
183- List <IObject > flat = new ArrayList <>();
184194 for (List <IObject > pageContents : contents ) {
185- flat . addAll (pageContents );
195+ addKids (pageContents , seDocument , cosDocument , normalizedLevels );
186196 }
187- // Pre-compute which Caption maps to which float (Table/Figure) and whether it's
188- // a pre-caption (goes first) or post-caption (goes last).
189- // PDF/UA-2 §8.2.5.27: Caption must be first or last child of its parent.
190- // §Table5: Document must not directly contain Caption.
191- // Map each Caption index to its nearest adjacent float (Table/Figure) index.
192- // Captions will be attached as last child of their float parent regardless of
193- // source order, satisfying §8.2.5.27 (first or last child).
194- Map <Integer , Integer > captionToFloat = new HashMap <>(); // caption index → float index
195- for (int i = 0 ; i < flat .size (); i ++) {
196- if (!(flat .get (i ) instanceof SemanticCaption )) continue ;
197- // Look ahead for next Table/Figure (within 2 items, skipping other captions)
198- for (int j = i + 1 ; j < flat .size () && j <= i + 2 ; j ++) {
199- IObject next = flat .get (j );
200- if (isFloatElement (next )) { captionToFloat .put (i , j ); break ; }
201- if (!(next instanceof SemanticCaption )) break ;
202- }
203- if (captionToFloat .containsKey (i )) continue ;
204- // Look behind for previous Table/Figure
205- for (int j = i - 1 ; j >= 0 && j >= i - 2 ; j --) {
206- IObject prev = flat .get (j );
207- if (isFloatElement (prev )) { captionToFloat .put (i , j ); break ; }
208- if (!(prev instanceof SemanticCaption )) break ;
197+ return seDocument ;
198+ }
199+
200+ /**
201+ * Adds child struct elements, collecting Captions and attaching them to their
202+ * linked float (Figure/Table/List) via addCaptionIfPresent().
203+ * Based on Raman Kakhnovich's approach from origin/auto_tagging #377.
204+ */
205+ private static void addKids (List <IObject > contents , COSObject parentStructElem , COSDocument cosDocument ,
206+ Map <SemanticHeading , Integer > normalizedLevels ) {
207+ // First pass: collect Caption → linkedContentId mappings
208+ for (IObject content : contents ) {
209+ if (content instanceof SemanticCaption ) {
210+ structElementIdToCaptionMap .put (
211+ ((SemanticCaption ) content ).getLinkedContentId (), (SemanticCaption ) content );
209212 }
210213 }
211- // Build float index → struct element map (created on demand during iteration)
212- Map <Integer , COSObject > floatStructElems = new HashMap <>();
213- // First pass: create all non-Caption elements; defer Captions
214- for (int i = 0 ; i < flat .size (); i ++) {
215- IObject content = flat .get (i );
214+ // Second pass: create struct elements (skipping Captions — they are attached by addCaptionIfPresent)
215+ for (IObject content : contents ) {
216216 if (content instanceof SemanticCaption ) {
217- continue ; // deferred — handled after its float is created
217+ continue ;
218218 }
219- COSObject elem ;
220- if (content instanceof SemanticHeading ) {
221- elem = null ;
222- createHeadingStructElem ((SemanticHeading ) content , seDocument , cosDocument ,
223- normalizedLevels .get (content ));
224- } else if (content instanceof ImageChunk ) {
225- elem = createFigureStructElemReturning ((ImageChunk ) content , seDocument , cosDocument );
226- floatStructElems .put (i , elem );
227- } else if (content instanceof TableBorder ) {
228- TableBorder table = (TableBorder ) content ;
229- if (table .isTextBlock ()) {
230- createPartStructElemForTextBlock (table , seDocument , cosDocument );
231- elem = null ;
232- } else if (!table .isOneCellTable ()) {
233- elem = createTableStructElemReturning (table , seDocument , cosDocument );
234- floatStructElems .put (i , elem );
235- } else {
236- elem = null ;
237- }
219+ if (content instanceof SemanticHeading && normalizedLevels != null ) {
220+ createHeadingStructElem ((SemanticHeading ) content , parentStructElem , cosDocument ,
221+ normalizedLevels .get (content ));
238222 } else {
239- createStructElem (content , seDocument , cosDocument );
240- elem = null ;
223+ createStructElem (content , parentStructElem , cosDocument );
241224 }
242225 }
243- // Second pass: attach Captions to their float parent
244- for (int i = 0 ; i < flat .size (); i ++) {
245- IObject content = flat .get (i );
246- if (!(content instanceof SemanticCaption )) continue ;
247- Integer floatIdx = captionToFloat .get (i );
248- COSObject floatElem = floatIdx != null ? floatStructElems .get (floatIdx ) : null ;
249- if (floatElem != null ) {
250- createCaptionStructElem ((SemanticCaption ) content , floatElem , cosDocument );
251- } else {
252- // No adjacent float found — add directly to Document as fallback
253- createCaptionStructElem ((SemanticCaption ) content , seDocument , cosDocument );
254- }
226+ }
227+
228+ /** Overload for nested contexts (list items, table cells) where heading normalization is not applicable. */
229+ private static void addKids (List <IObject > contents , COSObject parentStructElem , COSDocument cosDocument ) {
230+ addKids (contents , parentStructElem , cosDocument , null );
231+ }
232+
233+ /**
234+ * If a Caption is linked to this content element, attach it as first or last child
235+ * of the struct element based on spatial position.
236+ */
237+ private static void addCaptionIfPresent (IObject content , COSObject linkedObject , COSDocument cosDocument ) {
238+ Long linkedContentId = content .getRecognizedStructureId ();
239+ if (linkedContentId != null && structElementIdToCaptionMap .containsKey (linkedContentId )) {
240+ SemanticCaption caption = structElementIdToCaptionMap .get (linkedContentId );
241+ boolean isFirst = isCaptionFirstChild (caption .getBoundingBox (), content .getBoundingBox ());
242+ createCaptionStructElem (caption , linkedObject , cosDocument , isFirst );
255243 }
256- return seDocument ;
257244 }
258245
259- private static boolean isFloatElement (IObject obj ) {
260- if (obj instanceof ImageChunk ) return true ;
261- if (obj instanceof TableBorder ) {
262- TableBorder t = (TableBorder ) obj ;
263- return !t .isTextBlock () && !t .isOneCellTable ();
246+ /**
247+ * Determines if the caption should be the first child (above/before) or last child
248+ * (below/after) of its parent struct element.
249+ */
250+ private static boolean isCaptionFirstChild (BoundingBox caption , BoundingBox parent ) {
251+ if (caption == null || parent == null ) return true ;
252+ if (caption .getCenterY () > parent .getTopY ()) {
253+ return true ;
254+ } else if (caption .getCenterY () < parent .getBottomY ()) {
255+ return false ;
256+ } else {
257+ return caption .getCenterX () < parent .getCenterX ();
264258 }
265- return false ;
266259 }
267260
268261 /**
@@ -390,8 +383,6 @@ private static void createStructElem(IObject object, COSObject parentStructElem,
390383 ((SemanticHeading ) object ).getHeadingLevel ());
391384 } else if (object instanceof SemanticParagraph ) {
392385 createParagraphStructElem ((SemanticParagraph ) object , parentStructElem , cosDocument );
393- } else if (object instanceof SemanticCaption ) {
394- createCaptionStructElem ((SemanticCaption ) object , parentStructElem , cosDocument );
395386 } else if (object instanceof PDFList ) {
396387 createListStructElem ((PDFList ) object , parentStructElem , cosDocument );
397388 } else if (object instanceof TableBorder ) {
@@ -424,8 +415,8 @@ private static void createParagraphStructElem(SemanticParagraph paragraph, COSOb
424415 processTextNode (paragraph , paragraphObject );
425416 }
426417
427- private static void createCaptionStructElem (SemanticCaption caption , COSObject parent , COSDocument cosDocument ) {
428- COSObject captionObject = addStructElement (parent , cosDocument , TaggedPDFConstants .CAPTION , caption .getPageNumber ());
418+ private static void createCaptionStructElem (SemanticCaption caption , COSObject parent , COSDocument cosDocument , boolean isFirstChild ) {
419+ COSObject captionObject = addStructElement (parent , cosDocument , TaggedPDFConstants .CAPTION , caption .getPageNumber (), isFirstChild );
429420 processTextNode (caption , captionObject );
430421 }
431422
@@ -448,6 +439,7 @@ private static COSObject createFigureStructElemReturning(ImageChunk image, COSOb
448439 COSString .construct (altText .getBytes (StandardCharsets .UTF_16 ), false ));
449440 cosDocument .addChangedObject (figureObject );
450441 processImageNode (image , figureObject );
442+ addCaptionIfPresent (image , figureObject , cosDocument );
451443 return figureObject ;
452444 }
453445
@@ -494,10 +486,9 @@ private static void createListStructElem(PDFList list, COSObject parent, COSDocu
494486 listItem .getFirstLine ().getValue ().length ()));
495487 }
496488 processTextNode (lBodyTextNode , lBodyObject );
497- for (IObject content : listItem .getContents ()) {
498- createStructElem (content , lBodyObject , cosDocument );
499- }
489+ addKids (listItem .getContents (), lBodyObject , cosDocument );
500490 }
491+ addCaptionIfPresent (list , listObject , cosDocument );
501492 }
502493
503494 private static void createTableStructElem (TableBorder table , COSObject parent , COSDocument cosDocument ) {
@@ -525,21 +516,19 @@ private static COSObject createTableStructElemReturning(TableBorder table, COSOb
525516 if (cell .getRowSpan () != 1 ) {
526517 addAttributeToStructElem (cellObject , ASAtom .TABLE , ASAtom .ROW_SPAN , COSInteger .construct (cell .getRowSpan ()));
527518 }
528- for (IObject cellContent : cell .getContents ()) {
529- createStructElem (cellContent , cellObject , cosDocument );
530- }
519+ addKids (cell .getContents (), cellObject , cosDocument );
531520 }
532521 }
533522 }
523+ addCaptionIfPresent (table , tableObject , cosDocument );
534524 return tableObject ;
535525 }
536526
537527 private static void createPartStructElemForTextBlock (TableBorder table , COSObject parent , COSDocument cosDocument ) {
538528 COSObject partObject = addStructElement (parent , cosDocument , TaggedPDFConstants .PART , table .getPageNumber ());
539529 TableBorderCell cell = table .getCell (0 ,0 );
540- for (IObject cellContent : cell .getContents ()) {
541- createStructElem (cellContent , partObject , cosDocument );
542- }
530+ addKids (cell .getContents (), partObject , cosDocument );
531+ addCaptionIfPresent (table , partObject , cosDocument );
543532 }
544533
545534 private static void addAttributeToStructElem (COSObject structElement , ASAtom ownerASAtom , ASAtom attributeName ,
0 commit comments