Skip to content

Commit 050a397

Browse files
bundolee and claude committed
refactor(auto-tagging): adopt Raman's Caption placement approach from #377
Replace the 2-pass caption-to-float mapping (521b274) with Raman Kakhnovich's addKids/addCaptionIfPresent pattern from origin/auto_tagging #377 (193a3b5).

Key changes:
- addKids(): the first pass collects Captions keyed by linkedContentId; the second pass creates the non-Caption struct elements (Captions are skipped).
- addCaptionIfPresent(): called by Figure/Table/List/Part after the struct element is created; attaches the Caption as the first or last child based on its spatial position.
- isCaptionFirstChild(): compares the caption bounding box with the parent bounding box to decide placement.
- addStructElement(): new overload with an isFirstKid flag for Caption-first insertion.
- Removed: captionToFloat map, floatStructElems map, isFloatElement().
- Kept: heading normalization, Link annotations, Formula, EnrichedImageChunk.

Reference: Raman Kakhnovich's origin/auto_tagging 193a3b5 (#377)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 8d9a1b2 commit 050a397

1 file changed

Lines changed: 74 additions & 85 deletions

File tree

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/AutoTaggingProcessor.java

Lines changed: 74 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import org.verapdf.wcag.algorithms.entities.content.TextLine;
2222
import org.verapdf.wcag.algorithms.entities.content.TextChunk;
2323
import org.verapdf.wcag.algorithms.entities.content.TextColumn;
24+
import org.verapdf.wcag.algorithms.entities.geometry.BoundingBox;
2425
import org.verapdf.wcag.algorithms.entities.lists.ListItem;
2526
import org.verapdf.wcag.algorithms.entities.lists.PDFList;
2627
import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorder;
@@ -39,6 +40,8 @@ public class AutoTaggingProcessor {
3940
private static final Map<OperatorStreamKey, Integer> structParentsIntegers = new HashMap<>();
4041
// annotation StructParent entries: int key -> single struct element (Link)
4142
private static final Map<Integer, COSObject> annotationStructParents = new HashMap<>();
43+
// Caption elements keyed by their linked content ID (Raman's approach from #377)
44+
private static final Map<Long, SemanticCaption> structElementIdToCaptionMap = new HashMap<>();
4245
private static boolean isPDF2_0 = false;
4346
private static final int MAX_TOKENS_PER_STREAM = 100_000;
4447

@@ -47,6 +50,7 @@ public static synchronized void createTaggedPDF(File inputPDF, String outputFold
4750
structParents.clear();
4851
structParentsIntegers.clear();
4952
annotationStructParents.clear();
53+
structElementIdToCaptionMap.clear();
5054
imageChunkFigureCounter = 0;
5155
if (document.getVersion() == 2.0F) {
5256
isPDF2_0 = true;
@@ -156,10 +160,18 @@ private static void createParentTree(COSDocument cosDocument, COSObject structTr
156160
}
157161

158162
private static COSObject addStructElement(COSObject parent, COSDocument cosDocument, String type, Integer pageNumber) {
163+
return addStructElement(parent, cosDocument, type, pageNumber, false);
164+
}
165+
166+
private static COSObject addStructElement(COSObject parent, COSDocument cosDocument, String type, Integer pageNumber, boolean isFirstKid) {
159167
COSObject structElement = COSIndirect.construct(COSDictionary.construct(), cosDocument);
160168
COSObject k = parent.getKey(ASAtom.K);
161169
if (k.getType() == COSObjType.COS_ARRAY) {
162-
k.add(structElement);
170+
if (isFirstKid) {
171+
k.insert(0, structElement);
172+
} else {
173+
k.add(structElement);
174+
}
163175
} else {
164176
k = COSArray.construct();
165177
parent.setKey(ASAtom.K, k);
@@ -179,90 +191,71 @@ private static COSObject addStructElement(COSObject parent, COSDocument cosDocum
179191
public static COSObject createStructureTreeElements(List<List<IObject>> contents, COSObject structTreeRoot, COSDocument cosDocument) {
180192
COSObject seDocument = addStructElement(structTreeRoot, cosDocument, TaggedPDFConstants.DOCUMENT, null);
181193
Map<SemanticHeading, Integer> normalizedLevels = buildNormalizedHeadingLevels(contents);
182-
// Flatten all top-level content into a single ordered list for caption association.
183-
List<IObject> flat = new ArrayList<>();
184194
for (List<IObject> pageContents : contents) {
185-
flat.addAll(pageContents);
195+
addKids(pageContents, seDocument, cosDocument, normalizedLevels);
186196
}
187-
// Pre-compute which Caption maps to which float (Table/Figure) and whether it's
188-
// a pre-caption (goes first) or post-caption (goes last).
189-
// PDF/UA-2 §8.2.5.27: Caption must be first or last child of its parent.
190-
// §Table5: Document must not directly contain Caption.
191-
// Map each Caption index to its nearest adjacent float (Table/Figure) index.
192-
// Captions will be attached as last child of their float parent regardless of
193-
// source order, satisfying §8.2.5.27 (first or last child).
194-
Map<Integer, Integer> captionToFloat = new HashMap<>(); // caption index → float index
195-
for (int i = 0; i < flat.size(); i++) {
196-
if (!(flat.get(i) instanceof SemanticCaption)) continue;
197-
// Look ahead for next Table/Figure (within 2 items, skipping other captions)
198-
for (int j = i + 1; j < flat.size() && j <= i + 2; j++) {
199-
IObject next = flat.get(j);
200-
if (isFloatElement(next)) { captionToFloat.put(i, j); break; }
201-
if (!(next instanceof SemanticCaption)) break;
202-
}
203-
if (captionToFloat.containsKey(i)) continue;
204-
// Look behind for previous Table/Figure
205-
for (int j = i - 1; j >= 0 && j >= i - 2; j--) {
206-
IObject prev = flat.get(j);
207-
if (isFloatElement(prev)) { captionToFloat.put(i, j); break; }
208-
if (!(prev instanceof SemanticCaption)) break;
197+
return seDocument;
198+
}
199+
200+
/**
201+
* Adds child struct elements, collecting Captions and attaching them to their
202+
* linked float (Figure/Table/List) via addCaptionIfPresent().
203+
* Based on Raman Kakhnovich's approach from origin/auto_tagging #377.
204+
*/
205+
private static void addKids(List<IObject> contents, COSObject parentStructElem, COSDocument cosDocument,
206+
Map<SemanticHeading, Integer> normalizedLevels) {
207+
// First pass: collect Caption → linkedContentId mappings
208+
for (IObject content : contents) {
209+
if (content instanceof SemanticCaption) {
210+
structElementIdToCaptionMap.put(
211+
((SemanticCaption) content).getLinkedContentId(), (SemanticCaption) content);
209212
}
210213
}
211-
// Build float index → struct element map (created on demand during iteration)
212-
Map<Integer, COSObject> floatStructElems = new HashMap<>();
213-
// First pass: create all non-Caption elements; defer Captions
214-
for (int i = 0; i < flat.size(); i++) {
215-
IObject content = flat.get(i);
214+
// Second pass: create struct elements (skipping Captions — they are attached by addCaptionIfPresent)
215+
for (IObject content : contents) {
216216
if (content instanceof SemanticCaption) {
217-
continue; // deferred — handled after its float is created
217+
continue;
218218
}
219-
COSObject elem;
220-
if (content instanceof SemanticHeading) {
221-
elem = null;
222-
createHeadingStructElem((SemanticHeading) content, seDocument, cosDocument,
223-
normalizedLevels.get(content));
224-
} else if (content instanceof ImageChunk) {
225-
elem = createFigureStructElemReturning((ImageChunk) content, seDocument, cosDocument);
226-
floatStructElems.put(i, elem);
227-
} else if (content instanceof TableBorder) {
228-
TableBorder table = (TableBorder) content;
229-
if (table.isTextBlock()) {
230-
createPartStructElemForTextBlock(table, seDocument, cosDocument);
231-
elem = null;
232-
} else if (!table.isOneCellTable()) {
233-
elem = createTableStructElemReturning(table, seDocument, cosDocument);
234-
floatStructElems.put(i, elem);
235-
} else {
236-
elem = null;
237-
}
219+
if (content instanceof SemanticHeading && normalizedLevels != null) {
220+
createHeadingStructElem((SemanticHeading) content, parentStructElem, cosDocument,
221+
normalizedLevels.get(content));
238222
} else {
239-
createStructElem(content, seDocument, cosDocument);
240-
elem = null;
223+
createStructElem(content, parentStructElem, cosDocument);
241224
}
242225
}
243-
// Second pass: attach Captions to their float parent
244-
for (int i = 0; i < flat.size(); i++) {
245-
IObject content = flat.get(i);
246-
if (!(content instanceof SemanticCaption)) continue;
247-
Integer floatIdx = captionToFloat.get(i);
248-
COSObject floatElem = floatIdx != null ? floatStructElems.get(floatIdx) : null;
249-
if (floatElem != null) {
250-
createCaptionStructElem((SemanticCaption) content, floatElem, cosDocument);
251-
} else {
252-
// No adjacent float found — add directly to Document as fallback
253-
createCaptionStructElem((SemanticCaption) content, seDocument, cosDocument);
254-
}
226+
}
227+
228+
/** Overload for nested contexts (list items, table cells) where heading normalization is not applicable. */
229+
private static void addKids(List<IObject> contents, COSObject parentStructElem, COSDocument cosDocument) {
230+
addKids(contents, parentStructElem, cosDocument, null);
231+
}
232+
233+
/**
234+
* If a Caption is linked to this content element, attach it as first or last child
235+
* of the struct element based on spatial position.
236+
*/
237+
private static void addCaptionIfPresent(IObject content, COSObject linkedObject, COSDocument cosDocument) {
238+
Long linkedContentId = content.getRecognizedStructureId();
239+
if (linkedContentId != null && structElementIdToCaptionMap.containsKey(linkedContentId)) {
240+
SemanticCaption caption = structElementIdToCaptionMap.get(linkedContentId);
241+
boolean isFirst = isCaptionFirstChild(caption.getBoundingBox(), content.getBoundingBox());
242+
createCaptionStructElem(caption, linkedObject, cosDocument, isFirst);
255243
}
256-
return seDocument;
257244
}
258245

259-
private static boolean isFloatElement(IObject obj) {
260-
if (obj instanceof ImageChunk) return true;
261-
if (obj instanceof TableBorder) {
262-
TableBorder t = (TableBorder) obj;
263-
return !t.isTextBlock() && !t.isOneCellTable();
246+
/**
247+
* Determines if the caption should be the first child (above/before) or last child
248+
* (below/after) of its parent struct element.
249+
*/
250+
private static boolean isCaptionFirstChild(BoundingBox caption, BoundingBox parent) {
251+
if (caption == null || parent == null) return true;
252+
if (caption.getCenterY() > parent.getTopY()) {
253+
return true;
254+
} else if (caption.getCenterY() < parent.getBottomY()) {
255+
return false;
256+
} else {
257+
return caption.getCenterX() < parent.getCenterX();
264258
}
265-
return false;
266259
}
267260

268261
/**
@@ -390,8 +383,6 @@ private static void createStructElem(IObject object, COSObject parentStructElem,
390383
((SemanticHeading) object).getHeadingLevel());
391384
} else if (object instanceof SemanticParagraph) {
392385
createParagraphStructElem((SemanticParagraph) object, parentStructElem, cosDocument);
393-
} else if (object instanceof SemanticCaption) {
394-
createCaptionStructElem((SemanticCaption) object, parentStructElem, cosDocument);
395386
} else if (object instanceof PDFList) {
396387
createListStructElem((PDFList) object, parentStructElem, cosDocument);
397388
} else if (object instanceof TableBorder) {
@@ -424,8 +415,8 @@ private static void createParagraphStructElem(SemanticParagraph paragraph, COSOb
424415
processTextNode(paragraph, paragraphObject);
425416
}
426417

427-
private static void createCaptionStructElem(SemanticCaption caption, COSObject parent, COSDocument cosDocument) {
428-
COSObject captionObject = addStructElement(parent, cosDocument, TaggedPDFConstants.CAPTION, caption.getPageNumber());
418+
private static void createCaptionStructElem(SemanticCaption caption, COSObject parent, COSDocument cosDocument, boolean isFirstChild) {
419+
COSObject captionObject = addStructElement(parent, cosDocument, TaggedPDFConstants.CAPTION, caption.getPageNumber(), isFirstChild);
429420
processTextNode(caption, captionObject);
430421
}
431422

@@ -448,6 +439,7 @@ private static COSObject createFigureStructElemReturning(ImageChunk image, COSOb
448439
COSString.construct(altText.getBytes(StandardCharsets.UTF_16), false));
449440
cosDocument.addChangedObject(figureObject);
450441
processImageNode(image, figureObject);
442+
addCaptionIfPresent(image, figureObject, cosDocument);
451443
return figureObject;
452444
}
453445

@@ -494,10 +486,9 @@ private static void createListStructElem(PDFList list, COSObject parent, COSDocu
494486
listItem.getFirstLine().getValue().length()));
495487
}
496488
processTextNode(lBodyTextNode, lBodyObject);
497-
for (IObject content : listItem.getContents()) {
498-
createStructElem(content, lBodyObject, cosDocument);
499-
}
489+
addKids(listItem.getContents(), lBodyObject, cosDocument);
500490
}
491+
addCaptionIfPresent(list, listObject, cosDocument);
501492
}
502493

503494
private static void createTableStructElem(TableBorder table, COSObject parent, COSDocument cosDocument) {
@@ -525,21 +516,19 @@ private static COSObject createTableStructElemReturning(TableBorder table, COSOb
525516
if (cell.getRowSpan() != 1) {
526517
addAttributeToStructElem(cellObject, ASAtom.TABLE, ASAtom.ROW_SPAN, COSInteger.construct(cell.getRowSpan()));
527518
}
528-
for (IObject cellContent : cell.getContents()) {
529-
createStructElem(cellContent, cellObject, cosDocument);
530-
}
519+
addKids(cell.getContents(), cellObject, cosDocument);
531520
}
532521
}
533522
}
523+
addCaptionIfPresent(table, tableObject, cosDocument);
534524
return tableObject;
535525
}
536526

537527
private static void createPartStructElemForTextBlock(TableBorder table, COSObject parent, COSDocument cosDocument) {
538528
COSObject partObject = addStructElement(parent, cosDocument, TaggedPDFConstants.PART, table.getPageNumber());
539529
TableBorderCell cell = table.getCell(0,0);
540-
for (IObject cellContent : cell.getContents()) {
541-
createStructElem(cellContent, partObject, cosDocument);
542-
}
530+
addKids(cell.getContents(), partObject, cosDocument);
531+
addCaptionIfPresent(table, partObject, cosDocument);
543532
}
544533

545534
private static void addAttributeToStructElem(COSObject structElement, ASAtom ownerASAtom, ASAtom attributeName,

0 commit comments

Comments
 (0)