Skip to content

Commit 39293ad

Browse files
Update caption order
1 parent fd4a962 commit 39293ad

1 file changed

Lines changed: 51 additions & 17 deletions

File tree

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/AutoTaggingProcessor.java

Lines changed: 51 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import org.verapdf.wcag.algorithms.entities.content.TextLine;
1818
import org.verapdf.wcag.algorithms.entities.content.TextChunk;
1919
import org.verapdf.wcag.algorithms.entities.content.TextColumn;
20+
import org.verapdf.wcag.algorithms.entities.geometry.BoundingBox;
2021
import org.verapdf.wcag.algorithms.entities.lists.ListItem;
2122
import org.verapdf.wcag.algorithms.entities.lists.PDFList;
2223
import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorder;
@@ -32,6 +33,7 @@ public class AutoTaggingProcessor {
3233
private static final Map<OperatorStreamKey, Map<Integer, Set<StreamInfo>>> operatorIndexesToStreamInfosMap = new HashMap<>();
3334
private static final Map<OperatorStreamKey, List<COSObject>> structParents = new HashMap<>();
3435
private static final Map<OperatorStreamKey, Integer> structParentsIntegers = new HashMap<>();
36+
// Captions keyed by the id of the content they are linked to. Filled by addKids from
// SemanticCaption.getLinkedContentId() and looked up by addCaptionIfPresent using the
// linked object's getRecognizedStructureId().
private static final Map<Long, SemanticCaption> structElementIdToCaptionMap = new HashMap<>();
3537
private static boolean isPDF2_0 = false;
3638
private static final int MAX_TOKENS_PER_STREAM = 100_000;
3739

@@ -137,10 +139,18 @@ private static void createParentTree(COSDocument cosDocument, COSObject structTr
137139
}
138140

139141
private static COSObject addStructElement(COSObject parent, COSDocument cosDocument, String type, Integer pageNumber) {
142+
return addStructElement(parent, cosDocument, type, pageNumber, false);
143+
}
144+
145+
private static COSObject addStructElement(COSObject parent, COSDocument cosDocument, String type, Integer pageNumber, boolean isFirstKid) {
140146
COSObject structElement = COSIndirect.construct(COSDictionary.construct(), cosDocument);
141147
COSObject k = parent.getKey(ASAtom.K);
142148
if (k.getType() == COSObjType.COS_ARRAY) {
143-
k.add(structElement);
149+
if (isFirstKid) {
150+
k.insert(0, structElement);
151+
} else {
152+
k.add(structElement);
153+
}
144154
} else {
145155
k = COSArray.construct();
146156
parent.setKey(ASAtom.K, k);
@@ -160,9 +170,7 @@ private static COSObject addStructElement(COSObject parent, COSDocument cosDocum
160170
public static void createStructureTreeElements(List<List<IObject>> contents, COSObject structTreeRoot, COSDocument cosDocument) {
161171
COSObject seDocument = addStructElement(structTreeRoot, cosDocument, TaggedPDFConstants.DOCUMENT, null);
162172
for (List<IObject> pageContents : contents) {
163-
for (IObject content : pageContents) {
164-
createStructElem(content, seDocument, cosDocument);
165-
}
173+
addKids(pageContents, seDocument, cosDocument);
166174
}
167175
}
168176

@@ -171,8 +179,6 @@ private static void createStructElem(IObject object, COSObject parentStructElem,
171179
createHeadingStructElem((SemanticHeading) object, parentStructElem, cosDocument);
172180
} else if (object instanceof SemanticParagraph) {
173181
createParagraphStructElem((SemanticParagraph) object, parentStructElem, cosDocument);
174-
} else if (object instanceof SemanticCaption) {
175-
createCaptionStructElem((SemanticCaption) object, parentStructElem, cosDocument);
176182
} else if (object instanceof PDFList) {
177183
createListStructElem((PDFList) object, parentStructElem, cosDocument);
178184
} else if (object instanceof TableBorder) {
@@ -192,15 +198,16 @@ private static void createHeadingStructElem(SemanticHeading heading, COSObject p
192198
isPDF2_0 ? TaggedPDFConstants.H + heading.getHeadingLevel() : TaggedPDFConstants.H,
193199
heading.getPageNumber());
194200
processTextNode(heading, headingObject);
201+
addCaptionIfPresent(heading, headingObject, cosDocument);
195202
}
196203

197204
private static void createParagraphStructElem(SemanticParagraph paragraph, COSObject parent, COSDocument cosDocument) {
198205
COSObject paragraphObject = addStructElement(parent, cosDocument, TaggedPDFConstants.P, paragraph.getPageNumber());
199206
processTextNode(paragraph, paragraphObject);
200207
}
201208

202-
private static void createCaptionStructElem(SemanticCaption caption, COSObject parent, COSDocument cosDocument) {
203-
COSObject captionObject = addStructElement(parent, cosDocument, TaggedPDFConstants.CAPTION, caption.getPageNumber());
209+
private static void createCaptionStructElem(SemanticCaption caption, COSObject parent, COSDocument cosDocument, boolean isFirstChild) {
210+
COSObject captionObject = addStructElement(parent, cosDocument, TaggedPDFConstants.CAPTION, caption.getPageNumber(), isFirstChild);
204211
processTextNode(caption, captionObject);
205212
}
206213

@@ -209,6 +216,7 @@ private static void createFigureStructElem(ImageChunk image, COSObject parent, C
209216
double[] bbox = {image.getLeftX(), image.getBottomY(), image.getRightX(), image.getTopY()};
210217
addAttributeToStructElem(figureObject, ASAtom.LAYOUT, ASAtom.BBOX, COSArray.construct(4, bbox));
211218
processImageNode(image, figureObject);
219+
addCaptionIfPresent(image, figureObject, cosDocument);
212220
//TODO: add height and width attributes
213221
}
214222

@@ -244,10 +252,9 @@ private static void createListStructElem(PDFList list, COSObject parent, COSDocu
244252
listItem.getFirstLine().getValue().length()));
245253
}
246254
processTextNode(lBodyTextNode, lBodyObject);
247-
for (IObject content : listItem.getContents()) {
248-
createStructElem(content, lBodyObject, cosDocument);
249-
}
255+
addKids(listItem.getContents(), lBodyObject, cosDocument);
250256
}
257+
addCaptionIfPresent(list, listObject, cosDocument);
251258
}
252259

253260
private static void createTableStructElem(TableBorder table, COSObject parent, COSDocument cosDocument) {
@@ -265,20 +272,18 @@ private static void createTableStructElem(TableBorder table, COSObject parent, C
265272
if (cell.getRowSpan() != 1) {
266273
addAttributeToStructElem(cellObject, ASAtom.TABLE, ASAtom.ROW_SPAN, COSInteger.construct(cell.getRowSpan()));
267274
}
268-
for (IObject cellContent : cell.getContents()) {
269-
createStructElem(cellContent, cellObject, cosDocument);
270-
}
275+
addKids(cell.getContents(), cellObject, cosDocument);
271276
}
272277
}
273278
}
279+
addCaptionIfPresent(table, tableObject, cosDocument);
274280
}
275281

276282
private static void createPartStructElemForTextBlock(TableBorder table, COSObject parent, COSDocument cosDocument) {
277283
COSObject partObject = addStructElement(parent, cosDocument, TaggedPDFConstants.PART, table.getPageNumber());
278284
TableBorderCell cell = table.getCell(0,0);
279-
for (IObject cellContent : cell.getContents()) {
280-
createStructElem(cellContent, partObject, cosDocument);
281-
}
285+
addKids(cell.getContents(), partObject, cosDocument);
286+
addCaptionIfPresent(table, partObject, cosDocument);
282287
}
283288

284289
private static void addAttributeToStructElem(COSObject structElement, ASAtom ownerASAtom, ASAtom attributeName,
@@ -398,4 +403,33 @@ public static Map<OperatorStreamKey, List<COSObject>> getStructParents() {
398403
/**
 * Returns the static {@code operatorIndexesToStreamInfosMap} field itself, not a copy, so
 * any change a caller makes to the returned map is visible to this class.
 *
 * @return the shared map held in {@code operatorIndexesToStreamInfosMap}
 */
public static Map<OperatorStreamKey, Map<Integer, Set<StreamInfo>>> getOperatorIndexesToStreamInfosMap() {
399404
return operatorIndexesToStreamInfosMap;
400405
}
406+
407+
private static void addKids(List<IObject> contents, COSObject parentStructElem, COSDocument cosDocument) {
408+
for (IObject content : contents) {
409+
if (content instanceof SemanticCaption) {
410+
structElementIdToCaptionMap.put(((SemanticCaption) content).getLinkedContentId(),(SemanticCaption) content);
411+
}
412+
}
413+
for (IObject content : contents) {
414+
createStructElem(content, parentStructElem, cosDocument);
415+
}
416+
}
417+
418+
private static void addCaptionIfPresent(IObject content, COSObject linkedObject, COSDocument cosDocument) {
419+
Long linkedContentId = content.getRecognizedStructureId();
420+
if (structElementIdToCaptionMap.containsKey(linkedContentId)) {
421+
SemanticCaption caption = structElementIdToCaptionMap.get(linkedContentId);
422+
createCaptionStructElem(caption, linkedObject, cosDocument, isCaptionFirstChild(caption.getBoundingBox(), content.getBoundingBox()));
423+
}
424+
}
425+
426+
public static boolean isCaptionFirstChild(BoundingBox caption, BoundingBox parent) {
427+
if (caption.getCenterY() > parent.getCenterY()) {
428+
return true;
429+
} else if (caption.getCenterY() < parent.getCenterY()) {
430+
return false;
431+
} else {
432+
return caption.getCenterX() < parent.getCenterX();
433+
}
434+
}
401435
}

0 commit comments

Comments
 (0)