Skip to content

Commit 7965cea

Browse files
Extract image alt for images in tagged PDFs (#438)
1 parent: 261aeea · commit: 7965cea

1 file changed

Lines changed: 17 additions & 13 deletions

File tree

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TaggedDocumentProcessor.java

Lines changed: 17 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
package org.opendataloader.pdf.processors;
22

33
import org.opendataloader.pdf.api.Config;
4+
import org.opendataloader.pdf.entities.EnrichedImageChunk;
45
import org.verapdf.gf.model.impl.sa.GFSANode;
56
import org.verapdf.wcag.algorithms.entities.*;
67
import org.verapdf.wcag.algorithms.entities.content.ImageChunk;
@@ -35,7 +36,7 @@ public static List<List<IObject>> processDocument(String inputPdfName, Config co
3536
contents.add(new ArrayList<>());
3637
}
3738
ITree tree = StaticContainers.getDocument().getTree();
38-
processStructElem(tree.getRoot());
39+
processStructElem(tree.getRoot(), null);
3940
List<List<IObject>> artifacts = collectArtifacts(totalPages);
4041
for (int pageNumber = 0; pageNumber < totalPages; pageNumber++) {
4142
if (!shouldProcessPage(pageNumber)) {
@@ -92,17 +93,17 @@ private static boolean shouldProcessPage(int pageNumber) {
9293
return pagesToProcess == null || pagesToProcess.contains(pageNumber);
9394
}
9495

95-
private static void processStructElem(INode node) {
96+
private static void processStructElem(INode node, INode parent) {
9697
if (node instanceof SemanticFigure) {
97-
processImage((SemanticFigure) node);
98+
processImage((SemanticFigure) node, parent);
9899
return;
99100
}
100101
if (node instanceof SemanticSpan) {
101102
processTextChunk((SemanticSpan) node);
102103
}
103104
if (node.getInitialSemanticType() == null) {
104105
for (INode child : node.getChildren()) {
105-
processStructElem(child);
106+
processStructElem(child, node);
106107
}
107108
return;
108109
}
@@ -133,7 +134,7 @@ private static void processStructElem(INode node) {
133134
break;
134135
default:
135136
for (INode child : node.getChildren()) {
136-
processStructElem(child);
137+
processStructElem(child, node);
137138
}
138139
}
139140
}
@@ -194,7 +195,7 @@ private static void processList(INode node) {
194195
list.add(listItem);
195196
}
196197
} else {
197-
processStructElem(child);
198+
processStructElem(child, node);
198199
}
199200
}
200201
addObjectToContent(list);
@@ -280,11 +281,11 @@ private static List<INode> processTableRows(INode table) {
280281
listTR.add(child);
281282
processTableRowsChildren(child);
282283
} else {
283-
processStructElem(child);
284+
processStructElem(child, elem);
284285
}
285286
}
286287
} else {
287-
processStructElem(elem);
288+
processStructElem(elem, table);
288289
}
289290
}
290291
return listTR;
@@ -294,7 +295,7 @@ private static void processTableRowsChildren(INode tableRow) {
294295
for (INode tableCell : tableRow.getChildren()) {
295296
SemanticType tableCellType = tableCell.getInitialSemanticType();
296297
if (SemanticType.TABLE_CELL != tableCellType && SemanticType.TABLE_HEADER != tableCellType) {
297-
processStructElem(tableCell);
298+
processStructElem(tableCell, tableRow);
298299
}
299300
}
300301
}
@@ -338,7 +339,7 @@ private static void processTableCell(TableBorderCell cell, INode elem) {
338339
private static void processChildContents(INode elem, List<IObject> contents) {
339340
contentsStack.add(contents);
340341
for (INode childChild : elem.getChildren()) {
341-
processStructElem(childChild);
342+
processStructElem(childChild, elem);
342343
}
343344
contentsStack.pop();
344345
}
@@ -395,10 +396,13 @@ private static void processTOC(INode toc) {
395396

396397
}
397398

398-
private static void processImage(SemanticFigure image) {
399+
private static void processImage(SemanticFigure image, INode parent) {
400+
GFSANode parentNode = (GFSANode) parent;
399401
List<ImageChunk> images = image.getImages();
400402
if (!images.isEmpty()) {
401-
addObjectToContent(images.get(0));
403+
String alt = parentNode.getStructElem().getStructElemDictionary().getAlternateDescription();
404+
ImageChunk imageChunk = images.get(0);
405+
addObjectToContent(alt == null ? imageChunk : new EnrichedImageChunk(imageChunk, alt));
402406
}
403407
}
404408

@@ -412,7 +416,7 @@ private static List<IObject> getContents(INode node) {
412416
if (child instanceof SemanticSpan) {
413417
result.add(((SemanticSpan)child).getColumns().get(0).getFirstLine().getFirstTextChunk());
414418
} else if (child instanceof SemanticFigure) {
415-
processImage((SemanticFigure)child);
419+
processImage((SemanticFigure)child, node);
416420
} else {
417421
result.addAll(getContents(child));
418422
}

0 commit comments

Comments (0)