Skip to content

Commit eac53ed

Browse files
LonelyMidoriya authored and MaximPlusov committed
Add pdf version option (from pdfua)
1 parent a50269e commit eac53ed

4 files changed

Lines changed: 12 additions & 14 deletions

File tree

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/AutoTagger.java

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,9 @@ private AutoTagger() {
5959
* @return result containing the tagged PDDocument and timing metadata
6060
* @throws IOException if unable to read or process the PDF
6161
*/
62-
public static TaggingResult tag(String inputPdf, Config config) throws IOException {
62+
public static TaggingResult tag(String inputPdf, Config config, Float pdfVersion) throws IOException {
6363
ExtractionResult extraction = DocumentProcessor.extractContents(inputPdf, config);
64-
return tag(inputPdf, extraction);
64+
return tag(extraction, pdfVersion);
6565
}
6666

6767
/**
@@ -70,15 +70,14 @@ public static TaggingResult tag(String inputPdf, Config config) throws IOExcepti
7070
* same extraction — call {@link DocumentProcessor#extractContents} once,
7171
* then pass the result here and to other output generators.
7272
*
73-
* @param inputPdf path to the input PDF file (used for metadata)
7473
* @param extraction pre-computed extraction result
7574
* @return result containing the tagged PDDocument and timing metadata
7675
* @throws IOException if unable to tag the document
7776
*/
78-
public static TaggingResult tag(String inputPdf, ExtractionResult extraction) throws IOException {
77+
public static TaggingResult tag(ExtractionResult extraction, Float pdfVersion) throws IOException {
7978
long t0 = System.nanoTime();
8079
PDDocument document = StaticResources.getDocument();
81-
AutoTaggingProcessor.tagDocument(new File(inputPdf), document, extraction.getContents());
80+
AutoTaggingProcessor.tagDocument(document, extraction.getContents(), pdfVersion);
8281
long taggingNs = System.nanoTime() - t0;
8382

8483
return new TaggingResult(document, extraction.getExtractionNs(), taggingNs,

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/OutputWriter.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
* tagged PDF) from a pre-computed {@link ExtractionResult}.
2626
*
2727
* <p>Use this when you have already run extraction once (e.g. via
28-
* {@link AutoTagger#tag(String, ExtractionResult)}) and want to emit file
28+
* {@link AutoTagger#tag(ExtractionResult, Float)}) and want to emit file
2929
* outputs from that same result without re-extracting.
3030
*
3131
* <p>Typical two-phase usage:

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/AutoTaggingProcessor.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,10 @@ public class AutoTaggingProcessor {
5959
* Tag a PDF document in-memory without saving to disk.
6060
* Adds structure tree, marked content references, and parent tree to the document.
6161
*
62-
* @param inputPDF the original PDF file (used for metadata only)
6362
* @param document the PDDocument to tag (modified in place)
6463
* @param contents extracted content by page
6564
*/
66-
public static synchronized void tagDocument(File inputPDF, PDDocument document, List<List<IObject>> contents) throws IOException {
65+
public static synchronized void tagDocument(PDDocument document, List<List<IObject>> contents, Float pdfVersion) throws IOException {
6766
operatorIndexesToStreamInfosMap.clear();
6867
structParents.clear();
6968
structParentsIntegers.clear();
@@ -73,7 +72,7 @@ public static synchronized void tagDocument(File inputPDF, PDDocument document,
7372
annotationBBoxesMap.clear();
7473
currentStructParent = 0;
7574
imageChunkFigureCounter = 0;
76-
isPDF2_0 = document.getVersion() == 2.0F;
75+
isPDF2_0 = pdfVersion != null ? pdfVersion == 2.0F : document.getVersion() == 2.0F;
7776
COSDocument cosDocument = document.getDocument();
7877
PDCatalog catalog = document.getCatalog();
7978
COSObject structTreeRoot = createStructTreeRoot(catalog, cosDocument, document);
@@ -90,7 +89,7 @@ public static synchronized void tagDocument(File inputPDF, PDDocument document,
9089
* Tag a PDF document and save to disk. Existing behavior preserved.
9190
*/
9291
public static synchronized void createTaggedPDF(File inputPDF, String outputFolder, PDDocument document, List<List<IObject>> contents) throws IOException {
93-
tagDocument(inputPDF, document, contents);
92+
tagDocument(document, contents, null);
9493
String outputFileName = outputFolder + File.separator +
9594
inputPDF.getName().substring(0, inputPDF.getName().length() - 4) + "_tagged.pdf";
9695
document.saveAs(outputFileName);

java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/api/AutoTaggerTest.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ class AutoTaggerTest {
1717
void tagReturnsDocumentWithStructTree() throws Exception {
1818
Config config = new Config();
1919

20-
try (TaggingResult result = AutoTagger.tag(TEST_PDF, config)) {
20+
try (TaggingResult result = AutoTagger.tag(TEST_PDF, config, null)) {
2121
PDDocument doc = result.getDocument();
2222
assertThat(doc).isNotNull();
2323
assertThat(doc.getCatalog().getKey(org.verapdf.as.ASAtom.STRUCT_TREE_ROOT).empty())
@@ -30,7 +30,7 @@ void tagReturnsDocumentWithStructTree() throws Exception {
3030
void tagTimingsArePositive() throws Exception {
3131
Config config = new Config();
3232

33-
try (TaggingResult result = AutoTagger.tag(TEST_PDF, config)) {
33+
try (TaggingResult result = AutoTagger.tag(TEST_PDF, config, null)) {
3434
assertThat(result.getExtractionNs()).isGreaterThan(0);
3535
assertThat(result.getTaggingNs()).isGreaterThan(0);
3636
}
@@ -41,7 +41,7 @@ void saveToWritesFile(@TempDir Path tempDir) throws Exception {
4141
Config config = new Config();
4242
String outputPath = tempDir.resolve("output_tagged.pdf").toString();
4343

44-
try (TaggingResult result = AutoTagger.tag(TEST_PDF, config)) {
44+
try (TaggingResult result = AutoTagger.tag(TEST_PDF, config, null)) {
4545
result.saveTo(outputPath);
4646
}
4747

@@ -55,7 +55,7 @@ void tagIgnoresOutputFormatFlags() throws Exception {
5555
config.setGenerateJSON(true);
5656
config.setGenerateMarkdown(true);
5757

58-
try (TaggingResult result = AutoTagger.tag(TEST_PDF, config)) {
58+
try (TaggingResult result = AutoTagger.tag(TEST_PDF, config, null)) {
5959
assertThat(result.getDocument()).isNotNull();
6060
}
6161
}

0 commit comments

Comments
 (0)