Skip to content

Commit 42f53db

Browse files
committed
feat(api): add OutputWriter for two-phase extraction-then-output flow
Allows callers that have already run extraction (via DocumentProcessor.extractContents or AutoTagger.tag) to emit JSON/MD/HTML/PDF/text/images/tagged-PDF outputs from that same ExtractionResult, without re-parsing the PDF. The single-call processFile pipeline is unchanged; this is purely an additive public API. Internally exposes DocumentProcessor.generateOutputs() so the new api.OutputWriter facade can delegate to it; downstream callers should use OutputWriter rather than the processor directly. Use case: opendataloader-pdfua needs both file outputs (JSON/MD/HTML) and an in-memory tagged document (for accessibility remediation) from the same input. Previously this required two extraction passes (processFile + AutoTagger.tag); with OutputWriter the pdfua pipeline can extract once and feed both sinks.
1 parent 5fe5e8d commit 42f53db

2 files changed

Lines changed: 97 additions & 1 deletion

File tree

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
/*
2+
* Copyright 2025-2026 Hancom Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.opendataloader.pdf.api;
17+
18+
import org.opendataloader.pdf.processors.DocumentProcessor;
19+
import org.opendataloader.pdf.processors.ExtractionResult;
20+
21+
import java.io.IOException;
22+
23+
/**
24+
* Writes configured output files (JSON, Markdown, HTML, PDF, text, images,
25+
* tagged PDF) from a pre-computed {@link ExtractionResult}.
26+
*
27+
* <p>Use this when you have already run extraction once (e.g. via
28+
* {@link AutoTagger#tag(String, ExtractionResult)}) and want to emit file
29+
* outputs from that same result without re-extracting.
30+
*
31+
* <p>Typical two-phase usage:
32+
* <pre>{@code
33+
* Config config = new Config();
34+
* config.setOutputFolder("/out");
35+
* config.setGenerateJSON(true);
36+
* config.setGenerateMarkdown(true);
37+
*
38+
* // Phase 1: extract once
39+
* ExtractionResult extraction =
40+
* org.opendataloader.pdf.processors.DocumentProcessor.extractContents(
41+
* "input.pdf", config);
42+
*
43+
* // Phase 2a: write output files
44+
* OutputWriter.writeOutputs("input.pdf", extraction, config);
45+
*
46+
* // Phase 2b: tag in-memory and reuse the same extraction
47+
* try (TaggingResult tagged = AutoTagger.tag("input.pdf", extraction)) {
48+
* // ... use tagged.getDocument()
49+
* }
50+
* }</pre>
51+
*
52+
* <p>For the single-call extraction-and-output pipeline, use
53+
* {@link OpenDataLoaderPDF#processFile} instead.
54+
*/
55+
public final class OutputWriter {
56+
57+
private OutputWriter() {
58+
}
59+
60+
/**
61+
* Writes the output files configured on {@code config} (e.g.
62+
* {@code generateJSON}, {@code generateMarkdown}, {@code generateHtml},
63+
* {@code generatePDF}, {@code generateTaggedPDF}, {@code generateText})
64+
* using the supplied pre-computed extraction.
65+
*
66+
* <p>This method does <em>not</em> re-run extraction. Output behaviour is
67+
* identical to {@link OpenDataLoaderPDF#processFile} for the same
68+
* {@link Config}, including stdout mode, image directory resolution, and
69+
* tagged-PDF generation.
70+
*
71+
* @param inputPdfName path to the input PDF file (used for filename derivation
72+
* and tagged-PDF / annotated-PDF re-saves; not re-parsed)
73+
* @param extraction pre-computed extraction result (from
74+
* {@code DocumentProcessor.extractContents})
75+
* @param config configuration controlling which output formats to emit
76+
* @throws IOException if writing any output file fails
77+
*/
78+
public static void writeOutputs(String inputPdfName, ExtractionResult extraction, Config config)
79+
throws IOException {
80+
DocumentProcessor.generateOutputs(inputPdfName, extraction.getContents(), config,
81+
extraction.getElementMetadata());
82+
}
83+
}

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -383,7 +383,20 @@ private static boolean shouldProcessPage(int pageNumber, Set<Integer> pagesToPro
383383
return pagesToProcess == null || pagesToProcess.contains(pageNumber);
384384
}
385385

386-
private static void generateOutputs(String inputPdfName, List<List<IObject>> contents, Config config,
386+
/**
387+
* Writes the configured output files (JSON/MD/HTML/PDF/Text/images/tagged PDF)
388+
* from already-extracted contents.
389+
*
390+
* <p><strong>Internal API. Do not call directly.</strong> This method is
391+
* {@code public} only so the {@link org.opendataloader.pdf.api.OutputWriter}
392+
* facade in the {@code api} package can delegate to it. The signature
393+
* (notably the {@code List<List<IObject>>} and
394+
* {@code Map<Long, ElementMetadata>} parameters) is an implementation
395+
* detail and may change in any release. External callers must use
396+
* {@link org.opendataloader.pdf.api.OutputWriter#writeOutputs}, which is
397+
* the stable public API.
398+
*/
399+
public static void generateOutputs(String inputPdfName, List<List<IObject>> contents, Config config,
387400
Map<Long, ElementMetadata> elementMetadata) throws IOException {
388401
// Stdout mode: write primary format to stdout, skip file I/O
389402
if (config.isOutputStdout()) {

0 commit comments

Comments
 (0)