feat(api): add OutputWriter for two-phase extraction-then-output flow

bundolee · bundolee · commit 42f53dbf56a5 · 2026-04-29T19:43:58.000+09:00
Allows callers that have already run extraction (via DocumentProcessor.extractContents
or AutoTagger.tag) to emit JSON/MD/HTML/PDF/text/images/tagged-PDF outputs from
that same ExtractionResult, without re-parsing the PDF.

The single-call processFile pipeline is unchanged; this is purely an additive
public API. Internally exposes DocumentProcessor.generateOutputs() so the new
api.OutputWriter facade can delegate to it; downstream callers should use
OutputWriter rather than the processor directly.

Use case: opendataloader-pdfua needs both file outputs (JSON/MD/HTML) and an
in-memory tagged document (for accessibility remediation) from the same input.
Previously this required two extraction passes (processFile + AutoTagger.tag);
with OutputWriter the pdfua pipeline can extract once and feed both sinks.
diff --git a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/OutputWriter.java b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/OutputWriter.java
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2025-2026 Hancom Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.opendataloader.pdf.api;
+
+import org.opendataloader.pdf.processors.DocumentProcessor;
+import org.opendataloader.pdf.processors.ExtractionResult;
+
+import java.io.IOException;
+
+/**
+ * Writes configured output files (JSON, Markdown, HTML, PDF, text, images,
+ * tagged PDF) from a pre-computed {@link ExtractionResult}.
+ *
+ * <p>Use this when you have already run extraction once (e.g. via
+ * {@code DocumentProcessor.extractContents}, to feed {@link AutoTagger#tag(String, ExtractionResult)})
+ * and want to emit file outputs from that same result without re-extracting.
+ *
+ * <p>Typical two-phase usage:
+ * <pre>{@code
+ * Config config = new Config();
+ * config.setOutputFolder("/out");
+ * config.setGenerateJSON(true);
+ * config.setGenerateMarkdown(true);
+ *
+ * // Phase 1: extract once
+ * ExtractionResult extraction =
+ *     org.opendataloader.pdf.processors.DocumentProcessor.extractContents(
+ *         "input.pdf", config);
+ *
+ * // Phase 2a: write output files
+ * OutputWriter.writeOutputs("input.pdf", extraction, config);
+ *
+ * // Phase 2b: tag in-memory and reuse the same extraction
+ * try (TaggingResult tagged = AutoTagger.tag("input.pdf", extraction)) {
+ *     // ... use tagged.getDocument()
+ * }
+ * }</pre>
+ *
+ * <p>For the single-call extraction-and-output pipeline, use
+ * {@link OpenDataLoaderPDF#processFile} instead.
+ */
+public final class OutputWriter {
+
+    private OutputWriter() { // static-only facade: the class exposes no instance state or methods
+    }
+
+    /**
+     * Writes the output files configured on {@code config} (e.g. {@code generateJSON},
+     * {@code generateMarkdown}, {@code generateHtml}, {@code generatePDF},
+     * {@code generateTaggedPDF}, {@code generateText}) using the supplied pre-computed extraction.
+     *
+     * <p>This method does <em>not</em> re-run extraction: it hands {@code extraction.getContents()}
+     * and {@code extraction.getElementMetadata()} to {@code DocumentProcessor.generateOutputs}.
+     * Output behaviour is identical to {@link OpenDataLoaderPDF#processFile} for the same
+     * {@link Config}, including stdout mode, image directory resolution, and tagged-PDF generation.
+     *
+     * @param inputPdfName path to the input PDF file (used for filename derivation
+     *                     and tagged-PDF / annotated-PDF re-saves; not re-parsed)
+     * @param extraction   pre-computed extraction result (from
+     *                     {@code DocumentProcessor.extractContents}); must not be {@code null}
+     *                     because it is dereferenced here without a check
+     * @param config       configuration controlling which output formats to emit
+     * @throws IOException if writing any output file fails
+     */
+    public static void writeOutputs(String inputPdfName, ExtractionResult extraction, Config config)
+            throws IOException {
+        DocumentProcessor.generateOutputs(inputPdfName, extraction.getContents(), config,
+                extraction.getElementMetadata());
+    }
+}
diff --git a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java
@@ -383,7 +383,20 @@ private static boolean shouldProcessPage(int pageNumber, Set<Integer> pagesToPro
         return pagesToProcess == null || pagesToProcess.contains(pageNumber);
     }
 
-    private static void generateOutputs(String inputPdfName, List<List<IObject>> contents, Config config,
+    /**
+     * Writes the configured output files (JSON/MD/HTML/PDF/Text/images/tagged PDF)
+     * from already-extracted contents.
+     *
+     * <p><strong>Internal API. Do not call directly.</strong> This method is
+     * {@code public} only so the {@link org.opendataloader.pdf.api.OutputWriter}
+     * facade in the {@code api} package can delegate to it. The signature
+     * (notably the {@code List<List<IObject>>} and
+     * {@code Map<Long, ElementMetadata>} parameters) is an implementation
+     * detail and may change in any release. External callers must use
+     * {@link org.opendataloader.pdf.api.OutputWriter#writeOutputs}, which is
+     * the stable public API.
+     */
+    public static void generateOutputs(String inputPdfName, List<List<IObject>> contents, Config config,
                                            Map<Long, ElementMetadata> elementMetadata) throws IOException {
         // Stdout mode: write primary format to stdout, skip file I/O
         if (config.isOutputStdout()) {