Skip to content

Commit 60101c5

Browse files
bundoleekuishou68
andauthored
fix(node): split CLI streaming from library API to prevent stdout double-write (#473)
Objective: CLI users without --quiet see every output line printed twice. The earlier streaming-flag-on-the-Promise approach (d6cded2 / 7e750d2) fixed CLI behavior at the cost of a library API regression — convert(), when called with quiet undefined, returned an empty string instead of the real stdout. Approach: Separate concerns at the layer boundary. - convert() (library API) is pinned to streamOutput: false. It always returns the full stdout string and never touches the parent process's stdio. No quiet-dependent return shape, no surprise side-effects. - _runForCli() (internal helper, used only by the bundled CLI) is pinned to streamOutput: true. It forwards Java's stdout and stderr to the parent process in real time and resolves to void — so the CLI cannot re-print what was already streamed. - cli.ts now calls _runForCli() and never touches the resolved value, closing the double-write loop at the source. This keeps Java's progress logs flowing in real time on stderr — the property that makes long hybrid runs observable — while the result text streams once on stdout. Evidence: Added a deterministic mock-based unit suite (executeJar.unit.test.ts, 9 tests) and a real-jar integration suite (streaming.integration.test.ts, 3 tests). Manual verification on the bundled CLI: | Scenario | Expected | Actual | |-----------------------------------------|--------------------------------|-------------------------------------------------| | CLI, no --quiet, --to-stdout (1 page) | 459 bytes (single copy) | 459 bytes | | CLI, no --quiet, --to-stdout (14 pages) | 63006 bytes (single copy) | 63006 bytes | | 14-page CLI: stderr progress logs | streamed before stdout | confirmed (28 lines, "Number of pages" present) | | Library: convert(quiet:true) returns | full stdout string | 459 bytes | | Library: convert(quiet:undef) returns | full stdout string | 459 bytes | | Library: process.stdout side-effect | none | none | | Library: process.stderr side-effect | none | none | vitest: 30/30 pass. Fixes #398 Co-authored-by: kuishou68 <54054995+kuishou68@users.noreply.github.com>
1 parent 07d0bdc commit 60101c5

5 files changed

Lines changed: 533 additions & 23 deletions

File tree

node/opendataloader-pdf/src/cli.ts

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env node
22
import { Command, CommanderError } from 'commander';
3-
import { convert } from './index.js';
3+
import { _runForCli } from './index.js';
44
import { CliOptions, buildConvertOptions } from './convert-options.generated.js';
55
import { registerCliOptions } from './cli-options.generated.js';
66

@@ -56,13 +56,9 @@ async function main(): Promise<number> {
5656
const convertOptions = buildConvertOptions(cliOptions);
5757

5858
try {
59-
const output = await convert(inputPaths, convertOptions);
60-
if (output && !convertOptions.quiet) {
61-
process.stdout.write(output);
62-
if (!output.endsWith('\n')) {
63-
process.stdout.write('\n');
64-
}
65-
}
59+
// _runForCli streams stdout/stderr to the parent process as they arrive;
60+
// we deliberately do not re-print anything here. (Issue #398.)
61+
await _runForCli(inputPaths, convertOptions);
6662
return 0;
6763
} catch (err) {
6864
const message = err instanceof Error ? err.message : String(err);

node/opendataloader-pdf/src/index.ts

Lines changed: 84 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { spawn } from 'child_process';
22
import * as path from 'path';
33
import * as fs from 'fs';
4+
import { StringDecoder } from 'string_decoder';
45
import { fileURLToPath } from 'url';
56

67
// Re-export types and utilities from auto-generated file
@@ -15,6 +16,11 @@ const __dirname = path.dirname(__filename);
1516
const JAR_NAME = 'opendataloader-pdf-cli.jar';
1617

1718
interface JarExecutionOptions {
19+
/**
20+
* When true, forwards Java's stdout and stderr chunks to the parent
21+
* process in real time as well as accumulating them. Used by the bundled
22+
* CLI so long-running conversions show progress as it happens.
23+
*/
1824
streamOutput?: boolean;
1925
}
2026

@@ -37,24 +43,55 @@ function executeJar(args: string[], executionOptions: JarExecutionOptions = {}):
3743

3844
let stdout = '';
3945
let stderr = '';
40-
41-
javaProcess.stdout.on('data', (data) => {
42-
const chunk = data.toString();
46+
// StringDecoder buffers incomplete multi-byte UTF-8 sequences across
47+
// chunk boundaries — Buffer.toString() alone would emit replacement
48+
// characters when, e.g., a 3-byte Korean codepoint splits across two
49+
// 'data' events. One decoder per stream so they don't share state.
50+
const stdoutDecoder = new StringDecoder('utf8');
51+
const stderrDecoder = new StringDecoder('utf8');
52+
53+
javaProcess.stdout.on('data', (data: Buffer) => {
54+
const chunk = stdoutDecoder.write(data);
55+
if (chunk.length === 0) return;
4356
if (streamOutput) {
57+
// Stream-only: don't also accumulate, or a multi-hour conversion
58+
// would buffer its entire (potentially gigabyte) stdout in memory
59+
// for no consumer.
4460
process.stdout.write(chunk);
61+
} else {
62+
stdout += chunk;
4563
}
46-
stdout += chunk;
4764
});
4865

49-
javaProcess.stderr.on('data', (data) => {
50-
const chunk = data.toString();
66+
javaProcess.stderr.on('data', (data: Buffer) => {
67+
const chunk = stderrDecoder.write(data);
68+
if (chunk.length === 0) return;
5169
if (streamOutput) {
5270
process.stderr.write(chunk);
5371
}
72+
// stderr is always accumulated (progress logs are small and we need
73+
// them for error messages on non-zero exit).
5474
stderr += chunk;
5575
});
5676

5777
javaProcess.on('close', (code) => {
78+
// Flush any trailing bytes the decoder is still holding (always emit
79+
// them — if we drop them on error paths, error messages with non-ASCII
80+
// characters lose their tail).
81+
const stdoutTail = stdoutDecoder.end();
82+
const stderrTail = stderrDecoder.end();
83+
if (stdoutTail.length > 0) {
84+
if (streamOutput) {
85+
process.stdout.write(stdoutTail);
86+
} else {
87+
stdout += stdoutTail;
88+
}
89+
}
90+
if (stderrTail.length > 0) {
91+
if (streamOutput) process.stderr.write(stderrTail);
92+
stderr += stderrTail;
93+
}
94+
5895
if (code === 0) {
5996
resolve(stdout);
6097
} else {
@@ -80,26 +117,58 @@ function executeJar(args: string[], executionOptions: JarExecutionOptions = {}):
80117
});
81118
}
82119

83-
export function convert(
120+
function buildJarArgs(
84121
inputPaths: string | string[],
85-
options: ConvertOptions = {},
86-
): Promise<string> {
122+
options: ConvertOptions,
123+
): string[] | Error {
87124
const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];
88125
if (inputList.length === 0) {
89-
return Promise.reject(new Error('At least one input path must be provided.'));
126+
return new Error('At least one input path must be provided.');
90127
}
91128

92129
for (const input of inputList) {
93130
if (!fs.existsSync(input)) {
94-
return Promise.reject(new Error(`Input file or folder not found: ${input}`));
131+
return new Error(`Input file or folder not found: ${input}`);
95132
}
96133
}
97134

98-
const args: string[] = [...inputList, ...buildArgs(options)];
135+
return [...inputList, ...buildArgs(options)];
136+
}
99137

100-
return executeJar(args, {
101-
streamOutput: !options.quiet,
102-
});
138+
export function convert(
139+
inputPaths: string | string[],
140+
options: ConvertOptions = {},
141+
): Promise<string> {
142+
const argsOrError = buildJarArgs(inputPaths, options);
143+
if (argsOrError instanceof Error) {
144+
return Promise.reject(argsOrError);
145+
}
146+
// Library API: never streams to the parent process. Returns the full stdout
147+
// string so callers can do `const out = await convert(...)` without surprise
148+
// side-effects on process.stdout / process.stderr.
149+
return executeJar(argsOrError, { streamOutput: false });
150+
}
151+
152+
/**
153+
* Internal entry point used by the bundled CLI. Streams Java's stdout and
154+
* stderr to the parent process in real time (so long-running conversions like
155+
* hybrid mode show progress as it happens) and resolves without a stdout
156+
* payload — preventing the caller from re-printing what was already streamed.
157+
*
158+
* Not part of the public API: do not import this from application code. Use
159+
* {@link convert} instead.
160+
*
161+
* @internal
162+
*/
163+
export async function _runForCli(
164+
inputPaths: string | string[],
165+
options: ConvertOptions = {},
166+
): Promise<void> {
167+
const argsOrError = buildJarArgs(inputPaths, options);
168+
if (argsOrError instanceof Error) {
169+
throw argsOrError;
170+
}
171+
await executeJar(argsOrError, { streamOutput: true });
103172
}
104173

105174
/**

0 commit comments

Comments
 (0)