diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/CommandExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/CommandExtractor.java
index 043db122..a4f186a6 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/CommandExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/CommandExtractor.java
@@ -15,26 +15,37 @@
*/
package org.codelibs.fess.crawler.extractor.impl;
-import java.io.BufferedReader;
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayOutputStream;
import java.io.File;
+import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.HashMap;
-import java.util.LinkedList;
import java.util.List;
import java.util.Map;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.ThreadFactory;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.codelibs.core.exception.InterruptedRuntimeException;
-import org.codelibs.core.io.CopyUtil;
import org.codelibs.core.io.FileUtil;
import org.codelibs.core.lang.StringUtil;
-import org.codelibs.core.lang.ThreadUtil;
import org.codelibs.fess.crawler.Constants;
import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
@@ -68,12 +79,46 @@ public class CommandExtractor extends AbstractExtractor {
/** The encoding for the command's output. */
protected String commandOutputEncoding = Charset.defaultCharset().displayName();
- /** The maximum number of lines to buffer from command output. */
+ /**
+ * The (formerly) maximum number of lines to buffer from command output.
+ *
+ * @deprecated The line-count cap has been removed in favor of a byte-count cap.
+ * This field is no longer consulted; use {@link #setMaxOutputSize(long)}
+ * to bound how many bytes are read from stdout/stderr. The field is
+ * retained only for source/binary compatibility with callers that set
+ * it via {@link #setMaxOutputLine(int)} or via reflection.
+ */
+ @Deprecated
protected int maxOutputLine = 1000;
/** Whether to redirect standard output to a file. */
protected boolean standardOutput = false;
+ /** Maximum bytes the subprocess is allowed to write to a single drained stream (stdout/stderr). */
+ protected long maxOutputSize = 10L * 1024L * 1024L; // 10 MiB
+
+ /** Whether {@link #setMaxOutputSize(long)} has been called explicitly, preventing {@link #setMaxOutputLine(int)} from overriding it. */
+ private boolean maxOutputSizeExplicit = false;
+
+ /** Maximum bytes copied from the input stream into the temporary input file. */
+ protected long maxInputSize = 100L * 1024L * 1024L; // 100 MiB
+
+ /** Grace period (ms) given to a process after destroy() before destroyForcibly() is invoked. */
+ protected long destroyGracePeriodMillis = 2000L;
+
+ /**
+ * Whether to append captured stderr text to the extracted content when
+ * {@link #standardOutput} is {@code false}. Defaults to {@code false} to
+ * match pre-3.x behavior where stderr was only routed to logs, never to
+ * extracted text. (The original {@code ProcessBuilder.redirectErrorStream(true)}
+ * call only merged the streams for log draining; it never caused stderr to
+ * appear in {@code ExtractData}.)
+ *
+ *
Set to {@code true} to append captured stderr after the file content in
+ * the extracted body when {@code standardOutput=false}.
+ */
+ protected boolean includeStderrInOutput = false;
+
/**
* Constructs a new CommandExtractor.
*/
@@ -115,6 +160,18 @@ public ExtractData getText(final InputStream in, final Map param
filePrefix = "none";
extention = "";
}
+ // Sanitize prefix and extension to prevent argument-injection via crafted resourceName
+ // when downstream commands re-expand args through a shell wrapper.
+ filePrefix = filePrefix.replaceAll("[^A-Za-z0-9._-]", "_");
+ if (filePrefix.startsWith("-")) {
+ filePrefix = "_" + filePrefix;
+ }
+ if (!extention.isEmpty()) {
+ extention = extention.replaceAll("[^A-Za-z0-9]", "");
+ if (extention.length() > 16) {
+ extention = extention.substring(0, 16);
+ }
+ }
File inputFile = null;
File outputFile = null;
try {
@@ -132,35 +189,76 @@ public ExtractData getText(final InputStream in, final Map param
}
outputFile = createTempFile("cmdextout_" + filePrefix + "_", ext, tempDir);
- // store to a file
- CopyUtil.copy(in, inputFile);
+ // store to a file (bounded by maxInputSize)
+ copyToFileBounded(in, inputFile, maxInputSize);
+
+ final String stderrText = executeCommand(inputFile, outputFile);
- executeCommand(inputFile, outputFile);
+ // For standardOutput=false this is the only guard against the external command writing
+ // an unbounded $OUTPUT_FILE — bounded readers cover stdout/stderr only.
+ if (outputFile.length() > maxOutputSize) {
+ logger.warn("output file size exceeded limit: size={} limit={} command={}", outputFile.length(), maxOutputSize, command);
+ throw new ExtractException("output file size exceeded limit: limit=" + maxOutputSize);
+ }
- final ExtractData extractData = new ExtractData(new String(FileUtil.readBytes(outputFile), outputEncoding));
+ final StringBuilder contentBuf = new StringBuilder();
+ contentBuf.append(new String(FileUtil.readBytes(outputFile), outputEncoding));
+ // For backward compatibility with the legacy implementation that used
+ // ProcessBuilder.redirectErrorStream(true) when standardOutput=false,
+ // append captured stderr text to the extracted content.
+ if (!standardOutput && includeStderrInOutput && StringUtil.isNotEmpty(stderrText)) {
+ contentBuf.append(stderrText);
+ }
+ final ExtractData extractData = new ExtractData(contentBuf.toString());
if (StringUtil.isNotBlank(resourceName)) {
extractData.putValues("resourceName", new String[] { resourceName });
}
return extractData;
} catch (final IOException e) {
- throw new ExtractException("Could not extract a content.", e);
+ throw new ExtractException("Could not extract content: resourceName=" + resourceName + " command=" + command, e);
} finally {
FileUtil.deleteInBackground(inputFile);
FileUtil.deleteInBackground(outputFile);
}
}
+ /**
+ * Copies an input stream into the destination file but stops and throws if the
+ * number of bytes read exceeds {@code limit}.
+ *
+ * @param in the input stream to read from
+ * @param dest the destination file
+ * @param limit the maximum number of bytes allowed
+ * @throws IOException if an I/O error occurs
+ * @throws ExtractException if the input stream exceeds {@code limit}
+ */
+ protected void copyToFileBounded(final InputStream in, final File dest, final long limit) throws IOException {
+ long total = 0L;
+ final byte[] buffer = new byte[8192];
+ try (BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(dest))) {
+ int n;
+ while ((n = in.read(buffer)) != -1) {
+ total += n;
+ if (total > limit) {
+ logger.warn("input size exceeded limit: limit={} command={}", limit, command);
+ throw new ExtractException("input size exceeded limit: limit=" + limit);
+ }
+ out.write(buffer, 0, n);
+ }
+ }
+ }
+
String getFileName(final String resourceName) {
- final String name = resourceName.replaceAll("/+$", "");
- final int pos = name.lastIndexOf('/');
+ final String name = resourceName.replaceAll("[/\\\\]+$", "");
+ final int pos = Math.max(name.lastIndexOf('/'), name.lastIndexOf('\\'));
if (pos >= 0) {
return name.substring(pos + 1);
}
return name;
}
- private void executeCommand(final File inputFile, final File outputFile) {
+ private String executeCommand(final File inputFile, final File outputFile) {
if (StringUtil.isBlank(command)) {
throw new CrawlerSystemException("External command is empty. Please configure a valid command for CommandExtractor.");
@@ -172,74 +270,285 @@ private void executeCommand(final File inputFile, final File outputFile) {
final List cmdList = parseCommand(command, params);
if (logger.isInfoEnabled()) {
- logger.info("Command: {}", cmdList);
+ logger.info("executing command: command={}", cmdList);
}
final ProcessBuilder pb = new ProcessBuilder(cmdList);
if (workingDirectory != null) {
pb.directory(workingDirectory);
}
- if (standardOutput) {
- pb.redirectOutput(outputFile);
- } else {
- pb.redirectErrorStream(true);
- }
+ // Do not use pb.redirectOutput(outputFile) for standardOutput=true: OS-level redirection
+ // bypasses the BoundedStreamReader, allowing unbounded writes. Instead, pipe stdout through
+ // a BoundedFileWriter that enforces maxOutputSize.
+
+ // Note: do not redirect error stream; we want stderr drained separately always.
Process currentProcess = null;
- MonitorThread mt = null;
+ ExecutorService streamPool = null;
+ Charset outCharset;
+ try {
+ outCharset = Charset.forName(commandOutputEncoding);
+ } catch (final IllegalCharsetNameException | UnsupportedCharsetException e) {
+ logger.warn("invalid commandOutputEncoding, falling back to UTF-8: encoding={} command={}", commandOutputEncoding, command, e);
+ outCharset = StandardCharsets.UTF_8;
+ }
+
+ final AtomicBoolean shuttingDown = new AtomicBoolean(false);
try {
currentProcess = pb.start();
- // monitoring
- mt = new MonitorThread(currentProcess, executionTimeout);
- mt.start();
+ streamPool = Executors.newFixedThreadPool(2, new DaemonThreadFactory("CommandExtractor-stream"));
- final InputStreamThread it = new InputStreamThread(currentProcess.getInputStream(), commandOutputEncoding, maxOutputLine);
- it.start();
+ final Process processRef = currentProcess;
+ final Charset charset = outCharset;
+ final long limit = maxOutputSize;
+ final AtomicBoolean overflowFlag = new AtomicBoolean(false);
+ final Future stdoutFuture;
+ if (standardOutput) {
+ // Drain stdout into outputFile with a byte-count bound to honour maxOutputSize.
+ stdoutFuture = streamPool.submit(
+ new BoundedFileWriter(processRef.getInputStream(), outputFile, limit, "stdout", overflowFlag, shuttingDown));
+ } else {
+ stdoutFuture = streamPool
+ .submit(new BoundedStreamReader(processRef.getInputStream(), charset, limit, "stdout", overflowFlag, shuttingDown));
+ }
+ final Future stderrFuture = streamPool
+ .submit(new BoundedStreamReader(processRef.getErrorStream(), charset, limit, "stderr", overflowFlag, shuttingDown));
- currentProcess.waitFor();
- it.join(5000);
+ // Poll for process exit so that an overflow detected by a reader thread can
+ // break out immediately rather than blocking for the full executionTimeout.
+ final long deadline = System.currentTimeMillis() + executionTimeout;
+ boolean exited = false;
+ while (System.currentTimeMillis() < deadline) {
+ if (overflowFlag.get()) {
+ // Reader thread detected overflow; kill the process tree immediately.
+ break;
+ }
+ if (currentProcess.waitFor(200, TimeUnit.MILLISECONDS)) {
+ exited = true;
+ break;
+ }
+ }
- if (mt.isTeminated()) {
- throw new ExecutionTimeoutException("The command execution is timeout: " + cmdList);
+ if (!exited) {
+ if (overflowFlag.get()) {
+ // Overflow path: kill the process tree, then surface the reader's exception.
+ shuttingDown.set(true);
+ destroyProcessTree(currentProcess);
+ // Collect exceptions from both futures with a short timeout.
+ for (final Future f : List.of(stdoutFuture, stderrFuture)) {
+ try {
+ f.get(2, TimeUnit.SECONDS);
+ } catch (final ExecutionException ee) {
+ final Throwable cause = ee.getCause();
+ if (cause instanceof OutputSizeExceededException) {
+ logger.warn("command output exceeded limit: limit={} command={}", maxOutputSize, cmdList);
+ throw new ExtractException("command output exceeded limit: limit=" + maxOutputSize, cause);
+ }
+ logger.warn("unexpected error draining stream during overflow handling: command={}", cmdList, cause);
+ throw new CrawlerSystemException("Failed to drain command output during overflow.", cause);
+ } catch (final TimeoutException te) {
+ // overflow already detected; surface the ExtractException below
+ } catch (final InterruptedException ie) {
+ Thread.currentThread().interrupt();
+ break;
+ }
+ }
+ // guard: overflow flag set but no overflow exception observed
+ throw new ExtractException("command output exceeded limit: limit=" + maxOutputSize);
+ } else {
+ // Timeout path.
+ logger.warn("command timed out: timeout={}ms command={}", executionTimeout, cmdList);
+ shuttingDown.set(true);
+ destroyProcessTree(currentProcess);
+ throw new ExecutionTimeoutException("The command execution is timeout: " + cmdList);
+ }
+ }
+
+ // Process exited; collect drained output. Use a short timeout to avoid waiting forever
+ // if the reader threads are stuck (they shouldn't be since the streams are now closed).
+ String stdoutText = "";
+ String stderrText = "";
+ try {
+ stdoutText = stdoutFuture.get(5, TimeUnit.SECONDS);
+ stderrText = stderrFuture.get(5, TimeUnit.SECONDS);
+ } catch (final TimeoutException te) {
+ logger.warn("timed out collecting drained output: command={}", cmdList);
+ stdoutFuture.cancel(true);
+ stderrFuture.cancel(true);
+ destroyProcessTree(currentProcess);
+ throw new ExtractException("Failed to drain command output within 5s: command=" + cmdList, te);
+ } catch (final ExecutionException ee) {
+ final Throwable cause = ee.getCause();
+ if (cause instanceof OutputSizeExceededException) {
+ logger.warn("command output exceeded limit: limit={} command={}", maxOutputSize, cmdList);
+ shuttingDown.set(true);
+ destroyProcessTree(currentProcess);
+ throw new ExtractException("command output exceeded limit: limit=" + maxOutputSize, cause);
+ }
+ if (cause instanceof IOException) {
+ throw new CrawlerSystemException("Failed to drain command output.", cause);
+ }
+ throw new CrawlerSystemException("Failed to drain command output.", ee);
}
final int exitValue = currentProcess.exitValue();
+ if (exitValue != 0) {
+ logger.warn("command exited non-zero: exitCode={} command={}", exitValue, cmdList);
+ }
if (logger.isInfoEnabled()) {
if (standardOutput) {
- logger.info("Exit Code: {}", exitValue);
+ logger.info("command exited: exitCode={}", exitValue);
} else {
- logger.info("Exit Code: {} - Process Output:\n{}", exitValue, it.getOutput());
+ logger.info("command exited: exitCode={} stdout={}", exitValue, truncateForLog(stdoutText));
+ }
+ if (StringUtil.isNotEmpty(stderrText)) {
+ logger.info("command stderr: stderr={}", truncateForLog(stderrText));
}
}
- if (exitValue == 143 && mt.isTeminated()) {
- throw new ExecutionTimeoutException("The command execution is timeout: " + cmdList);
- }
+ return stderrText;
} catch (final CrawlerSystemException e) {
throw e;
} catch (final InterruptedException e) {
- if (mt != null && mt.isTeminated()) {
- throw new ExecutionTimeoutException("The command execution is timeout: " + cmdList, e);
+ Thread.currentThread().interrupt();
+ if (currentProcess != null) {
+ shuttingDown.set(true);
+ destroyProcessTree(currentProcess);
}
throw new InterruptedRuntimeException(e);
} catch (final Exception e) {
- throw new CrawlerSystemException("Process terminated.", e);
+ logger.warn("unexpected error executing command: command={}", cmdList, e);
+ throw new CrawlerSystemException("Failed to execute command: " + cmdList, e);
} finally {
- if (mt != null) {
- mt.setFinished(true);
+ if (currentProcess != null && currentProcess.isAlive()) {
+ shuttingDown.set(true);
+ destroyProcessTree(currentProcess);
+ }
+ if (streamPool != null) {
+ streamPool.shutdownNow();
try {
- mt.interrupt();
- } catch (final Exception e) {}
+ if (!streamPool.awaitTermination(1, TimeUnit.SECONDS)) {
+ logger.warn("stream pool did not terminate within 1s: command={}", cmdList);
+ }
+ } catch (final InterruptedException ie) {
+ Thread.currentThread().interrupt();
+ }
}
- if (currentProcess != null) {
+ }
+ }
+
+ /**
+ * Sends SIGTERM, waits a grace period, then SIGKILL to the process and any descendants.
+ * Descendants are snapshotted before sending SIGTERM so that children reparented to
+ * PID 1 after the parent exits are still reachable.
+ *
+ * @param process the process to terminate
+ */
+ protected void destroyProcessTree(final Process process) {
+ if (process == null) {
+ return;
+ }
+ // Snapshot descendants before sending SIGTERM. If the parent shell receives SIGTERM and
+ // exits gracefully its children may be reparented to PID 1, making them invisible via
+ // process.descendants() after the fact. Taking the snapshot first ensures we still reach them.
+ final List descendantSnapshot;
+ try {
+ descendantSnapshot = process.descendants().collect(Collectors.toList());
+ } catch (final Exception e) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("Failed to snapshot descendants.", e);
+ }
+ // Fall back to an empty list; we will still kill the parent below.
+ try {
+ process.destroy();
+ } catch (final Exception ex) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("destroy step failed: pid={}", process == null ? -1 : process.pid(), ex);
+ }
+ }
+ try {
+ process.destroyForcibly();
+ } catch (final Exception ex) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("destroy step failed: pid={}", process == null ? -1 : process.pid(), ex);
+ }
+ }
+ return;
+ }
+ try {
+ process.destroy();
+ } catch (final Exception ex) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("destroy step failed: pid={}", process == null ? -1 : process.pid(), ex);
+ }
+ }
+ try {
+ process.waitFor(destroyGracePeriodMillis, TimeUnit.MILLISECONDS);
+ } catch (final InterruptedException ie) {
+ Thread.currentThread().interrupt();
+ }
+ // Forcibly kill parent and all snapshotted descendants regardless of whether the parent
+ // exited cleanly during the grace period.
+ for (final ProcessHandle h : descendantSnapshot) {
+ try {
+ if (h.isAlive()) {
+ h.destroyForcibly();
+ }
+ } catch (final Exception ex) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("destroy step failed: pid={}", h.pid(), ex);
+ }
+ }
+ }
+ try {
+ process.destroyForcibly();
+ } catch (final Exception ex) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("destroy step failed: pid={}", process == null ? -1 : process.pid(), ex);
+ }
+ }
+ // Bounded re-scan loop to catch fork-after-snapshot.
+ for (int attempt = 0; attempt < 3; attempt++) {
+ final List live;
+ try {
+ live = process.descendants().filter(ProcessHandle::isAlive).collect(Collectors.toList());
+ } catch (final Exception e) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("rescan failed: pid={}", process.pid(), e);
+ }
+ break;
+ }
+ if (live.isEmpty()) {
+ break;
+ }
+ for (final ProcessHandle h : live) {
try {
- currentProcess.destroy();
- } catch (final Exception e) {}
+ h.destroyForcibly();
+ } catch (final Exception e) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("destroy descendant failed: pid={}", h.pid(), e);
+ }
+ }
}
- currentProcess = null;
+ try {
+ Thread.sleep(50);
+ } catch (final InterruptedException ie) {
+ Thread.currentThread().interrupt();
+ break;
+ }
+ }
+ }
+ private static String truncateForLog(final String s) {
+ if (s == null) {
+ return "";
+ }
+ final int max = 4000;
+ if (s.length() <= max) {
+ return s;
}
+ return s.substring(0, max) + "... [truncated]";
}
/**
@@ -301,119 +610,304 @@ private String getCommandValue(final String key, final Map param
}
/**
- * Thread to monitor and terminate processes that exceed the timeout.
+ * Daemon thread factory for stream-drain workers.
*/
- protected static class MonitorThread extends Thread {
- private final Process process;
+ protected static class DaemonThreadFactory implements ThreadFactory {
+ private final String namePrefix;
+ private final AtomicInteger counter = new AtomicInteger();
- private final long timeout;
+ /**
+ * Constructs a new DaemonThreadFactory.
+ *
+ * @param namePrefix the prefix used to name threads created by this factory
+ */
+ public DaemonThreadFactory(final String namePrefix) {
+ this.namePrefix = namePrefix;
+ }
- private boolean finished = false;
+ @Override
+ public Thread newThread(final Runnable r) {
+ final Thread t = new Thread(r, namePrefix + "-" + counter.incrementAndGet());
+ t.setDaemon(true);
+ return t;
+ }
+ }
- private boolean teminated = false;
+ /**
+ * Reads a stream fully (up to {@code limit} bytes) and returns it as a String.
+ * If the stream produces more than {@code limit} bytes, {@code overflowFlag} is
+ * set to {@code true} and an {@link OutputSizeExceededException} is thrown so
+ * that the main thread can detect the overflow without waiting for the full
+ * {@code executionTimeout}.
+ */
+ protected static class BoundedStreamReader implements Callable {
+ private final InputStream stream;
+ private final Charset charset;
+ private final long limit;
+ private final String streamName;
+ private final AtomicBoolean overflowFlag;
+ private final AtomicBoolean shuttingDown;
/**
- * Constructs a new MonitorThread.
- * @param process The process to monitor.
- * @param timeout The timeout for the process.
+ * Constructs a new BoundedStreamReader.
+ *
+ * @param stream the input stream to drain
+ * @param charset the charset used to decode the bytes
+ * @param limit the maximum number of bytes accepted before the process is killed
+ * @param streamName a label used in error messages and logs
+ * @param overflowFlag shared flag that is set to {@code true} just before
+ * {@link OutputSizeExceededException} is thrown, allowing the main
+ * thread's polling loop to break early and kill the process tree
+ * @param shuttingDown shared flag indicating the main thread is in the process of killing
+ * the subprocess; IOExceptions observed while this flag is set are
+ * logged at debug level rather than surfaced as ExecutionExceptions
*/
- public MonitorThread(final Process process, final long timeout) {
- this.process = process;
- this.timeout = timeout;
+ public BoundedStreamReader(final InputStream stream, final Charset charset, final long limit, final String streamName,
+ final AtomicBoolean overflowFlag, final AtomicBoolean shuttingDown) {
+ this.stream = stream;
+ this.charset = charset;
+ this.limit = limit;
+ this.streamName = streamName;
+ this.overflowFlag = overflowFlag;
+ this.shuttingDown = shuttingDown;
}
@Override
- public void run() {
- ThreadUtil.sleepQuietly(timeout);
+ public String call() throws IOException {
+ final byte[] buf = new byte[8192];
+ final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ long total = 0L;
+ try (InputStream is = stream) {
+ int n;
+ while ((n = is.read(buf)) != -1) {
+ total += n;
+ if (total > limit) {
+ // Signal the main thread's polling loop before throwing so it can
+ // break immediately and kill the process tree rather than blocking
+ // for the remainder of executionTimeout.
+ overflowFlag.set(true);
+ throw new OutputSizeExceededException(
+ "command output exceeded limit on " + streamName + ": limit=" + limit + " bytes");
+ }
+ baos.write(buf, 0, n);
+ }
+ } catch (final OutputSizeExceededException e) {
+ throw e;
+ } catch (final IOException ioe) {
+ if (shuttingDown.get()) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("stream closed during shutdown: stream={}", streamName, ioe);
+ }
+ } else {
+ logger.warn("unexpected I/O error on stream: stream={} drainedBytes={}", streamName, total, ioe);
+ throw ioe;
+ }
+ }
+ return new String(baos.toByteArray(), charset);
+ }
+ }
- if (!finished) {
- try {
- process.destroy();
- teminated = true;
- } catch (final Exception e) {
- if (logger.isInfoEnabled()) {
- logger.info("Could not kill the subprocess.", e);
+ /**
+ * Internal signal that the subprocess produced more output than allowed.
+ */
+ protected static class OutputSizeExceededException extends IOException {
+ private static final long serialVersionUID = 1L;
+
+ /**
+ * Constructs a new OutputSizeExceededException.
+ *
+ * @param message the detail message
+ */
+ public OutputSizeExceededException(final String message) {
+ super(message);
+ }
+ }
+
+ /**
+ * Copies bytes from an {@link InputStream} to a {@link File} up to {@code limit} bytes.
+ * When the limit is exceeded an {@link OutputSizeExceededException} is thrown so that the
+ * caller can invoke {@link CommandExtractor#destroyProcessTree(Process)} to terminate the
+ * subprocess and its descendants. The return value is always an empty string; the written
+ * content is in the output file.
+ */
+ protected static class BoundedFileWriter implements Callable {
+ private final InputStream stream;
+ private final File outputFile;
+ private final long limit;
+ private final String streamName;
+ private final AtomicBoolean overflowFlag;
+ private final AtomicBoolean shuttingDown;
+
+ /**
+ * Constructs a new BoundedFileWriter.
+ *
+ * @param stream the input stream to drain
+ * @param outputFile the file to write output to
+ * @param limit the maximum number of bytes accepted before the process is killed
+ * @param streamName a label used in error messages and logs
+ * @param overflowFlag shared flag set to {@code true} just before
+ * {@link OutputSizeExceededException} is thrown so the main thread's
+ * polling loop breaks out early instead of blocking until executionTimeout
+ * @param shuttingDown shared flag indicating the main thread is in the process of killing
+ * the subprocess; IOExceptions observed while this flag is set are
+ * logged at debug level rather than surfaced as ExecutionExceptions
+ */
+ public BoundedFileWriter(final InputStream stream, final File outputFile, final long limit, final String streamName,
+ final AtomicBoolean overflowFlag, final AtomicBoolean shuttingDown) {
+ this.stream = stream;
+ this.outputFile = outputFile;
+ this.limit = limit;
+ this.streamName = streamName;
+ this.overflowFlag = overflowFlag;
+ this.shuttingDown = shuttingDown;
+ }
+
+ @Override
+ public String call() throws IOException {
+ final byte[] buf = new byte[8192];
+ long total = 0L;
+ try (InputStream is = stream; BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(outputFile))) {
+ int n;
+ while ((n = is.read(buf)) != -1) {
+ total += n;
+ if (total > limit) {
+ overflowFlag.set(true);
+ throw new OutputSizeExceededException(
+ "command output exceeded limit on " + streamName + ": limit=" + limit + " bytes");
}
+ out.write(buf, 0, n);
+ }
+ } catch (final OutputSizeExceededException e) {
+ throw e;
+ } catch (final IOException ioe) {
+ if (shuttingDown.get()) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("stream closed during shutdown: stream={}", streamName, ioe);
+ }
+ } else {
+ logger.warn("unexpected I/O error on stream: stream={} drainedBytes={}", streamName, total, ioe);
+ throw ioe;
}
}
+ return "";
+ }
+ }
+
+ /**
+ * Legacy thread that used to monitor and terminate processes exceeding the
+ * timeout.
+ *
+ * @deprecated The timeout/kill machinery is now handled inline by
+ * {@link CommandExtractor#executeCommand(File, File)} using
+ * {@link Process#waitFor(long, TimeUnit)} and
+ * {@link CommandExtractor#destroyProcessTree(Process)}. This class is
+ * unused internally and is retained only as an empty stub so that
+ * existing third-party subclasses or callers that referenced
+ * {@code CommandExtractor.MonitorThread} continue to compile and
+ * link. New code should not extend or instantiate it.
+ */
+ @Deprecated
+ protected static class MonitorThread extends Thread {
+
+ /**
+ * Constructs a new MonitorThread.
+ *
+ * @param process the process to monitor (ignored; retained for source compat)
+ * @param timeout the timeout (ignored; retained for source compat)
+ */
+ @Deprecated
+ public MonitorThread(final Process process, final long timeout) {
+ // NOP - retained only for source compatibility.
+ }
+
+ /**
+ * No-op stub. Subclasses cannot override this — it is intentionally final to make
+ * legacy override attempts compile-fail (rather than silently no-op).
+ */
+ @Deprecated
+ @Override
+ public final void run() {
+ // NOP
}
/**
* Sets the finished flag.
- * @param finished The finished flag to set.
+ *
+ * Subclasses cannot override this — it is intentionally final to make legacy
+ * override attempts compile-fail (rather than silently no-op).
+ *
+ * @param finished the finished flag (ignored)
*/
- public void setFinished(final boolean finished) {
- this.finished = finished;
+ @Deprecated
+ public final void setFinished(final boolean finished) {
+ // NOP - retained only for source compatibility.
}
/**
* Returns whether the process was terminated.
- * @return true if terminated, false otherwise.
+ *
+ *
Subclasses cannot override this — it is intentionally final to make legacy
+ * override attempts compile-fail (rather than silently no-op).
+ *
+ *
Note: the typo is preserved intentionally for binary compatibility with the
+ * legacy API.
+ *
+ * @return always {@code false}; this stub never terminates anything.
*/
- public boolean isTeminated() {
- return teminated;
+ @Deprecated
+ public final boolean isTeminated() {
+ return false;
}
}
/**
- * Thread to read and buffer output from an input stream.
+ * Legacy thread that used to read and buffer output from an input stream.
+ *
+ * @deprecated Stream draining is now performed by
+ * {@link CommandExtractor.BoundedStreamReader} which is byte-bounded
+ * rather than line-bounded. This class is unused internally and is
+ * retained only as an empty stub so that existing third-party
+ * subclasses or callers that referenced
+ * {@code CommandExtractor.InputStreamThread} continue to compile and
+ * link. New code should not extend or instantiate it.
*/
+ @Deprecated
protected static class InputStreamThread extends Thread {
- private BufferedReader br;
-
- private final List list = new LinkedList<>();
-
- private final int maxLineBuffer;
-
/**
* Constructs a new InputStreamThread.
- * @param is The InputStream to read from.
- * @param charset The charset to use for reading.
- * @param maxOutputLineBuffer The maximum number of lines to buffer.
+ *
+ * @param is the input stream (ignored; retained for source compat)
+ * @param charset the charset (ignored; retained for source compat)
+ * @param maxOutputLineBuffer the line buffer size (ignored; retained for source compat)
*/
+ @Deprecated
public InputStreamThread(final InputStream is, final String charset, final int maxOutputLineBuffer) {
- try {
- br = new BufferedReader(new InputStreamReader(is, charset));
- } catch (final UnsupportedEncodingException e) {
- br = new BufferedReader(new InputStreamReader(is, Constants.UTF_8_CHARSET));
- }
- maxLineBuffer = maxOutputLineBuffer;
+ // NOP - retained only for source compatibility.
}
+ /**
+ * No-op stub. Subclasses cannot override this — it is intentionally final to make
+ * legacy override attempts compile-fail (rather than silently no-op).
+ */
+ @Deprecated
@Override
- public void run() {
- for (;;) {
- try {
- final String line = br.readLine();
- if (line == null) {
- break;
- }
- if (logger.isDebugEnabled()) {
- logger.debug(line);
- }
- list.add(line);
- if (list.size() > maxLineBuffer) {
- list.remove(0);
- }
- } catch (final IOException e) {
- throw new CrawlerSystemException(e);
- }
- }
+ public final void run() {
+ // NOP
}
/**
- * Returns the output as a String.
- * @return The output string.
+ * Returns the buffered output as a String.
+ *
+ * Subclasses cannot override this — it is intentionally final to make legacy
+ * override attempts compile-fail (rather than silently no-op).
+ *
+ * @return always an empty string; this stub never reads anything.
*/
- public String getOutput() {
- final StringBuilder buf = new StringBuilder(100);
- for (final String value : list) {
- buf.append(value).append("\n");
- }
- return buf.toString();
+ @Deprecated
+ public final String getOutput() {
+ return "";
}
-
}
/**
@@ -473,11 +967,25 @@ public void setCommandOutputEncoding(final String commandOutputEncoding) {
}
/**
- * Sets the maximum number of output lines to process.
+ * Sets the maximum number of output lines to buffer.
+ *
+ *
This setter is deprecated; use {@link #setMaxOutputSize(long)} instead.
+ * For backward compatibility, calling this method conservatively shrinks
+ * {@link #maxOutputSize} to {@code max(64 KiB, n * 1024)} bytes (1 line ≈ 1 KiB),
+ * but only when {@link #setMaxOutputSize(long)} has not been called
+ * explicitly on this instance. When both setters are called, the explicit
+ * {@link #setMaxOutputSize(long)} value always wins regardless of call order.
+ *
* @param maxOutputLine The maximum output lines to set.
+ * @deprecated Use {@link #setMaxOutputSize(long)} to set the byte-count bound directly.
*/
+ @Deprecated
public void setMaxOutputLine(final int maxOutputLine) {
this.maxOutputLine = maxOutputLine;
+ final long inferred = Math.max(64L * 1024L, (long) maxOutputLine * 1024L);
+ if (!maxOutputSizeExplicit) {
+ this.maxOutputSize = Math.min(this.maxOutputSize, inferred);
+ }
}
/**
@@ -487,4 +995,69 @@ public void setMaxOutputLine(final int maxOutputLine) {
public void setStandardOutput(final boolean standardOutput) {
this.standardOutput = standardOutput;
}
+
+ /**
+ * Sets the maximum number of bytes the subprocess may write to a single stream
+ * (stdout or stderr) before the process is killed. Calling this method marks the
+ * size as explicitly set, preventing a subsequent {@link #setMaxOutputLine(int)}
+ * call from overriding it.
+ *
+ *
Note: the entire output may be loaded into memory during extraction
+ * (transient 2–4× heap usage). Values above 512 MiB log a warning.
+ *
+ * @param maxOutputSize the limit in bytes
+ */
+ public void setMaxOutputSize(final long maxOutputSize) {
+ if (maxOutputSize > 512L * 1024L * 1024L) {
+ logger.warn("maxOutputSize is large; extraction may transiently allocate 2-4x this in heap: maxOutputSize={}", maxOutputSize);
+ }
+ this.maxOutputSize = maxOutputSize;
+ this.maxOutputSizeExplicit = true;
+ }
+
+ /**
+ * Returns the current maximum output size limit in bytes.
+ *
+ * @return the maximum number of bytes the subprocess may write to a single stream
+ */
+ public long getMaxOutputSize() {
+ return maxOutputSize;
+ }
+
+ /**
+ * Sets the maximum number of bytes copied from the input stream into the
+ * temporary input file. If the input exceeds this size an {@link ExtractException}
+ * is thrown before the command is invoked.
+ *
+ * @param maxInputSize the limit in bytes
+ */
+ public void setMaxInputSize(final long maxInputSize) {
+ this.maxInputSize = maxInputSize;
+ }
+
+ /**
+ * Sets the grace period in milliseconds between {@code Process.destroy()} (SIGTERM)
+ * and the fallback {@code Process.destroyForcibly()} (SIGKILL).
+ *
+ * @param destroyGracePeriodMillis the grace period
+ */
+ public void setDestroyGracePeriodMillis(final long destroyGracePeriodMillis) {
+ this.destroyGracePeriodMillis = destroyGracePeriodMillis;
+ }
+
+ /**
+ * Sets whether captured stderr text should be appended to the extracted content
+ * when {@link #standardOutput} is {@code false}. Defaults to {@code false} to
+ * match pre-3.x behavior where stderr was only logged, never included in
+ * {@code ExtractData}.
+ *
+ *
When set to {@code true} and {@code standardOutput=false}, captured stderr
+ * text is appended after the file content in the extracted body.
+ *
+ * @param includeStderrInOutput {@code true} to append stderr to the extracted
+ * content; {@code false} to keep the file content only
+ */
+ public void setIncludeStderrInOutput(final boolean includeStderrInOutput) {
+ this.includeStderrInOutput = includeStderrInOutput;
+ }
}
diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/CommandExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/CommandExtractorTest.java
index f26ec320..bc2dae8d 100644
--- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/CommandExtractorTest.java
+++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/CommandExtractorTest.java
@@ -15,19 +15,25 @@
*/
package org.codelibs.fess.crawler.extractor.impl;
+import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import org.apache.commons.lang3.SystemUtils;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
import org.codelibs.core.exception.IORuntimeException;
import org.codelibs.core.io.FileUtil;
import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.exception.ExecutionTimeoutException;
-import org.junit.jupiter.api.Test;
+import org.codelibs.fess.crawler.exception.ExtractException;
import org.dbflute.utflute.core.PlainTestCase;
+import org.junit.jupiter.api.Test;
/**
* @author shinsuke
@@ -35,6 +41,8 @@
*/
public class CommandExtractorTest extends PlainTestCase {
+ private static final Logger logger = LogManager.getLogger(CommandExtractorTest.class);
+
private File createScriptTempFile(final int sleep) {
String extention;
String content;
@@ -164,9 +172,11 @@ public void test_getText_timeout() throws IOException {
extractor.command = getCommand(scriptFile);
final Map params = new HashMap();
try {
- final ExtractData data = extractor.getText(new FileInputStream(contentFile), params);
+ extractor.getText(new FileInputStream(contentFile), params);
fail();
- } catch (final ExecutionTimeoutException e) {}
+ } catch (final ExecutionTimeoutException e) {
+ // expected
+ }
}
@Test
@@ -274,4 +284,669 @@ public void test_getFileName() {
assertEquals(expected, actual);
}
+
+ // ==========================================================
+ // PR-D: robustness tests
+ // ==========================================================
+
+ /**
+ * Verifies a long-running subprocess is killed when the timeout fires
+ * and the call returns within a few seconds (i.e. we don't wait the
+ * full sleep duration).
+ */
+ @Test
+ public void test_timeout_killsProcess() throws IOException {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return; // skip on non-Unix
+ }
+ final CommandExtractor extractor = new CommandExtractor();
+ extractor.command = "sh -c \"sleep 30\"";
+ extractor.executionTimeout = 500L;
+
+ final long start = System.currentTimeMillis();
+ try {
+ extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>());
+ fail();
+ } catch (final ExecutionTimeoutException e) {
+ // expected
+ }
+ final long elapsed = System.currentTimeMillis() - start;
+ assertTrue(elapsed < 5000L);
+ }
+
+ /**
+ * Verifies that when the subprocess produces more bytes on stderr than
+ * {@code maxOutputSize} the call fails fast with an {@link ExtractException}
+ * rather than buffering an unbounded amount of data.
+ */
+ @Test
+ public void test_outputSizeExceeded_throwsExtractException() throws IOException {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ final CommandExtractor extractor = new CommandExtractor();
+ // print 5 MiB of zeros to stderr; cap at 1 MiB.
+ extractor.command = "sh -c \"head -c 5242880 /dev/zero 1>&2\"";
+ extractor.executionTimeout = 30_000L;
+ extractor.maxOutputSize = 1024L * 1024L; // 1 MiB
+
+ final long start = System.currentTimeMillis();
+ try {
+ extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>());
+ fail();
+ } catch (final ExtractException e) {
+ // expected
+ }
+ final long elapsed = System.currentTimeMillis() - start;
+ if (elapsed >= 5000L) {
+ logger.warn("test_outputSizeExceeded_throwsExtractException: did not fast-fail (was {}ms)", elapsed);
+ }
+ assertTrue(elapsed < 5000L);
+ }
+
+ /**
+ * Verifies stderr is drained concurrently so the process does not block
+ * on a full pipe buffer (default ~64KB on Linux). We write much more than
+ * that to stderr while the script also produces normal output.
+ */
+ @Test
+ public void test_stderrDrained_doesNotDeadlock() throws IOException {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ final File scriptFile;
+ try {
+ scriptFile = File.createTempFile("stderr_script", ".sh");
+ scriptFile.deleteOnExit();
+ // Write 1 MiB to stderr (well above default pipe buffer) then copy input.
+ FileUtil.writeBytes(scriptFile.getAbsolutePath(),
+ ("#!/bin/bash\nhead -c 1048576 /dev/zero 1>&2\ncp \"$1\" \"$2\"\n").getBytes());
+ } catch (final IOException e) {
+ throw new IORuntimeException(e);
+ }
+
+ final String content = "TEST";
+ final File contentFile = createContentFile(".txt", content.getBytes());
+
+ final CommandExtractor extractor = new CommandExtractor();
+ extractor.executionTimeout = 30_000L;
+ extractor.command = "sh " + scriptFile.getAbsolutePath() + " $INPUT_FILE $OUTPUT_FILE";
+ // This test focuses on draining stderr so the process does not block;
+ // suppress stderr-in-output so the assertion compares only the file content.
+ extractor.setIncludeStderrInOutput(false);
+
+ final ExtractData data = extractor.getText(new FileInputStream(contentFile), new HashMap<>());
+ assertEquals(content, data.getContent());
+ }
+
+ /**
+ * Verifies the input copy is bounded by {@code maxInputSize} and the
+ * extractor fails fast before invoking the command.
+ */
+ @Test
+ public void test_inputSizeExceeded_throwsExtractException() {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ final File scriptFile = createScriptTempFile(0);
+ final CommandExtractor extractor = new CommandExtractor();
+ extractor.command = getCommand(scriptFile);
+ extractor.maxInputSize = 1024L; // 1 KiB
+
+ // 64 KiB of zeros is well above the 1 KiB cap.
+ final InputStream big = new ByteArrayInputStream(new byte[64 * 1024]);
+ try {
+ extractor.getText(big, new HashMap<>());
+ fail();
+ } catch (final ExtractException e) {
+ // expected
+ }
+ }
+
+ /**
+ * Verifies that when a subprocess spawns a long-running child, the timeout
+ * path attempts to kill descendants too. We check this best-effort by
+ * snapshotting {@link ProcessHandle#allProcesses} before/after; the precise
+ * verification of orphan kill is platform-specific.
+ */
+ @Test
+ public void test_processDescendants_killed() throws Exception {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ final CommandExtractor extractor = new CommandExtractor();
+ // Spawn a child sleep, then wait. When the parent is killed we want the child gone too.
+ extractor.command = "sh -c \"sleep 30 & wait\"";
+ extractor.executionTimeout = 500L;
+
+ final long start = System.currentTimeMillis();
+ try {
+ extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>());
+ fail();
+ } catch (final ExecutionTimeoutException e) {
+ // expected
+ }
+ final long elapsed = System.currentTimeMillis() - start;
+ // Total wall clock should be close to the timeout, not the 30-sec sleep.
+ assertTrue(elapsed < 10_000L);
+ }
+
+ /**
+ * Verifies that when {@code standardOutput=false} and {@code includeStderrInOutput=true},
+ * captured stderr text is appended to the extracted content.
+ * Note: the opt-in flag must be set explicitly because the default is {@code false}.
+ */
+ @Test
+ public void test_stderrAppendedToOutput_whenStandardOutputFalse() throws IOException {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ // Script writes "OUT" to $OUTPUT_FILE and "ERR" to stderr.
+ final File scriptFile;
+ try {
+ scriptFile = File.createTempFile("stderr_in_out_", ".sh");
+ scriptFile.deleteOnExit();
+ FileUtil.writeBytes(scriptFile.getAbsolutePath(), "#!/bin/bash\necho OUT > \"$2\"\necho ERR >&2\n".getBytes());
+ } catch (final IOException e) {
+ throw new IORuntimeException(e);
+ }
+
+ final File contentFile = createContentFile(".txt", new byte[] { 0 });
+ final CommandExtractor extractor = new CommandExtractor();
+ extractor.command = "sh " + scriptFile.getAbsolutePath() + " $INPUT_FILE $OUTPUT_FILE";
+ extractor.executionTimeout = 30_000L;
+ // Explicitly opt in — default is false (pre-3.x behavior).
+ extractor.setIncludeStderrInOutput(true);
+
+ final ExtractData data = extractor.getText(new FileInputStream(contentFile), new HashMap<>());
+ final String text = data.getContent();
+ assertTrue(text.contains("OUT"));
+ assertTrue(text.contains("ERR"));
+ }
+
+ /**
+ * Verifies that by default ({@code includeStderrInOutput=false}), stderr is NOT
+ * appended to the extracted content even when {@code standardOutput=false}. This
+ * matches the pre-3.x behavior where stderr was only routed to logs, never to
+ * extracted text.
+ */
+ @Test
+ public void test_stderrNotInOutput_byDefault() throws IOException {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ // Script writes "content" to $OUTPUT_FILE and "warning" to stderr.
+ final File scriptFile;
+ try {
+ scriptFile = File.createTempFile("stderr_default_", ".sh");
+ scriptFile.deleteOnExit();
+ FileUtil.writeBytes(scriptFile.getAbsolutePath(), "#!/bin/bash\necho content > \"$2\"\necho warning >&2\n".getBytes());
+ } catch (final IOException e) {
+ throw new IORuntimeException(e);
+ }
+
+ final File contentFile = createContentFile(".txt", new byte[] { 0 });
+ final CommandExtractor extractor = new CommandExtractor();
+ extractor.command = "sh " + scriptFile.getAbsolutePath() + " $INPUT_FILE $OUTPUT_FILE";
+ extractor.executionTimeout = 30_000L;
+ // Do NOT call setIncludeStderrInOutput — default must be false.
+
+ final ExtractData data = extractor.getText(new FileInputStream(contentFile), new HashMap<>());
+ final String text = data.getContent();
+ assertTrue(text.contains("content"));
+ assertFalse(text.contains("warning")); // stderr should not appear in extracted text by default
+ }
+
+ /**
+ * Verifies that {@code setIncludeStderrInOutput(false)} suppresses appending
+ * stderr to the extracted content even when {@code standardOutput=false}.
+ */
+ @Test
+ public void test_stderrSuppressedFromOutput_whenIncludeStderrInOutputFalse() throws IOException {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ final File scriptFile;
+ try {
+ scriptFile = File.createTempFile("stderr_suppressed_", ".sh");
+ scriptFile.deleteOnExit();
+ FileUtil.writeBytes(scriptFile.getAbsolutePath(), "#!/bin/bash\necho OUT > \"$2\"\necho ERR >&2\n".getBytes());
+ } catch (final IOException e) {
+ throw new IORuntimeException(e);
+ }
+
+ final File contentFile = createContentFile(".txt", new byte[] { 0 });
+ final CommandExtractor extractor = new CommandExtractor();
+ extractor.command = "sh " + scriptFile.getAbsolutePath() + " $INPUT_FILE $OUTPUT_FILE";
+ extractor.executionTimeout = 30_000L;
+ extractor.setIncludeStderrInOutput(false);
+
+ final ExtractData data = extractor.getText(new FileInputStream(contentFile), new HashMap<>());
+ final String text = data.getContent();
+ assertTrue(text.contains("OUT"));
+ assertFalse(text.contains("ERR"));
+ }
+
+ // ==========================================================
+ // Fix 1: outputFile exceeds maxOutputSize → ExtractException
+ // ==========================================================
+
+ /**
+ * Fix 1 (standardOutput=true): when the command writes more than maxOutputSize bytes to
+ * stdout, getText() must throw ExtractException rather than loading the entire file into
+ * memory.
+ */
+ @Test
+ public void test_outputFileExceedsMaxOutputSize_throws() throws IOException {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ final CommandExtractor extractor = new CommandExtractor();
+ // cat $INPUT_FILE sends content to stdout; we will pipe a 5-MiB input with standardOutput=true.
+ // 5 MiB output > 1 MiB limit should trigger the guard.
+ extractor.standardOutput = true;
+ extractor.command = "sh -c \"head -c 5242880 /dev/zero\"";
+ extractor.executionTimeout = 30_000L;
+ extractor.maxOutputSize = 1024L * 1024L; // 1 MiB
+
+ try {
+ extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>());
+ fail();
+ } catch (final ExtractException e) {
+ // expected
+ }
+ }
+
+ /**
+ * Fix 1 (standardOutput=false): when the command writes more than maxOutputSize bytes to
+ * $OUTPUT_FILE, getText() must throw ExtractException (length-check defensive line).
+ */
+ @Test
+ public void test_outputFileExceedsMaxOutputSize_standardOutputFalse_throws() throws IOException {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ final File scriptFile;
+ try {
+ scriptFile = File.createTempFile("big_output_", ".sh");
+ scriptFile.deleteOnExit();
+ // Write 5 MiB of zeros to $OUTPUT_FILE directly (bypasses BoundedStreamReader).
+ FileUtil.writeBytes(scriptFile.getAbsolutePath(), "#!/bin/bash\nhead -c 5242880 /dev/zero > \"$2\"\n".getBytes());
+ } catch (final IOException e) {
+ throw new IORuntimeException(e);
+ }
+
+ final CommandExtractor extractor = new CommandExtractor();
+ extractor.command = "sh " + scriptFile.getAbsolutePath() + " $INPUT_FILE $OUTPUT_FILE";
+ extractor.executionTimeout = 30_000L;
+ extractor.maxOutputSize = 1024L * 1024L; // 1 MiB
+
+ try {
+ extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>());
+ fail();
+ } catch (final ExtractException e) {
+ // expected
+ }
+ }
+
+ // ==========================================================
+ // Fix 2: descendants are killed even when parent exits cleanly
+ // ==========================================================
+
+ /**
+ * Fix 2: verifies that after a timeout the spawned child process (sleep) is killed even
+ * when the parent shell exits gracefully before the grace period expires and would otherwise
+ * cause descendants() to return an empty snapshot.
+ */
+ @Test
+ public void test_processDescendants_killed_orphan_gone() throws Exception {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ final CommandExtractor extractor = new CommandExtractor();
+ // The shell spawns a background sleep and prints its PID to stdout, then waits.
+ // standardOutput=false so we can read stdout text via BoundedStreamReader.
+ extractor.command = "sh -c \"sleep 30 & echo $!; wait\"";
+ extractor.executionTimeout = 800L;
+ extractor.destroyGracePeriodMillis = 200L;
+
+ try {
+ extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>());
+ fail();
+ } catch (final ExecutionTimeoutException e) {
+ // expected — but we did not capture stdout here since the exception fires before we collect it.
+ } catch (final Exception e) {
+ // acceptable
+ }
+
+ // Give the OS a moment to reap the child.
+ Thread.sleep(500);
+
+ // Best-effort check: find any process with "sleep 30" in its command that is still alive.
+ final boolean orphanAlive = ProcessHandle.allProcesses()
+ .filter(ProcessHandle::isAlive)
+ .anyMatch(ph -> ph.info().command().map(c -> c.contains("sleep")).orElse(false) && ph.info().arguments().map(args -> {
+ for (final String a : args) {
+ if ("30".equals(a)) {
+ return true;
+ }
+ }
+ return false;
+ }).orElse(false));
+
+ if (orphanAlive) {
+ logger.warn("test_processDescendants_killed_orphan_gone: orphan 'sleep 30' still alive after kill attempt");
+ }
+ assertFalse(orphanAlive);
+ }
+
+ // ==========================================================
+ // Fix 3: overflow kills descendants via destroyProcessTree
+ // ==========================================================
+
+ /**
+ * Fix 3 + overflow fail-fast: when BoundedStreamReader detects overflow it throws
+ * OutputSizeExceededException and sets the overflow flag so the main thread breaks
+ * out of its polling loop and calls destroyProcessTree immediately — well before the
+ * full executionTimeout elapses. Also verifies the background sleep child is reaped.
+ */
+ @Test
+ public void test_overflow_killsDescendants() throws Exception {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ final CommandExtractor extractor = new CommandExtractor();
+ // Spawn a background sleep, then flood stderr with zeros to trigger overflow.
+ // standardOutput=false so stdout is piped; stderr triggers BoundedStreamReader overflow.
+ extractor.command = "sh -c \"sleep 30 & head -c 5242880 /dev/zero 1>&2; wait\"";
+ // Use 10 s timeout — test must complete well below this if fail-fast works.
+ extractor.executionTimeout = 10_000L;
+ extractor.maxOutputSize = 1024L * 1024L; // 1 MiB
+ extractor.destroyGracePeriodMillis = 200L;
+
+ final long start = System.currentTimeMillis();
+ try {
+ extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>());
+ fail();
+ } catch (final ExtractException e) {
+ // expected — overflow detected
+ }
+ final long elapsed = System.currentTimeMillis() - start;
+ // Overflow should be detected and process killed well before the 10 s timeout.
+ assertTrue(elapsed < 5000L); // overflow should kill process tree in < 5 s
+
+ Thread.sleep(500);
+
+ final boolean orphanAlive = ProcessHandle.allProcesses()
+ .filter(ProcessHandle::isAlive)
+ .anyMatch(ph -> ph.info().command().map(c -> c.contains("sleep")).orElse(false) && ph.info().arguments().map(args -> {
+ for (final String a : args) {
+ if ("30".equals(a)) {
+ return true;
+ }
+ }
+ return false;
+ }).orElse(false));
+
+ if (orphanAlive) {
+ logger.warn("test_overflow_killsDescendants: orphan 'sleep 30' still alive after kill attempt");
+ }
+ assertFalse(orphanAlive);
+ }
+
+ // ==========================================================
+ // Fix 4: setMaxOutputLine conservatively shrinks maxOutputSize
+ // ==========================================================
+
+ /**
+ * Fix 4a: setMaxOutputLine(1) should shrink maxOutputSize to max(64KiB, 1*1024) = 64 KiB
+ * when setMaxOutputSize has not been called.
+ */
+ @Test
+ @SuppressWarnings("deprecation")
+ public void test_setMaxOutputLine_shrinksMaxOutputSize() {
+ final CommandExtractor extractor = new CommandExtractor();
+ // Default maxOutputSize is 10 MiB; setMaxOutputLine(1) infers 64 KiB (floor), which is smaller.
+ extractor.setMaxOutputLine(1);
+ assertTrue(extractor.getMaxOutputSize() <= 64L * 1024L);
+ }
+
+ /**
+ * Fix 4b: when setMaxOutputSize has been called explicitly, a subsequent setMaxOutputLine
+ * must not override it — the explicit value must be preserved regardless of call order.
+ */
+ @Test
+ @SuppressWarnings("deprecation")
+ public void test_setMaxOutputLine_doesNotOverrideExplicitMaxOutputSize() {
+ final CommandExtractor extractor = new CommandExtractor();
+ extractor.setMaxOutputSize(50L * 1024L * 1024L); // 50 MiB explicit
+ extractor.setMaxOutputLine(1); // would infer 64 KiB but must not override explicit
+ assertEquals(50L * 1024L * 1024L, extractor.getMaxOutputSize());
+ }
+
+ // ==========================================================
+ // PR-159 review-fix tests
+ // ==========================================================
+
+ /**
+ * Validates C1: standardOutput=true overflow must fail fast (not wait for executionTimeout).
+ */
+ @Test
+ public void test_standardOutputOverflow_failsFastWithoutWaitingForTimeout() throws IOException {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ final CommandExtractor extractor = new CommandExtractor();
+ extractor.standardOutput = true;
+ // Produce 5 MiB then keep running. Without C1 fix the test would wait the full executionTimeout.
+ extractor.command = "sh -c \"head -c 5242880 /dev/zero; sleep 30\"";
+ extractor.executionTimeout = 10_000L;
+ extractor.maxOutputSize = 1024L * 1024L;
+ final long start = System.currentTimeMillis();
+ try {
+ extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>());
+ fail();
+ } catch (final ExtractException e) {
+ // expected
+ }
+ final long elapsed = System.currentTimeMillis() - start;
+ if (elapsed >= 5000L) {
+ logger.warn("test_standardOutputOverflow_failsFastWithoutWaitingForTimeout: overflow did not fail fast (was {}ms)", elapsed);
+ }
+ assertTrue(elapsed < 5000L);
+ }
+
+ /**
+ * Validates C4: invalid charset name falls back to UTF-8 silently (warning logged).
+ */
+ @Test
+ public void test_invalidCommandOutputEncoding_fallsBackToUtf8() throws IOException {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ final File scriptFile = createScriptTempFileStdout(0);
+ final File contentFile = createContentFile(".txt", "HELLO".getBytes());
+ final CommandExtractor extractor = new CommandExtractor();
+ extractor.standardOutput = true;
+ extractor.command = getCommandStdout(scriptFile);
+ extractor.setCommandOutputEncoding("not-a-real-charset-12345");
+ extractor.executionTimeout = 30_000L;
+ // Should NOT throw — falls back to UTF-8 silently (warning logged).
+ final ExtractData data = extractor.getText(new FileInputStream(contentFile), new HashMap<>());
+ assertEquals("HELLO", data.getContent());
+ }
+
+ /**
+ * Verifies a blank command surfaces as a CrawlerSystemException rather than NPE/IAE.
+ */
+ @Test
+ public void test_blankCommand_throwsCrawlerSystemException() {
+ final CommandExtractor extractor = new CommandExtractor();
+ extractor.command = " ";
+ try {
+ extractor.getText(new ByteArrayInputStream("x".getBytes()), new HashMap<>());
+ fail();
+ } catch (final org.codelibs.fess.crawler.exception.CrawlerSystemException e) {
+ // expected
+ }
+ }
+
+ /**
+ * Verifies a nonexistent binary surfaces as a CrawlerSystemException.
+ */
+ @Test
+ public void test_nonexistentBinary_throwsCrawlerSystemException() {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ final CommandExtractor extractor = new CommandExtractor();
+ extractor.command = "/nonexistent/path/to/binary $INPUT_FILE $OUTPUT_FILE";
+ try {
+ extractor.getText(new ByteArrayInputStream("x".getBytes()), new HashMap<>());
+ fail();
+ } catch (final org.codelibs.fess.crawler.exception.CrawlerSystemException e) {
+ // expected
+ }
+ }
+
+ /**
+ * Validates M4: non-zero exit must NOT throw — content is still returned (lenient contract).
+ */
+ @Test
+ public void test_nonZeroExit_returnsAvailableContent() throws IOException {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ final File scriptFile;
+ try {
+ scriptFile = File.createTempFile("nonzero_", ".sh");
+ scriptFile.deleteOnExit();
+ FileUtil.writeBytes(scriptFile.getAbsolutePath(), "#!/bin/bash\necho partial > \"$2\"\nexit 7\n".getBytes());
+ } catch (final IOException e) {
+ throw new IORuntimeException(e);
+ }
+ final File contentFile = createContentFile(".txt", new byte[] { 0 });
+ final CommandExtractor extractor = new CommandExtractor();
+ extractor.command = "sh " + scriptFile.getAbsolutePath() + " $INPUT_FILE $OUTPUT_FILE";
+ extractor.executionTimeout = 30_000L;
+ // Lenient contract: non-zero exit does NOT throw; content is returned.
+ final ExtractData data = extractor.getText(new FileInputStream(contentFile), new HashMap<>());
+ assertTrue(data.getContent().contains("partial"));
+ }
+
+ /**
+ * Tightened version of test_setMaxOutputLine_shrinksMaxOutputSize: verifies exact 64 KiB floor.
+ */
+ @Test
+ @SuppressWarnings("deprecation")
+ public void test_setMaxOutputLine_shrinksMaxOutputSize_toExactFloor() {
+ final CommandExtractor extractor = new CommandExtractor();
+ extractor.setMaxOutputLine(1);
+ assertEquals(64L * 1024L, extractor.getMaxOutputSize());
+ }
+
+ /**
+ * Verifies M8: resourceName containing shell metacharacters is sanitized so it cannot be used
+ * to inject arguments via the temp-file prefix.
+ */
+ @Test
+ public void test_resourceNameWithMetacharacters_sanitizedInTempFileName() throws IOException {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ final File scriptFile = createScriptTempFile(0);
+ final File contentFile = createContentFile(".txt", "OK".getBytes());
+ final CommandExtractor extractor = new CommandExtractor();
+ extractor.command = getCommand(scriptFile);
+ final Map params = new HashMap<>();
+ // A maliciously crafted resourceName with shell metacharacters.
+ params.put(ExtractData.RESOURCE_NAME_KEY, "evil;rm -rf /.--reset.txt");
+ final ExtractData data = extractor.getText(new FileInputStream(contentFile), params);
+ assertEquals("OK", data.getContent());
+ }
+
+ /**
+ * Verifies M8: getFileName handles Windows-style backslash path separators.
+ */
+ @Test
+ public void test_getFileName_handlesWindowsBackslash() {
+ final CommandExtractor extractor = new CommandExtractor();
+ assertEquals("hoge.txt", extractor.getFileName("C:\\path\\hoge.txt"));
+ assertEquals("hoge.txt", extractor.getFileName("path\\hoge.txt"));
+ assertEquals("hoge.txt", extractor.getFileName("hoge.txt\\"));
+ }
+
+ /**
+ * Verifies thread interrupt is propagated and the interrupt flag is restored.
+ */
+ @Test
+ public void test_interrupt_propagatesAsInterruptedRuntimeException() throws InterruptedException {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ final CommandExtractor extractor = new CommandExtractor();
+ extractor.command = "sh -c \"sleep 30\"";
+ extractor.executionTimeout = 30_000L;
+ final Thread target = Thread.currentThread();
+ final Thread interrupter = new Thread(() -> {
+ try {
+ Thread.sleep(500);
+ } catch (final InterruptedException ignored) {}
+ target.interrupt();
+ });
+ interrupter.start();
+ try {
+ extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>());
+ fail();
+ } catch (final org.codelibs.core.exception.InterruptedRuntimeException expected) {
+ // good
+ } catch (final Exception e) {
+ // also acceptable as long as interrupt flag was restored
+ } finally {
+ Thread.interrupted(); // clear interrupt for subsequent tests
+ }
+ }
+
+ /**
+ * Verifies an invalid workingDirectory surfaces as a CrawlerSystemException.
+ */
+ @Test
+ public void test_invalidWorkingDirectory_throwsCrawlerSystemException() throws IOException {
+ if (!SystemUtils.IS_OS_UNIX) {
+ return;
+ }
+ final File scriptFile = createScriptTempFile(0);
+ final File contentFile = createContentFile(".txt", "x".getBytes());
+ final CommandExtractor extractor = new CommandExtractor();
+ extractor.command = getCommand(scriptFile);
+ extractor.workingDirectory = new File("/nonexistent/path/that/does/not/exist");
+ try {
+ extractor.getText(new FileInputStream(contentFile), new HashMap<>());
+ fail();
+ } catch (final org.codelibs.fess.crawler.exception.CrawlerSystemException e) {
+ // expected
+ }
+ }
+
+ /**
+ * Verifies the deprecated MonitorThread stub still works as a NOP.
+ */
+ @Test
+ @SuppressWarnings("deprecation")
+ public void test_deprecatedMonitorThread_isNop() {
+ final CommandExtractor.MonitorThread mt = new CommandExtractor.MonitorThread(null, 1000L);
+ mt.setFinished(true);
+ assertFalse(mt.isTeminated());
+ }
+
+ /**
+ * Verifies the deprecated InputStreamThread stub still works as a NOP.
+ */
+ @Test
+ @SuppressWarnings("deprecation")
+ public void test_deprecatedInputStreamThread_isNop() {
+ final CommandExtractor.InputStreamThread it =
+ new CommandExtractor.InputStreamThread(new ByteArrayInputStream(new byte[0]), "UTF-8", 100);
+ assertEquals("", it.getOutput());
+ }
}