diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/CommandExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/CommandExtractor.java index 043db122..a4f186a6 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/CommandExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/CommandExtractor.java @@ -15,26 +15,37 @@ */ package org.codelibs.fess.crawler.extractor.impl; -import java.io.BufferedReader; +import java.io.BufferedOutputStream; +import java.io.ByteArrayOutputStream; import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.StandardCharsets; +import java.nio.charset.UnsupportedCharsetException; import java.util.ArrayList; import java.util.HashMap; -import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.codelibs.core.exception.InterruptedRuntimeException; -import org.codelibs.core.io.CopyUtil; import org.codelibs.core.io.FileUtil; import org.codelibs.core.lang.StringUtil; -import org.codelibs.core.lang.ThreadUtil; import org.codelibs.fess.crawler.Constants; import org.codelibs.fess.crawler.entity.ExtractData; import org.codelibs.fess.crawler.exception.CrawlerSystemException; @@ -68,12 +79,46 @@ public class CommandExtractor extends AbstractExtractor { /** The encoding for the command's output. */ protected String commandOutputEncoding = Charset.defaultCharset().displayName(); - /** The maximum number of lines to buffer from command output. */ + /** + * The (formerly) maximum number of lines to buffer from command output. + * + * @deprecated The line-count cap has been removed in favor of a byte-count cap. + * This field is no longer consulted; use {@link #setMaxOutputSize(long)} + * to bound how many bytes are read from stdout/stderr. The field is + * retained only for source/binary compatibility with callers that set + * it via {@link #setMaxOutputLine(int)} or via reflection. + */ + @Deprecated protected int maxOutputLine = 1000; /** Whether to redirect standard output to a file. */ protected boolean standardOutput = false; + /** Maximum bytes the subprocess is allowed to write to a single drained stream (stdout/stderr). */ + protected long maxOutputSize = 10L * 1024L * 1024L; // 10 MiB + + /** Whether {@link #setMaxOutputSize(long)} has been called explicitly, preventing {@link #setMaxOutputLine(int)} from overriding it. */ + private boolean maxOutputSizeExplicit = false; + + /** Maximum bytes copied from the input stream into the temporary input file. */ + protected long maxInputSize = 100L * 1024L * 1024L; // 100 MiB + + /** Grace period (ms) given to a process after destroy() before destroyForcibly() is invoked. */ + protected long destroyGracePeriodMillis = 2000L; + + /** + * Whether to append captured stderr text to the extracted content when + * {@link #standardOutput} is {@code false}. Defaults to {@code false} to + * match pre-3.x behavior where stderr was only routed to logs, never to + * extracted text. (The original {@code ProcessBuilder.redirectErrorStream(true)} + * call only merged the streams for log draining; it never caused stderr to + * appear in {@code ExtractData}.) + * + *

Set to {@code true} to append captured stderr after the file content in + * the extracted body when {@code standardOutput=false}. + */ + protected boolean includeStderrInOutput = false; + /** * Constructs a new CommandExtractor. */ @@ -115,6 +160,18 @@ public ExtractData getText(final InputStream in, final Map param filePrefix = "none"; extention = ""; } + // Sanitize prefix and extension to prevent argument-injection via crafted resourceName + // when downstream commands re-expand args through a shell wrapper. + filePrefix = filePrefix.replaceAll("[^A-Za-z0-9._-]", "_"); + if (filePrefix.startsWith("-")) { + filePrefix = "_" + filePrefix; + } + if (!extention.isEmpty()) { + extention = extention.replaceAll("[^A-Za-z0-9]", ""); + if (extention.length() > 16) { + extention = extention.substring(0, 16); + } + } File inputFile = null; File outputFile = null; try { @@ -132,35 +189,76 @@ public ExtractData getText(final InputStream in, final Map param } outputFile = createTempFile("cmdextout_" + filePrefix + "_", ext, tempDir); - // store to a file - CopyUtil.copy(in, inputFile); + // store to a file (bounded by maxInputSize) + copyToFileBounded(in, inputFile, maxInputSize); + + final String stderrText = executeCommand(inputFile, outputFile); - executeCommand(inputFile, outputFile); + // For standardOutput=false this is the only guard against the external command writing + // an unbounded $OUTPUT_FILE — bounded readers cover stdout/stderr only. + if (outputFile.length() > maxOutputSize) { + logger.warn("output file size exceeded limit: size={} limit={} command={}", outputFile.length(), maxOutputSize, command); + throw new ExtractException("output file size exceeded limit: limit=" + maxOutputSize); + } - final ExtractData extractData = new ExtractData(new String(FileUtil.readBytes(outputFile), outputEncoding)); + final StringBuilder contentBuf = new StringBuilder(); + contentBuf.append(new String(FileUtil.readBytes(outputFile), outputEncoding)); + // For backward compatibility with the legacy implementation that used + // ProcessBuilder.redirectErrorStream(true) when standardOutput=false, + // append captured stderr text to the extracted content. + if (!standardOutput && includeStderrInOutput && StringUtil.isNotEmpty(stderrText)) { + contentBuf.append(stderrText); + } + final ExtractData extractData = new ExtractData(contentBuf.toString()); if (StringUtil.isNotBlank(resourceName)) { extractData.putValues("resourceName", new String[] { resourceName }); } return extractData; } catch (final IOException e) { - throw new ExtractException("Could not extract a content.", e); + throw new ExtractException("Could not extract content: resourceName=" + resourceName + " command=" + command, e); } finally { FileUtil.deleteInBackground(inputFile); FileUtil.deleteInBackground(outputFile); } } + /** + * Copies an input stream into the destination file but stops and throws if the + * number of bytes read exceeds {@code limit}. + * + * @param in the input stream to read from + * @param dest the destination file + * @param limit the maximum number of bytes allowed + * @throws IOException if an I/O error occurs + * @throws ExtractException if the input stream exceeds {@code limit} + */ + protected void copyToFileBounded(final InputStream in, final File dest, final long limit) throws IOException { + long total = 0L; + final byte[] buffer = new byte[8192]; + try (BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(dest))) { + int n; + while ((n = in.read(buffer)) != -1) { + total += n; + if (total > limit) { + logger.warn("input size exceeded limit: limit={} command={}", limit, command); + throw new ExtractException("input size exceeded limit: limit=" + limit); + } + out.write(buffer, 0, n); + } + } + } + String getFileName(final String resourceName) { - final String name = resourceName.replaceAll("/+$", ""); - final int pos = name.lastIndexOf('/'); + final String name = resourceName.replaceAll("[/\\\\]+$", ""); + final int pos = Math.max(name.lastIndexOf('/'), name.lastIndexOf('\\')); if (pos >= 0) { return name.substring(pos + 1); } return name; } - private void executeCommand(final File inputFile, final File outputFile) { + private String executeCommand(final File inputFile, final File outputFile) { if (StringUtil.isBlank(command)) { throw new CrawlerSystemException("External command is empty. Please configure a valid command for CommandExtractor."); @@ -172,74 +270,285 @@ private void executeCommand(final File inputFile, final File outputFile) { final List cmdList = parseCommand(command, params); if (logger.isInfoEnabled()) { - logger.info("Command: {}", cmdList); + logger.info("executing command: command={}", cmdList); } final ProcessBuilder pb = new ProcessBuilder(cmdList); if (workingDirectory != null) { pb.directory(workingDirectory); } - if (standardOutput) { - pb.redirectOutput(outputFile); - } else { - pb.redirectErrorStream(true); - } + // Do not use pb.redirectOutput(outputFile) for standardOutput=true: OS-level redirection + // bypasses the BoundedStreamReader, allowing unbounded writes. Instead, pipe stdout through + // a BoundedFileWriter that enforces maxOutputSize. + + // Note: do not redirect error stream; we want stderr drained separately always. Process currentProcess = null; - MonitorThread mt = null; + ExecutorService streamPool = null; + Charset outCharset; + try { + outCharset = Charset.forName(commandOutputEncoding); + } catch (final IllegalCharsetNameException | UnsupportedCharsetException e) { + logger.warn("invalid commandOutputEncoding, falling back to UTF-8: encoding={} command={}", commandOutputEncoding, command, e); + outCharset = StandardCharsets.UTF_8; + } + + final AtomicBoolean shuttingDown = new AtomicBoolean(false); try { currentProcess = pb.start(); - // monitoring - mt = new MonitorThread(currentProcess, executionTimeout); - mt.start(); + streamPool = Executors.newFixedThreadPool(2, new DaemonThreadFactory("CommandExtractor-stream")); - final InputStreamThread it = new InputStreamThread(currentProcess.getInputStream(), commandOutputEncoding, maxOutputLine); - it.start(); + final Process processRef = currentProcess; + final Charset charset = outCharset; + final long limit = maxOutputSize; + final AtomicBoolean overflowFlag = new AtomicBoolean(false); + final Future stdoutFuture; + if (standardOutput) { + // Drain stdout into outputFile with a byte-count bound to honour maxOutputSize. + stdoutFuture = streamPool.submit( + new BoundedFileWriter(processRef.getInputStream(), outputFile, limit, "stdout", overflowFlag, shuttingDown)); + } else { + stdoutFuture = streamPool + .submit(new BoundedStreamReader(processRef.getInputStream(), charset, limit, "stdout", overflowFlag, shuttingDown)); + } + final Future stderrFuture = streamPool + .submit(new BoundedStreamReader(processRef.getErrorStream(), charset, limit, "stderr", overflowFlag, shuttingDown)); - currentProcess.waitFor(); - it.join(5000); + // Poll for process exit so that an overflow detected by a reader thread can + // break out immediately rather than blocking for the full executionTimeout. + final long deadline = System.currentTimeMillis() + executionTimeout; + boolean exited = false; + while (System.currentTimeMillis() < deadline) { + if (overflowFlag.get()) { + // Reader thread detected overflow; kill the process tree immediately. + break; + } + if (currentProcess.waitFor(200, TimeUnit.MILLISECONDS)) { + exited = true; + break; + } + } - if (mt.isTeminated()) { - throw new ExecutionTimeoutException("The command execution is timeout: " + cmdList); + if (!exited) { + if (overflowFlag.get()) { + // Overflow path: kill the process tree, then surface the reader's exception. + shuttingDown.set(true); + destroyProcessTree(currentProcess); + // Collect exceptions from both futures with a short timeout. + for (final Future f : List.of(stdoutFuture, stderrFuture)) { + try { + f.get(2, TimeUnit.SECONDS); + } catch (final ExecutionException ee) { + final Throwable cause = ee.getCause(); + if (cause instanceof OutputSizeExceededException) { + logger.warn("command output exceeded limit: limit={} command={}", maxOutputSize, cmdList); + throw new ExtractException("command output exceeded limit: limit=" + maxOutputSize, cause); + } + logger.warn("unexpected error draining stream during overflow handling: command={}", cmdList, cause); + throw new CrawlerSystemException("Failed to drain command output during overflow.", cause); + } catch (final TimeoutException te) { + // overflow already detected; surface the ExtractException below + } catch (final InterruptedException ie) { + Thread.currentThread().interrupt(); + break; + } + } + // guard: overflow flag set but no overflow exception observed + throw new ExtractException("command output exceeded limit: limit=" + maxOutputSize); + } else { + // Timeout path. + logger.warn("command timed out: timeout={}ms command={}", executionTimeout, cmdList); + shuttingDown.set(true); + destroyProcessTree(currentProcess); + throw new ExecutionTimeoutException("The command execution is timeout: " + cmdList); + } + } + + // Process exited; collect drained output. Use a short timeout to avoid waiting forever + // if the reader threads are stuck (they shouldn't be since the streams are now closed). + String stdoutText = ""; + String stderrText = ""; + try { + stdoutText = stdoutFuture.get(5, TimeUnit.SECONDS); + stderrText = stderrFuture.get(5, TimeUnit.SECONDS); + } catch (final TimeoutException te) { + logger.warn("timed out collecting drained output: command={}", cmdList); + stdoutFuture.cancel(true); + stderrFuture.cancel(true); + destroyProcessTree(currentProcess); + throw new ExtractException("Failed to drain command output within 5s: command=" + cmdList, te); + } catch (final ExecutionException ee) { + final Throwable cause = ee.getCause(); + if (cause instanceof OutputSizeExceededException) { + logger.warn("command output exceeded limit: limit={} command={}", maxOutputSize, cmdList); + shuttingDown.set(true); + destroyProcessTree(currentProcess); + throw new ExtractException("command output exceeded limit: limit=" + maxOutputSize, cause); + } + if (cause instanceof IOException) { + throw new CrawlerSystemException("Failed to drain command output.", cause); + } + throw new CrawlerSystemException("Failed to drain command output.", ee); } final int exitValue = currentProcess.exitValue(); + if (exitValue != 0) { + logger.warn("command exited non-zero: exitCode={} command={}", exitValue, cmdList); + } if (logger.isInfoEnabled()) { if (standardOutput) { - logger.info("Exit Code: {}", exitValue); + logger.info("command exited: exitCode={}", exitValue); } else { - logger.info("Exit Code: {} - Process Output:\n{}", exitValue, it.getOutput()); + logger.info("command exited: exitCode={} stdout={}", exitValue, truncateForLog(stdoutText)); + } + if (StringUtil.isNotEmpty(stderrText)) { + logger.info("command stderr: stderr={}", truncateForLog(stderrText)); } } - if (exitValue == 143 && mt.isTeminated()) { - throw new ExecutionTimeoutException("The command execution is timeout: " + cmdList); - } + return stderrText; } catch (final CrawlerSystemException e) { throw e; } catch (final InterruptedException e) { - if (mt != null && mt.isTeminated()) { - throw new ExecutionTimeoutException("The command execution is timeout: " + cmdList, e); + Thread.currentThread().interrupt(); + if (currentProcess != null) { + shuttingDown.set(true); + destroyProcessTree(currentProcess); } throw new InterruptedRuntimeException(e); } catch (final Exception e) { - throw new CrawlerSystemException("Process terminated.", e); + logger.warn("unexpected error executing command: command={}", cmdList, e); + throw new CrawlerSystemException("Failed to execute command: " + cmdList, e); } finally { - if (mt != null) { - mt.setFinished(true); + if (currentProcess != null && currentProcess.isAlive()) { + shuttingDown.set(true); + destroyProcessTree(currentProcess); + } + if (streamPool != null) { + streamPool.shutdownNow(); try { - mt.interrupt(); - } catch (final Exception e) {} + if (!streamPool.awaitTermination(1, TimeUnit.SECONDS)) { + logger.warn("stream pool did not terminate within 1s: command={}", cmdList); + } + } catch (final InterruptedException ie) { + Thread.currentThread().interrupt(); + } } - if (currentProcess != null) { + } + } + + /** + * Sends SIGTERM, waits a grace period, then SIGKILL to the process and any descendants. + * Descendants are snapshotted before sending SIGTERM so that children reparented to + * PID 1 after the parent exits are still reachable. + * + * @param process the process to terminate + */ + protected void destroyProcessTree(final Process process) { + if (process == null) { + return; + } + // Snapshot descendants before sending SIGTERM. If the parent shell receives SIGTERM and + // exits gracefully its children may be reparented to PID 1, making them invisible via + // process.descendants() after the fact. Taking the snapshot first ensures we still reach them. + final List descendantSnapshot; + try { + descendantSnapshot = process.descendants().collect(Collectors.toList()); + } catch (final Exception e) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to snapshot descendants.", e); + } + // Fall back to an empty list; we will still kill the parent below. + try { + process.destroy(); + } catch (final Exception ex) { + if (logger.isDebugEnabled()) { + logger.debug("destroy step failed: pid={}", process == null ? -1 : process.pid(), ex); + } + } + try { + process.destroyForcibly(); + } catch (final Exception ex) { + if (logger.isDebugEnabled()) { + logger.debug("destroy step failed: pid={}", process == null ? -1 : process.pid(), ex); + } + } + return; + } + try { + process.destroy(); + } catch (final Exception ex) { + if (logger.isDebugEnabled()) { + logger.debug("destroy step failed: pid={}", process == null ? -1 : process.pid(), ex); + } + } + try { + process.waitFor(destroyGracePeriodMillis, TimeUnit.MILLISECONDS); + } catch (final InterruptedException ie) { + Thread.currentThread().interrupt(); + } + // Forcibly kill parent and all snapshotted descendants regardless of whether the parent + // exited cleanly during the grace period. + for (final ProcessHandle h : descendantSnapshot) { + try { + if (h.isAlive()) { + h.destroyForcibly(); + } + } catch (final Exception ex) { + if (logger.isDebugEnabled()) { + logger.debug("destroy step failed: pid={}", h.pid(), ex); + } + } + } + try { + process.destroyForcibly(); + } catch (final Exception ex) { + if (logger.isDebugEnabled()) { + logger.debug("destroy step failed: pid={}", process == null ? -1 : process.pid(), ex); + } + } + // Bounded re-scan loop to catch fork-after-snapshot. + for (int attempt = 0; attempt < 3; attempt++) { + final List live; + try { + live = process.descendants().filter(ProcessHandle::isAlive).collect(Collectors.toList()); + } catch (final Exception e) { + if (logger.isDebugEnabled()) { + logger.debug("rescan failed: pid={}", process.pid(), e); + } + break; + } + if (live.isEmpty()) { + break; + } + for (final ProcessHandle h : live) { try { - currentProcess.destroy(); - } catch (final Exception e) {} + h.destroyForcibly(); + } catch (final Exception e) { + if (logger.isDebugEnabled()) { + logger.debug("destroy descendant failed: pid={}", h.pid(), e); + } + } } - currentProcess = null; + try { + Thread.sleep(50); + } catch (final InterruptedException ie) { + Thread.currentThread().interrupt(); + break; + } + } + } + private static String truncateForLog(final String s) { + if (s == null) { + return ""; + } + final int max = 4000; + if (s.length() <= max) { + return s; } + return s.substring(0, max) + "... [truncated]"; } /** @@ -301,119 +610,304 @@ private String getCommandValue(final String key, final Map param } /** - * Thread to monitor and terminate processes that exceed the timeout. + * Daemon thread factory for stream-drain workers. */ - protected static class MonitorThread extends Thread { - private final Process process; + protected static class DaemonThreadFactory implements ThreadFactory { + private final String namePrefix; + private final AtomicInteger counter = new AtomicInteger(); - private final long timeout; + /** + * Constructs a new DaemonThreadFactory. + * + * @param namePrefix the prefix used to name threads created by this factory + */ + public DaemonThreadFactory(final String namePrefix) { + this.namePrefix = namePrefix; + } - private boolean finished = false; + @Override + public Thread newThread(final Runnable r) { + final Thread t = new Thread(r, namePrefix + "-" + counter.incrementAndGet()); + t.setDaemon(true); + return t; + } + } - private boolean teminated = false; + /** + * Reads a stream fully (up to {@code limit} bytes) and returns it as a String. + * If the stream produces more than {@code limit} bytes, {@code overflowFlag} is + * set to {@code true} and an {@link OutputSizeExceededException} is thrown so + * that the main thread can detect the overflow without waiting for the full + * {@code executionTimeout}. + */ + protected static class BoundedStreamReader implements Callable { + private final InputStream stream; + private final Charset charset; + private final long limit; + private final String streamName; + private final AtomicBoolean overflowFlag; + private final AtomicBoolean shuttingDown; /** - * Constructs a new MonitorThread. - * @param process The process to monitor. - * @param timeout The timeout for the process. + * Constructs a new BoundedStreamReader. + * + * @param stream the input stream to drain + * @param charset the charset used to decode the bytes + * @param limit the maximum number of bytes accepted before the process is killed + * @param streamName a label used in error messages and logs + * @param overflowFlag shared flag that is set to {@code true} just before + * {@link OutputSizeExceededException} is thrown, allowing the main + * thread's polling loop to break early and kill the process tree + * @param shuttingDown shared flag indicating the main thread is in the process of killing + * the subprocess; IOExceptions observed while this flag is set are + * logged at debug level rather than surfaced as ExecutionExceptions */ - public MonitorThread(final Process process, final long timeout) { - this.process = process; - this.timeout = timeout; + public BoundedStreamReader(final InputStream stream, final Charset charset, final long limit, final String streamName, + final AtomicBoolean overflowFlag, final AtomicBoolean shuttingDown) { + this.stream = stream; + this.charset = charset; + this.limit = limit; + this.streamName = streamName; + this.overflowFlag = overflowFlag; + this.shuttingDown = shuttingDown; } @Override - public void run() { - ThreadUtil.sleepQuietly(timeout); + public String call() throws IOException { + final byte[] buf = new byte[8192]; + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + long total = 0L; + try (InputStream is = stream) { + int n; + while ((n = is.read(buf)) != -1) { + total += n; + if (total > limit) { + // Signal the main thread's polling loop before throwing so it can + // break immediately and kill the process tree rather than blocking + // for the remainder of executionTimeout. + overflowFlag.set(true); + throw new OutputSizeExceededException( + "command output exceeded limit on " + streamName + ": limit=" + limit + " bytes"); + } + baos.write(buf, 0, n); + } + } catch (final OutputSizeExceededException e) { + throw e; + } catch (final IOException ioe) { + if (shuttingDown.get()) { + if (logger.isDebugEnabled()) { + logger.debug("stream closed during shutdown: stream={}", streamName, ioe); + } + } else { + logger.warn("unexpected I/O error on stream: stream={} drainedBytes={}", streamName, total, ioe); + throw ioe; + } + } + return new String(baos.toByteArray(), charset); + } + } - if (!finished) { - try { - process.destroy(); - teminated = true; - } catch (final Exception e) { - if (logger.isInfoEnabled()) { - logger.info("Could not kill the subprocess.", e); + /** + * Internal signal that the subprocess produced more output than allowed. + */ + protected static class OutputSizeExceededException extends IOException { + private static final long serialVersionUID = 1L; + + /** + * Constructs a new OutputSizeExceededException. + * + * @param message the detail message + */ + public OutputSizeExceededException(final String message) { + super(message); + } + } + + /** + * Copies bytes from an {@link InputStream} to a {@link File} up to {@code limit} bytes. + * When the limit is exceeded an {@link OutputSizeExceededException} is thrown so that the + * caller can invoke {@link CommandExtractor#destroyProcessTree(Process)} to terminate the + * subprocess and its descendants. The return value is always an empty string; the written + * content is in the output file. + */ + protected static class BoundedFileWriter implements Callable { + private final InputStream stream; + private final File outputFile; + private final long limit; + private final String streamName; + private final AtomicBoolean overflowFlag; + private final AtomicBoolean shuttingDown; + + /** + * Constructs a new BoundedFileWriter. + * + * @param stream the input stream to drain + * @param outputFile the file to write output to + * @param limit the maximum number of bytes accepted before the process is killed + * @param streamName a label used in error messages and logs + * @param overflowFlag shared flag set to {@code true} just before + * {@link OutputSizeExceededException} is thrown so the main thread's + * polling loop breaks out early instead of blocking until executionTimeout + * @param shuttingDown shared flag indicating the main thread is in the process of killing + * the subprocess; IOExceptions observed while this flag is set are + * logged at debug level rather than surfaced as ExecutionExceptions + */ + public BoundedFileWriter(final InputStream stream, final File outputFile, final long limit, final String streamName, + final AtomicBoolean overflowFlag, final AtomicBoolean shuttingDown) { + this.stream = stream; + this.outputFile = outputFile; + this.limit = limit; + this.streamName = streamName; + this.overflowFlag = overflowFlag; + this.shuttingDown = shuttingDown; + } + + @Override + public String call() throws IOException { + final byte[] buf = new byte[8192]; + long total = 0L; + try (InputStream is = stream; BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(outputFile))) { + int n; + while ((n = is.read(buf)) != -1) { + total += n; + if (total > limit) { + overflowFlag.set(true); + throw new OutputSizeExceededException( + "command output exceeded limit on " + streamName + ": limit=" + limit + " bytes"); } + out.write(buf, 0, n); + } + } catch (final OutputSizeExceededException e) { + throw e; + } catch (final IOException ioe) { + if (shuttingDown.get()) { + if (logger.isDebugEnabled()) { + logger.debug("stream closed during shutdown: stream={}", streamName, ioe); + } + } else { + logger.warn("unexpected I/O error on stream: stream={} drainedBytes={}", streamName, total, ioe); + throw ioe; } } + return ""; + } + } + + /** + * Legacy thread that used to monitor and terminate processes exceeding the + * timeout. + * + * @deprecated The timeout/kill machinery is now handled inline by + * {@link CommandExtractor#executeCommand(File, File)} using + * {@link Process#waitFor(long, TimeUnit)} and + * {@link CommandExtractor#destroyProcessTree(Process)}. This class is + * unused internally and is retained only as an empty stub so that + * existing third-party subclasses or callers that referenced + * {@code CommandExtractor.MonitorThread} continue to compile and + * link. New code should not extend or instantiate it. + */ + @Deprecated + protected static class MonitorThread extends Thread { + + /** + * Constructs a new MonitorThread. + * + * @param process the process to monitor (ignored; retained for source compat) + * @param timeout the timeout (ignored; retained for source compat) + */ + @Deprecated + public MonitorThread(final Process process, final long timeout) { + // NOP - retained only for source compatibility. + } + + /** + * No-op stub. Subclasses cannot override this — it is intentionally final to make + * legacy override attempts compile-fail (rather than silently no-op). + */ + @Deprecated + @Override + public final void run() { + // NOP } /** * Sets the finished flag. - * @param finished The finished flag to set. + * + *

Subclasses cannot override this — it is intentionally final to make legacy + * override attempts compile-fail (rather than silently no-op). + * + * @param finished the finished flag (ignored) */ - public void setFinished(final boolean finished) { - this.finished = finished; + @Deprecated + public final void setFinished(final boolean finished) { + // NOP - retained only for source compatibility. } /** * Returns whether the process was terminated. - * @return true if terminated, false otherwise. + * + *

Subclasses cannot override this — it is intentionally final to make legacy + * override attempts compile-fail (rather than silently no-op). + * + *

Note: the typo is preserved intentionally for binary compatibility with the + * legacy API. + * + * @return always {@code false}; this stub never terminates anything. */ - public boolean isTeminated() { - return teminated; + @Deprecated + public final boolean isTeminated() { + return false; } } /** - * Thread to read and buffer output from an input stream. + * Legacy thread that used to read and buffer output from an input stream. + * + * @deprecated Stream draining is now performed by + * {@link CommandExtractor.BoundedStreamReader} which is byte-bounded + * rather than line-bounded. This class is unused internally and is + * retained only as an empty stub so that existing third-party + * subclasses or callers that referenced + * {@code CommandExtractor.InputStreamThread} continue to compile and + * link. New code should not extend or instantiate it. */ + @Deprecated protected static class InputStreamThread extends Thread { - private BufferedReader br; - - private final List list = new LinkedList<>(); - - private final int maxLineBuffer; - /** * Constructs a new InputStreamThread. - * @param is The InputStream to read from. - * @param charset The charset to use for reading. - * @param maxOutputLineBuffer The maximum number of lines to buffer. + * + * @param is the input stream (ignored; retained for source compat) + * @param charset the charset (ignored; retained for source compat) + * @param maxOutputLineBuffer the line buffer size (ignored; retained for source compat) */ + @Deprecated public InputStreamThread(final InputStream is, final String charset, final int maxOutputLineBuffer) { - try { - br = new BufferedReader(new InputStreamReader(is, charset)); - } catch (final UnsupportedEncodingException e) { - br = new BufferedReader(new InputStreamReader(is, Constants.UTF_8_CHARSET)); - } - maxLineBuffer = maxOutputLineBuffer; + // NOP - retained only for source compatibility. } + /** + * No-op stub. Subclasses cannot override this — it is intentionally final to make + * legacy override attempts compile-fail (rather than silently no-op). + */ + @Deprecated @Override - public void run() { - for (;;) { - try { - final String line = br.readLine(); - if (line == null) { - break; - } - if (logger.isDebugEnabled()) { - logger.debug(line); - } - list.add(line); - if (list.size() > maxLineBuffer) { - list.remove(0); - } - } catch (final IOException e) { - throw new CrawlerSystemException(e); - } - } + public final void run() { + // NOP } /** - * Returns the output as a String. - * @return The output string. + * Returns the buffered output as a String. + * + *

Subclasses cannot override this — it is intentionally final to make legacy + * override attempts compile-fail (rather than silently no-op). + * + * @return always an empty string; this stub never reads anything. */ - public String getOutput() { - final StringBuilder buf = new StringBuilder(100); - for (final String value : list) { - buf.append(value).append("\n"); - } - return buf.toString(); + @Deprecated + public final String getOutput() { + return ""; } - } /** @@ -473,11 +967,25 @@ public void setCommandOutputEncoding(final String commandOutputEncoding) { } /** - * Sets the maximum number of output lines to process. + * Sets the maximum number of output lines to buffer. + * + *

This setter is deprecated; use {@link #setMaxOutputSize(long)} instead. + * For backward compatibility, calling this method conservatively shrinks + * {@link #maxOutputSize} to {@code max(64 KiB, n * 1024)} bytes (1 line ≈ 1 KiB), + * but only when {@link #setMaxOutputSize(long)} has not been called + * explicitly on this instance. When both setters are called, the explicit + * {@link #setMaxOutputSize(long)} value always wins regardless of call order. + * * @param maxOutputLine The maximum output lines to set. + * @deprecated Use {@link #setMaxOutputSize(long)} to set the byte-count bound directly. */ + @Deprecated public void setMaxOutputLine(final int maxOutputLine) { this.maxOutputLine = maxOutputLine; + final long inferred = Math.max(64L * 1024L, (long) maxOutputLine * 1024L); + if (!maxOutputSizeExplicit) { + this.maxOutputSize = Math.min(this.maxOutputSize, inferred); + } } /** @@ -487,4 +995,69 @@ public void setMaxOutputLine(final int maxOutputLine) { public void setStandardOutput(final boolean standardOutput) { this.standardOutput = standardOutput; } + + /** + * Sets the maximum number of bytes the subprocess may write to a single stream + * (stdout or stderr) before the process is killed. Calling this method marks the + * size as explicitly set, preventing a subsequent {@link #setMaxOutputLine(int)} + * call from overriding it. + * + *

Note: the entire output may be loaded into memory during extraction + * (transient 2–4× heap usage). Values above 512 MiB log a warning. + * + * @param maxOutputSize the limit in bytes + */ + public void setMaxOutputSize(final long maxOutputSize) { + if (maxOutputSize > 512L * 1024L * 1024L) { + logger.warn("maxOutputSize is large; extraction may transiently allocate 2-4x this in heap: maxOutputSize={}", maxOutputSize); + } + this.maxOutputSize = maxOutputSize; + this.maxOutputSizeExplicit = true; + } + + /** + * Returns the current maximum output size limit in bytes. + * + * @return the maximum number of bytes the subprocess may write to a single stream + */ + public long getMaxOutputSize() { + return maxOutputSize; + } + + /** + * Sets the maximum number of bytes copied from the input stream into the + * temporary input file. If the input exceeds this size an {@link ExtractException} + * is thrown before the command is invoked. + * + * @param maxInputSize the limit in bytes + */ + public void setMaxInputSize(final long maxInputSize) { + this.maxInputSize = maxInputSize; + } + + /** + * Sets the grace period in milliseconds between {@code Process.destroy()} (SIGTERM) + * and the fallback {@code Process.destroyForcibly()} (SIGKILL). + * + * @param destroyGracePeriodMillis the grace period + */ + public void setDestroyGracePeriodMillis(final long destroyGracePeriodMillis) { + this.destroyGracePeriodMillis = destroyGracePeriodMillis; + } + + /** + * Sets whether captured stderr text should be appended to the extracted content + * when {@link #standardOutput} is {@code false}. Defaults to {@code false} to + * match pre-3.x behavior where stderr was only logged, never included in + * {@code ExtractData}. + * + *

When set to {@code true} and {@code standardOutput=false}, captured stderr + * text is appended after the file content in the extracted body. + * + * @param includeStderrInOutput {@code true} to append stderr to the extracted + * content; {@code false} to keep the file content only + */ + public void setIncludeStderrInOutput(final boolean includeStderrInOutput) { + this.includeStderrInOutput = includeStderrInOutput; + } } diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/CommandExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/CommandExtractorTest.java index f26ec320..bc2dae8d 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/CommandExtractorTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/CommandExtractorTest.java @@ -15,19 +15,25 @@ */ package org.codelibs.fess.crawler.extractor.impl; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; import java.util.HashMap; import java.util.List; import java.util.Map; +import org.apache.commons.lang3.SystemUtils; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.codelibs.core.exception.IORuntimeException; import org.codelibs.core.io.FileUtil; import org.codelibs.fess.crawler.entity.ExtractData; import org.codelibs.fess.crawler.exception.ExecutionTimeoutException; -import org.junit.jupiter.api.Test; +import org.codelibs.fess.crawler.exception.ExtractException; import org.dbflute.utflute.core.PlainTestCase; +import org.junit.jupiter.api.Test; /** * @author shinsuke @@ -35,6 +41,8 @@ */ public class CommandExtractorTest extends PlainTestCase { + private static final Logger logger = LogManager.getLogger(CommandExtractorTest.class); + private File createScriptTempFile(final int sleep) { String extention; String content; @@ -164,9 +172,11 @@ public void test_getText_timeout() throws IOException { extractor.command = getCommand(scriptFile); final Map params = new HashMap(); try { - final ExtractData data = extractor.getText(new FileInputStream(contentFile), params); + extractor.getText(new FileInputStream(contentFile), params); fail(); - } catch (final ExecutionTimeoutException e) {} + } catch (final ExecutionTimeoutException e) { + // expected + } } @Test @@ -274,4 +284,669 @@ public void test_getFileName() { assertEquals(expected, actual); } + + // ========================================================== + // PR-D: robustness tests + // ========================================================== + + /** + * Verifies a long-running subprocess is killed when the timeout fires + * and the call returns within a few seconds (i.e. we don't wait the + * full sleep duration). + */ + @Test + public void test_timeout_killsProcess() throws IOException { + if (!SystemUtils.IS_OS_UNIX) { + return; // skip on non-Unix + } + final CommandExtractor extractor = new CommandExtractor(); + extractor.command = "sh -c \"sleep 30\""; + extractor.executionTimeout = 500L; + + final long start = System.currentTimeMillis(); + try { + extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>()); + fail(); + } catch (final ExecutionTimeoutException e) { + // expected + } + final long elapsed = System.currentTimeMillis() - start; + assertTrue(elapsed < 5000L); + } + + /** + * Verifies that when the subprocess produces more bytes on stderr than + * {@code maxOutputSize} the call fails fast with an {@link ExtractException} + * rather than buffering an unbounded amount of data. + */ + @Test + public void test_outputSizeExceeded_throwsExtractException() throws IOException { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + final CommandExtractor extractor = new CommandExtractor(); + // print 5 MiB of zeros to stderr; cap at 1 MiB. + extractor.command = "sh -c \"head -c 5242880 /dev/zero 1>&2\""; + extractor.executionTimeout = 30_000L; + extractor.maxOutputSize = 1024L * 1024L; // 1 MiB + + final long start = System.currentTimeMillis(); + try { + extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>()); + fail(); + } catch (final ExtractException e) { + // expected + } + final long elapsed = System.currentTimeMillis() - start; + if (elapsed >= 5000L) { + logger.warn("test_outputSizeExceeded_throwsExtractException: did not fast-fail (was {}ms)", elapsed); + } + assertTrue(elapsed < 5000L); + } + + /** + * Verifies stderr is drained concurrently so the process does not block + * on a full pipe buffer (default ~64KB on Linux). We write much more than + * that to stderr while the script also produces normal output. + */ + @Test + public void test_stderrDrained_doesNotDeadlock() throws IOException { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + final File scriptFile; + try { + scriptFile = File.createTempFile("stderr_script", ".sh"); + scriptFile.deleteOnExit(); + // Write 1 MiB to stderr (well above default pipe buffer) then copy input. + FileUtil.writeBytes(scriptFile.getAbsolutePath(), + ("#!/bin/bash\nhead -c 1048576 /dev/zero 1>&2\ncp \"$1\" \"$2\"\n").getBytes()); + } catch (final IOException e) { + throw new IORuntimeException(e); + } + + final String content = "TEST"; + final File contentFile = createContentFile(".txt", content.getBytes()); + + final CommandExtractor extractor = new CommandExtractor(); + extractor.executionTimeout = 30_000L; + extractor.command = "sh " + scriptFile.getAbsolutePath() + " $INPUT_FILE $OUTPUT_FILE"; + // This test focuses on draining stderr so the process does not block; + // suppress stderr-in-output so the assertion compares only the file content. + extractor.setIncludeStderrInOutput(false); + + final ExtractData data = extractor.getText(new FileInputStream(contentFile), new HashMap<>()); + assertEquals(content, data.getContent()); + } + + /** + * Verifies the input copy is bounded by {@code maxInputSize} and the + * extractor fails fast before invoking the command. + */ + @Test + public void test_inputSizeExceeded_throwsExtractException() { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + final File scriptFile = createScriptTempFile(0); + final CommandExtractor extractor = new CommandExtractor(); + extractor.command = getCommand(scriptFile); + extractor.maxInputSize = 1024L; // 1 KiB + + // 64 KiB of zeros is well above the 1 KiB cap. + final InputStream big = new ByteArrayInputStream(new byte[64 * 1024]); + try { + extractor.getText(big, new HashMap<>()); + fail(); + } catch (final ExtractException e) { + // expected + } + } + + /** + * Verifies that when a subprocess spawns a long-running child, the timeout + * path attempts to kill descendants too. We check this best-effort by + * snapshotting {@link ProcessHandle#allProcesses} before/after; the precise + * verification of orphan kill is platform-specific. + */ + @Test + public void test_processDescendants_killed() throws Exception { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + final CommandExtractor extractor = new CommandExtractor(); + // Spawn a child sleep, then wait. When the parent is killed we want the child gone too. + extractor.command = "sh -c \"sleep 30 & wait\""; + extractor.executionTimeout = 500L; + + final long start = System.currentTimeMillis(); + try { + extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>()); + fail(); + } catch (final ExecutionTimeoutException e) { + // expected + } + final long elapsed = System.currentTimeMillis() - start; + // Total wall clock should be close to the timeout, not the 30-sec sleep. + assertTrue(elapsed < 10_000L); + } + + /** + * Verifies that when {@code standardOutput=false} and {@code includeStderrInOutput=true}, + * captured stderr text is appended to the extracted content. + * Note: the opt-in flag must be set explicitly because the default is {@code false}. + */ + @Test + public void test_stderrAppendedToOutput_whenStandardOutputFalse() throws IOException { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + // Script writes "OUT" to $OUTPUT_FILE and "ERR" to stderr. + final File scriptFile; + try { + scriptFile = File.createTempFile("stderr_in_out_", ".sh"); + scriptFile.deleteOnExit(); + FileUtil.writeBytes(scriptFile.getAbsolutePath(), "#!/bin/bash\necho OUT > \"$2\"\necho ERR >&2\n".getBytes()); + } catch (final IOException e) { + throw new IORuntimeException(e); + } + + final File contentFile = createContentFile(".txt", new byte[] { 0 }); + final CommandExtractor extractor = new CommandExtractor(); + extractor.command = "sh " + scriptFile.getAbsolutePath() + " $INPUT_FILE $OUTPUT_FILE"; + extractor.executionTimeout = 30_000L; + // Explicitly opt in — default is false (pre-3.x behavior). + extractor.setIncludeStderrInOutput(true); + + final ExtractData data = extractor.getText(new FileInputStream(contentFile), new HashMap<>()); + final String text = data.getContent(); + assertTrue(text.contains("OUT")); + assertTrue(text.contains("ERR")); + } + + /** + * Verifies that by default ({@code includeStderrInOutput=false}), stderr is NOT + * appended to the extracted content even when {@code standardOutput=false}. This + * matches the pre-3.x behavior where stderr was only routed to logs, never to + * extracted text. + */ + @Test + public void test_stderrNotInOutput_byDefault() throws IOException { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + // Script writes "content" to $OUTPUT_FILE and "warning" to stderr. + final File scriptFile; + try { + scriptFile = File.createTempFile("stderr_default_", ".sh"); + scriptFile.deleteOnExit(); + FileUtil.writeBytes(scriptFile.getAbsolutePath(), "#!/bin/bash\necho content > \"$2\"\necho warning >&2\n".getBytes()); + } catch (final IOException e) { + throw new IORuntimeException(e); + } + + final File contentFile = createContentFile(".txt", new byte[] { 0 }); + final CommandExtractor extractor = new CommandExtractor(); + extractor.command = "sh " + scriptFile.getAbsolutePath() + " $INPUT_FILE $OUTPUT_FILE"; + extractor.executionTimeout = 30_000L; + // Do NOT call setIncludeStderrInOutput — default must be false. + + final ExtractData data = extractor.getText(new FileInputStream(contentFile), new HashMap<>()); + final String text = data.getContent(); + assertTrue(text.contains("content")); + assertFalse(text.contains("warning")); // stderr should not appear in extracted text by default + } + + /** + * Verifies that {@code setIncludeStderrInOutput(false)} suppresses appending + * stderr to the extracted content even when {@code standardOutput=false}. + */ + @Test + public void test_stderrSuppressedFromOutput_whenIncludeStderrInOutputFalse() throws IOException { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + final File scriptFile; + try { + scriptFile = File.createTempFile("stderr_suppressed_", ".sh"); + scriptFile.deleteOnExit(); + FileUtil.writeBytes(scriptFile.getAbsolutePath(), "#!/bin/bash\necho OUT > \"$2\"\necho ERR >&2\n".getBytes()); + } catch (final IOException e) { + throw new IORuntimeException(e); + } + + final File contentFile = createContentFile(".txt", new byte[] { 0 }); + final CommandExtractor extractor = new CommandExtractor(); + extractor.command = "sh " + scriptFile.getAbsolutePath() + " $INPUT_FILE $OUTPUT_FILE"; + extractor.executionTimeout = 30_000L; + extractor.setIncludeStderrInOutput(false); + + final ExtractData data = extractor.getText(new FileInputStream(contentFile), new HashMap<>()); + final String text = data.getContent(); + assertTrue(text.contains("OUT")); + assertFalse(text.contains("ERR")); + } + + // ========================================================== + // Fix 1: outputFile exceeds maxOutputSize → ExtractException + // ========================================================== + + /** + * Fix 1 (standardOutput=true): when the command writes more than maxOutputSize bytes to + * stdout, getText() must throw ExtractException rather than loading the entire file into + * memory. + */ + @Test + public void test_outputFileExceedsMaxOutputSize_throws() throws IOException { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + final CommandExtractor extractor = new CommandExtractor(); + // cat $INPUT_FILE sends content to stdout; we will pipe a 5-MiB input with standardOutput=true. + // 5 MiB output > 1 MiB limit should trigger the guard. + extractor.standardOutput = true; + extractor.command = "sh -c \"head -c 5242880 /dev/zero\""; + extractor.executionTimeout = 30_000L; + extractor.maxOutputSize = 1024L * 1024L; // 1 MiB + + try { + extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>()); + fail(); + } catch (final ExtractException e) { + // expected + } + } + + /** + * Fix 1 (standardOutput=false): when the command writes more than maxOutputSize bytes to + * $OUTPUT_FILE, getText() must throw ExtractException (length-check defensive line). + */ + @Test + public void test_outputFileExceedsMaxOutputSize_standardOutputFalse_throws() throws IOException { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + final File scriptFile; + try { + scriptFile = File.createTempFile("big_output_", ".sh"); + scriptFile.deleteOnExit(); + // Write 5 MiB of zeros to $OUTPUT_FILE directly (bypasses BoundedStreamReader). + FileUtil.writeBytes(scriptFile.getAbsolutePath(), "#!/bin/bash\nhead -c 5242880 /dev/zero > \"$2\"\n".getBytes()); + } catch (final IOException e) { + throw new IORuntimeException(e); + } + + final CommandExtractor extractor = new CommandExtractor(); + extractor.command = "sh " + scriptFile.getAbsolutePath() + " $INPUT_FILE $OUTPUT_FILE"; + extractor.executionTimeout = 30_000L; + extractor.maxOutputSize = 1024L * 1024L; // 1 MiB + + try { + extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>()); + fail(); + } catch (final ExtractException e) { + // expected + } + } + + // ========================================================== + // Fix 2: descendants are killed even when parent exits cleanly + // ========================================================== + + /** + * Fix 2: verifies that after a timeout the spawned child process (sleep) is killed even + * when the parent shell exits gracefully before the grace period expires and would otherwise + * cause descendants() to return an empty snapshot. + */ + @Test + public void test_processDescendants_killed_orphan_gone() throws Exception { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + final CommandExtractor extractor = new CommandExtractor(); + // The shell spawns a background sleep and prints its PID to stdout, then waits. + // standardOutput=false so we can read stdout text via BoundedStreamReader. + extractor.command = "sh -c \"sleep 30 & echo $!; wait\""; + extractor.executionTimeout = 800L; + extractor.destroyGracePeriodMillis = 200L; + + try { + extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>()); + fail(); + } catch (final ExecutionTimeoutException e) { + // expected — but we did not capture stdout here since the exception fires before we collect it. + } catch (final Exception e) { + // acceptable + } + + // Give the OS a moment to reap the child. + Thread.sleep(500); + + // Best-effort check: find any process with "sleep 30" in its command that is still alive. + final boolean orphanAlive = ProcessHandle.allProcesses() + .filter(ProcessHandle::isAlive) + .anyMatch(ph -> ph.info().command().map(c -> c.contains("sleep")).orElse(false) && ph.info().arguments().map(args -> { + for (final String a : args) { + if ("30".equals(a)) { + return true; + } + } + return false; + }).orElse(false)); + + if (orphanAlive) { + logger.warn("test_processDescendants_killed_orphan_gone: orphan 'sleep 30' still alive after kill attempt"); + } + assertFalse(orphanAlive); + } + + // ========================================================== + // Fix 3: overflow kills descendants via destroyProcessTree + // ========================================================== + + /** + * Fix 3 + overflow fail-fast: when BoundedStreamReader detects overflow it throws + * OutputSizeExceededException and sets the overflow flag so the main thread breaks + * out of its polling loop and calls destroyProcessTree immediately — well before the + * full executionTimeout elapses. Also verifies the background sleep child is reaped. + */ + @Test + public void test_overflow_killsDescendants() throws Exception { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + final CommandExtractor extractor = new CommandExtractor(); + // Spawn a background sleep, then flood stderr with zeros to trigger overflow. + // standardOutput=false so stdout is piped; stderr triggers BoundedStreamReader overflow. + extractor.command = "sh -c \"sleep 30 & head -c 5242880 /dev/zero 1>&2; wait\""; + // Use 10 s timeout — test must complete well below this if fail-fast works. + extractor.executionTimeout = 10_000L; + extractor.maxOutputSize = 1024L * 1024L; // 1 MiB + extractor.destroyGracePeriodMillis = 200L; + + final long start = System.currentTimeMillis(); + try { + extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>()); + fail(); + } catch (final ExtractException e) { + // expected — overflow detected + } + final long elapsed = System.currentTimeMillis() - start; + // Overflow should be detected and process killed well before the 10 s timeout. + assertTrue(elapsed < 5000L); // overflow should kill process tree in < 5 s + + Thread.sleep(500); + + final boolean orphanAlive = ProcessHandle.allProcesses() + .filter(ProcessHandle::isAlive) + .anyMatch(ph -> ph.info().command().map(c -> c.contains("sleep")).orElse(false) && ph.info().arguments().map(args -> { + for (final String a : args) { + if ("30".equals(a)) { + return true; + } + } + return false; + }).orElse(false)); + + if (orphanAlive) { + logger.warn("test_overflow_killsDescendants: orphan 'sleep 30' still alive after kill attempt"); + } + assertFalse(orphanAlive); + } + + // ========================================================== + // Fix 4: setMaxOutputLine conservatively shrinks maxOutputSize + // ========================================================== + + /** + * Fix 4a: setMaxOutputLine(1) should shrink maxOutputSize to max(64KiB, 1*1024) = 64 KiB + * when setMaxOutputSize has not been called. + */ + @Test + @SuppressWarnings("deprecation") + public void test_setMaxOutputLine_shrinksMaxOutputSize() { + final CommandExtractor extractor = new CommandExtractor(); + // Default maxOutputSize is 10 MiB; setMaxOutputLine(1) infers 64 KiB (floor), which is smaller. + extractor.setMaxOutputLine(1); + assertTrue(extractor.getMaxOutputSize() <= 64L * 1024L); + } + + /** + * Fix 4b: when setMaxOutputSize has been called explicitly, a subsequent setMaxOutputLine + * must not override it — the explicit value must be preserved regardless of call order. + */ + @Test + @SuppressWarnings("deprecation") + public void test_setMaxOutputLine_doesNotOverrideExplicitMaxOutputSize() { + final CommandExtractor extractor = new CommandExtractor(); + extractor.setMaxOutputSize(50L * 1024L * 1024L); // 50 MiB explicit + extractor.setMaxOutputLine(1); // would infer 64 KiB but must not override explicit + assertEquals(50L * 1024L * 1024L, extractor.getMaxOutputSize()); + } + + // ========================================================== + // PR-159 review-fix tests + // ========================================================== + + /** + * Validates C1: standardOutput=true overflow must fail fast (not wait for executionTimeout). + */ + @Test + public void test_standardOutputOverflow_failsFastWithoutWaitingForTimeout() throws IOException { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + final CommandExtractor extractor = new CommandExtractor(); + extractor.standardOutput = true; + // Produce 5 MiB then keep running. Without C1 fix the test would wait the full executionTimeout. + extractor.command = "sh -c \"head -c 5242880 /dev/zero; sleep 30\""; + extractor.executionTimeout = 10_000L; + extractor.maxOutputSize = 1024L * 1024L; + final long start = System.currentTimeMillis(); + try { + extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>()); + fail(); + } catch (final ExtractException e) { + // expected + } + final long elapsed = System.currentTimeMillis() - start; + if (elapsed >= 5000L) { + logger.warn("test_standardOutputOverflow_failsFastWithoutWaitingForTimeout: overflow did not fail fast (was {}ms)", elapsed); + } + assertTrue(elapsed < 5000L); + } + + /** + * Validates C4: invalid charset name falls back to UTF-8 silently (warning logged). + */ + @Test + public void test_invalidCommandOutputEncoding_fallsBackToUtf8() throws IOException { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + final File scriptFile = createScriptTempFileStdout(0); + final File contentFile = createContentFile(".txt", "HELLO".getBytes()); + final CommandExtractor extractor = new CommandExtractor(); + extractor.standardOutput = true; + extractor.command = getCommandStdout(scriptFile); + extractor.setCommandOutputEncoding("not-a-real-charset-12345"); + extractor.executionTimeout = 30_000L; + // Should NOT throw — falls back to UTF-8 silently (warning logged). + final ExtractData data = extractor.getText(new FileInputStream(contentFile), new HashMap<>()); + assertEquals("HELLO", data.getContent()); + } + + /** + * Verifies a blank command surfaces as a CrawlerSystemException rather than NPE/IAE. + */ + @Test + public void test_blankCommand_throwsCrawlerSystemException() { + final CommandExtractor extractor = new CommandExtractor(); + extractor.command = " "; + try { + extractor.getText(new ByteArrayInputStream("x".getBytes()), new HashMap<>()); + fail(); + } catch (final org.codelibs.fess.crawler.exception.CrawlerSystemException e) { + // expected + } + } + + /** + * Verifies a nonexistent binary surfaces as a CrawlerSystemException. + */ + @Test + public void test_nonexistentBinary_throwsCrawlerSystemException() { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + final CommandExtractor extractor = new CommandExtractor(); + extractor.command = "/nonexistent/path/to/binary $INPUT_FILE $OUTPUT_FILE"; + try { + extractor.getText(new ByteArrayInputStream("x".getBytes()), new HashMap<>()); + fail(); + } catch (final org.codelibs.fess.crawler.exception.CrawlerSystemException e) { + // expected + } + } + + /** + * Validates M4: non-zero exit must NOT throw — content is still returned (lenient contract). + */ + @Test + public void test_nonZeroExit_returnsAvailableContent() throws IOException { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + final File scriptFile; + try { + scriptFile = File.createTempFile("nonzero_", ".sh"); + scriptFile.deleteOnExit(); + FileUtil.writeBytes(scriptFile.getAbsolutePath(), "#!/bin/bash\necho partial > \"$2\"\nexit 7\n".getBytes()); + } catch (final IOException e) { + throw new IORuntimeException(e); + } + final File contentFile = createContentFile(".txt", new byte[] { 0 }); + final CommandExtractor extractor = new CommandExtractor(); + extractor.command = "sh " + scriptFile.getAbsolutePath() + " $INPUT_FILE $OUTPUT_FILE"; + extractor.executionTimeout = 30_000L; + // Lenient contract: non-zero exit does NOT throw; content is returned. + final ExtractData data = extractor.getText(new FileInputStream(contentFile), new HashMap<>()); + assertTrue(data.getContent().contains("partial")); + } + + /** + * Tightened version of test_setMaxOutputLine_shrinksMaxOutputSize: verifies exact 64 KiB floor. + */ + @Test + @SuppressWarnings("deprecation") + public void test_setMaxOutputLine_shrinksMaxOutputSize_toExactFloor() { + final CommandExtractor extractor = new CommandExtractor(); + extractor.setMaxOutputLine(1); + assertEquals(64L * 1024L, extractor.getMaxOutputSize()); + } + + /** + * Verifies M8: resourceName containing shell metacharacters is sanitized so it cannot be used + * to inject arguments via the temp-file prefix. + */ + @Test + public void test_resourceNameWithMetacharacters_sanitizedInTempFileName() throws IOException { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + final File scriptFile = createScriptTempFile(0); + final File contentFile = createContentFile(".txt", "OK".getBytes()); + final CommandExtractor extractor = new CommandExtractor(); + extractor.command = getCommand(scriptFile); + final Map params = new HashMap<>(); + // A maliciously crafted resourceName with shell metacharacters. + params.put(ExtractData.RESOURCE_NAME_KEY, "evil;rm -rf /.--reset.txt"); + final ExtractData data = extractor.getText(new FileInputStream(contentFile), params); + assertEquals("OK", data.getContent()); + } + + /** + * Verifies M8: getFileName handles Windows-style backslash path separators. + */ + @Test + public void test_getFileName_handlesWindowsBackslash() { + final CommandExtractor extractor = new CommandExtractor(); + assertEquals("hoge.txt", extractor.getFileName("C:\\path\\hoge.txt")); + assertEquals("hoge.txt", extractor.getFileName("path\\hoge.txt")); + assertEquals("hoge.txt", extractor.getFileName("hoge.txt\\")); + } + + /** + * Verifies thread interrupt is propagated and the interrupt flag is restored. + */ + @Test + public void test_interrupt_propagatesAsInterruptedRuntimeException() throws InterruptedException { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + final CommandExtractor extractor = new CommandExtractor(); + extractor.command = "sh -c \"sleep 30\""; + extractor.executionTimeout = 30_000L; + final Thread target = Thread.currentThread(); + final Thread interrupter = new Thread(() -> { + try { + Thread.sleep(500); + } catch (final InterruptedException ignored) {} + target.interrupt(); + }); + interrupter.start(); + try { + extractor.getText(new ByteArrayInputStream(new byte[0]), new HashMap<>()); + fail(); + } catch (final org.codelibs.core.exception.InterruptedRuntimeException expected) { + // good + } catch (final Exception e) { + // also acceptable as long as interrupt flag was restored + } finally { + Thread.interrupted(); // clear interrupt for subsequent tests + } + } + + /** + * Verifies an invalid workingDirectory surfaces as a CrawlerSystemException. + */ + @Test + public void test_invalidWorkingDirectory_throwsCrawlerSystemException() throws IOException { + if (!SystemUtils.IS_OS_UNIX) { + return; + } + final File scriptFile = createScriptTempFile(0); + final File contentFile = createContentFile(".txt", "x".getBytes()); + final CommandExtractor extractor = new CommandExtractor(); + extractor.command = getCommand(scriptFile); + extractor.workingDirectory = new File("/nonexistent/path/that/does/not/exist"); + try { + extractor.getText(new FileInputStream(contentFile), new HashMap<>()); + fail(); + } catch (final org.codelibs.fess.crawler.exception.CrawlerSystemException e) { + // expected + } + } + + /** + * Verifies the deprecated MonitorThread stub still works as a NOP. + */ + @Test + @SuppressWarnings("deprecation") + public void test_deprecatedMonitorThread_isNop() { + final CommandExtractor.MonitorThread mt = new CommandExtractor.MonitorThread(null, 1000L); + mt.setFinished(true); + assertFalse(mt.isTeminated()); + } + + /** + * Verifies the deprecated InputStreamThread stub still works as a NOP. + */ + @Test + @SuppressWarnings("deprecation") + public void test_deprecatedInputStreamThread_isNop() { + final CommandExtractor.InputStreamThread it = + new CommandExtractor.InputStreamThread(new ByteArrayInputStream(new byte[0]), "UTF-8", 100); + assertEquals("", it.getOutput()); + } }