codelibs · marevol · Jul 5, 2026 · May 4, 2026 · May 16, 2026 · Jul 5, 2026
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java
@@ -15,10 +15,12 @@
  */
 package org.codelibs.fess.crawler.extractor.impl;
 
+import java.io.BufferedReader;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
+import java.io.Reader;
 import java.nio.file.InvalidPathException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
@@ -27,6 +29,8 @@
 import java.util.Map;
 
 import org.apache.commons.io.IOUtils;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
 import org.codelibs.fess.crawler.container.CrawlerContainer;
 import org.codelibs.fess.crawler.exception.CrawlerSystemException;
 import org.codelibs.fess.crawler.exception.MaxLengthExceededException;
@@ -56,6 +60,12 @@
  */
 public abstract class AbstractExtractor implements Extractor {
 
+    /** Logger instance for this class. */
+    private static final Logger logger = LogManager.getLogger(AbstractExtractor.class);
+
+    /** Default read buffer size in characters when streaming reader content. */
+    protected static final int READ_BUFFER_SIZE = 8192;
+
     /**
      * Parameter key used to track the recursion depth across nested archive
      * extraction. Callers/recursive extractor invocations may set this to
@@ -235,6 +245,70 @@ protected void validateInputStream(final InputStream in) {
         }
     }
 
+    /**
+     * Holder for the result of {@link #readWithLimit(Reader, long)}.
+     */
+    protected static final class TextReadResult {
+        /** The decoded content (possibly truncated). */
+        public final String content;
+        /** Whether the content was truncated at the configured limit. */
+        public final boolean truncated;
+
+        /**
+         * Creates a new result holder.
+         * @param content the decoded content
+         * @param truncated whether truncation occurred
+         */
+        TextReadResult(final String content, final boolean truncated) {
+            this.content = content;
+            this.truncated = truncated;
+        }
+    }
+
+    /**
+     * Reads characters from the supplied reader into a string, bounding the number
+     * of characters by {@code maxTextLength}. When the limit is reached the read
+     * stops early, a WARN-level message is logged, and the result is flagged as
+     * truncated. A {@code maxTextLength} less than or equal to zero disables the
+     * limit. At the truncation boundary a trailing unpaired high surrogate is
+     * dropped so that truncation itself never splits a surrogate pair.
+     *
+     * <p>The supplied reader is not closed by this method (the caller retains
+     * ownership), but it may already have been read past the truncation point.
+     *
+     * @param reader the reader to consume
+     * @param maxTextLength the maximum number of characters ({@code char} units) to read
+     * @return a {@link TextReadResult} with the decoded content and the truncation flag
+     * @throws IOException if reading fails
+     */
+    protected TextReadResult readWithLimit(final Reader reader, final long maxTextLength) throws IOException {
+        final BufferedReader br = reader instanceof BufferedReader ? (BufferedReader) reader : new BufferedReader(reader);
+        final StringBuilder sb = new StringBuilder();
+        final char[] buf = new char[READ_BUFFER_SIZE];
+        long total = 0;
+        boolean truncated = false;
+        int n;
+        while ((n = br.read(buf)) >= 0) {
+            if (maxTextLength > 0 && total + n > maxTextLength) {
+                final int remaining = (int) (maxTextLength - total);
+                if (remaining > 0) {
+                    sb.append(buf, 0, remaining);
+                }
+                // Avoid leaving an unpaired high surrogate at the end.
+                if (sb.length() > 0 && Character.isHighSurrogate(sb.charAt(sb.length() - 1))) {
+                    sb.setLength(sb.length() - 1);
+                }
+                logger.warn("Extracted content truncated: extractor={} maxTextLength={} totalChars={}", getClass().getSimpleName(),
+                        maxTextLength, total + n);
+                truncated = true;
+                break;
+            }
+            sb.append(buf, 0, n);
+            total += n;
+        }
+        return new TextReadResult(sb.toString(), truncated);
+    }
+
     /**
      * Returns true when the supplied entry name escapes the conceptual
      * extraction root via path-traversal segments. The check is performed on

diff --git a/...-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractXmlExtractor.java b/...-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractXmlExtractor.java
@@ -18,6 +18,8 @@
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
 import java.nio.charset.Charset;
 import java.util.Map;
 import java.util.regex.Matcher;
@@ -32,7 +34,6 @@
 import org.apache.commons.text.translate.NumericEntityUnescaper;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
-import org.codelibs.core.io.InputStreamUtil;
 import org.codelibs.core.lang.StringUtil;
 import org.codelibs.fess.crawler.Constants;
 import org.codelibs.fess.crawler.entity.ExtractData;
@@ -44,6 +45,14 @@
  * Provides common functionality for extracting text content from XML-like documents.
  * It handles encoding detection, HTML entity unescaping, and tag-based content extraction.
  *
+ * <p>Features:
+ * <ul>
+ *   <li>BOM detection for UTF-8, UTF-16 LE/BE, UTF-32 LE/BE, and UTF-7</li>
+ *   <li>Reader-based decoding that avoids buffering the raw bytes of the whole document</li>
+ *   <li>Optional {@code maxTextLength} cap to bound heap usage on very large inputs</li>
+ *   <li>HTML entity unescaping</li>
+ *   <li>Configurable comment-tag suppression</li>
+ * </ul>
  */
 public abstract class AbstractXmlExtractor extends AbstractExtractor {
 
@@ -54,6 +63,8 @@ public abstract class AbstractXmlExtractor extends AbstractExtractor {
 
     /**
      * UTF-7 Byte Order Mark definition.
+     * Note: 'UTF-7' is not always provided by the JVM. Reading content declared as UTF-7
+     * may fail with {@code UnsupportedEncodingException}.
      */
     protected static final ByteOrderMark BOM_UTF_7 = new ByteOrderMark("UTF-7", 0x2B, 0x2F, 0x76);
 
@@ -79,6 +90,17 @@ public abstract class AbstractXmlExtractor extends AbstractExtractor {
      */
     protected boolean ignoreCommentTag = false;
 
+    /**
+     * Maximum number of characters to read from the input. The default is
+     * {@link Long#MAX_VALUE}, which is effectively unlimited. Values less than
+     * or equal to zero explicitly disable the limit.
+     *
+     * <p>The limit is measured in Java {@code char} units (UTF-16 code units).
+     * At the truncation boundary, an unpaired high surrogate is dropped to avoid
+     * leaving an invalid string.
+     */
+    protected long maxTextLength = Long.MAX_VALUE;
+
     /**
      * Constructs a new AbstractXmlExtractor.
      */
@@ -98,6 +120,24 @@ public AbstractXmlExtractor() {
      */
     protected abstract Pattern getTagPattern();
 
+    /**
+     * Extracts text from the supplied XML input stream.
+     *
+     * <p>This method detects the character encoding via a leading BOM (UTF-8,
+     * UTF-16 LE/BE, UTF-32 LE/BE, UTF-7) or from the XML declaration, then
+     * streams the content through a {@code BufferedReader}. The raw character
+     * count is bounded by {@link #maxTextLength} before tag-stripping is applied.
+     * When truncation occurs, a WARN-level log message is emitted and the returned
+     * {@link ExtractData} carries {@code truncated=true} and
+     * {@code maxTextLength=<value>} metadata entries. The supplied {@code in} is
+     * closed by this method.
+     *
+     * @param in the XML input stream; must not be {@code null}
+     * @param params optional extraction parameters (may be {@code null})
+     * @return the extracted text and optional truncation metadata
+     * @throws CrawlerSystemException if {@code in} is {@code null}
+     * @throws ExtractException if reading or decoding fails
+     */
     @Override
     public ExtractData getText(final InputStream in, final Map<String, String> params) {
         if (in == null) {
@@ -106,13 +146,43 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
         try {
             final BufferedInputStream bis = new BufferedInputStream(in);
             final String enc = getEncoding(bis);
-            final String content = UNESCAPE_HTML4.translate(new String(InputStreamUtil.getBytes(bis), enc));
-            return createExtractData(content);
+            // Strip any BOM bytes from the actual stream (the encoding lookup above
+            // resets the underlying buffer, so the BOM is still present here).
+            try (BOMInputStream bomStripped = BOMInputStream.builder()
+                    .setInputStream(bis)
+                    .setInclude(false)
+                    .setByteOrderMarks(ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE,
+                            ByteOrderMark.UTF_32BE, BOM_UTF_7)
+                    .get()) {
+                final TextReadResult result = readAsString(bomStripped, enc);
+                final String content = UNESCAPE_HTML4.translate(result.content);
+                final ExtractData extractData = createExtractData(content);
+                if (result.truncated) {
+                    extractData.putValue("truncated", "true");
+                    extractData.putValue("maxTextLength", Long.toString(maxTextLength));
+                }
+                return extractData;
+            }
         } catch (final Exception e) {
             throw new ExtractException(e);
         }
     }
 
+    /**
+     * Streams the supplied input into a string using the given charset. The
+     * total number of characters appended is bounded by {@link #maxTextLength}.
+     *
+     * @param in the input stream
+     * @param charset the charset name
+     * @return a {@link TextReadResult} containing the decoded content and whether truncation occurred
+     * @throws IOException if reading fails
+     */
+    protected TextReadResult readAsString(final InputStream in, final String charset) throws IOException {
+        try (Reader reader = new InputStreamReader(in, charset)) {
+            return readWithLimit(reader, maxTextLength);
+        }
+    }
+
     /**
      * Creates an ExtractData object from the extracted content.
      * @param content The extracted content.
@@ -131,9 +201,15 @@ protected String getEncoding(final BufferedInputStream bis) {
         final byte[] b = new byte[preloadSizeForCharset];
         try {
             bis.mark(preloadSizeForCharset);
+            // The wrapping BOMInputStream is intentionally not closed here so the
+            // underlying buffered stream can be reset (see finally) and reused by getText.
             @SuppressWarnings("resource")
-            final BOMInputStream bomIn = new BOMInputStream(bis, false, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE,
-                    ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE, BOM_UTF_7);
+            final BOMInputStream bomIn = BOMInputStream.builder()
+                    .setInputStream(bis)
+                    .setInclude(false)
+                    .setByteOrderMarks(ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
+                            ByteOrderMark.UTF_32LE, BOM_UTF_7)
+                    .get();
             if (bomIn.hasBOM()) {
                 if (logger.isDebugEnabled()) {
                     logger.debug("BOM: {}", bomIn.getBOMCharsetName());
@@ -248,4 +324,29 @@ public void setIgnoreCommentTag(final boolean ignoreCommentTag) {
         this.ignoreCommentTag = ignoreCommentTag;
     }
 
+    /**
+     * Returns the maximum number of characters that will be read from the
+     * input stream before truncation.
+     *
+     * @return the maximum text length
+     */
+    public long getMaxTextLength() {
+        return maxTextLength;
+    }
+
+    /**
+     * Sets the maximum number of characters that will be read from the input
+     * stream. The default is {@link Long#MAX_VALUE}, which is effectively
+     * unlimited. Values less than or equal to zero explicitly disable the limit.
+     *
+     * <p>The limit is measured in Java {@code char} units (UTF-16 code units).
+     * At the truncation boundary, an unpaired high surrogate is dropped to avoid
+     * leaving an invalid string.
+     *
+     * @param maxTextLength the maximum text length
+     */
+    public void setMaxTextLength(final long maxTextLength) {
+        this.maxTextLength = maxTextLength;
+    }
+
 }