Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@
*/
package org.codelibs.fess.crawler.extractor.impl;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.nio.file.Paths;
Expand All @@ -27,6 +29,8 @@
import java.util.Map;

import org.apache.commons.io.IOUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.codelibs.fess.crawler.container.CrawlerContainer;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.MaxLengthExceededException;
Expand Down Expand Up @@ -56,6 +60,12 @@
*/
public abstract class AbstractExtractor implements Extractor {

/** Logger shared by this class; used by {@code readWithLimit} to report truncation at WARN level. */
private static final Logger logger = LogManager.getLogger(AbstractExtractor.class);

/**
 * Size, in {@code char} units, of the scratch buffer that {@code readWithLimit}
 * allocates and fills on each read call while streaming reader content.
 */
protected static final int READ_BUFFER_SIZE = 8192;

/**
* Parameter key used to track the recursion depth across nested archive
* extraction. Callers/recursive extractor invocations may set this to
Expand Down Expand Up @@ -235,6 +245,70 @@ protected void validateInputStream(final InputStream in) {
}
}

/**
 * Immutable pair returned by {@link #readWithLimit(Reader, long)}: the text
 * that was read plus a flag telling whether the read stopped at the limit.
 */
protected static final class TextReadResult {
    /** Whether the content was cut short because the configured limit was hit. */
    public final boolean truncated;
    /** The decoded text; shorter than the input when {@link #truncated} is set. */
    public final String content;

    /**
     * Builds a result holder from its two components.
     *
     * @param content the decoded content
     * @param truncated {@code true} if the content was truncated
     */
    TextReadResult(final String content, final boolean truncated) {
        this.truncated = truncated;
        this.content = content;
    }
}

/**
 * Drains the given reader into a string while capping the number of
 * {@code char} units kept at {@code maxTextLength}.
 *
 * <p>Content is pulled in chunks of {@link #READ_BUFFER_SIZE} characters. As
 * long as the running total stays within the limit, each chunk is appended
 * whole. The first chunk that would exceed the limit is appended only up to
 * the limit, a WARN-level message is logged, and the result is marked as
 * truncated. If the kept text would then end in a high surrogate, that
 * character is removed so the string never ends with half of a surrogate pair.
 * A limit less than or equal to zero means "no limit".
 *
 * <p>A reader that is not already a {@link BufferedReader} is wrapped in one.
 * This method never closes the reader; that remains the caller's job.
 *
 * @param reader the reader to consume
 * @param maxTextLength the maximum number of characters ({@code char} units) to keep
 * @return a {@link TextReadResult} holding the text and the truncation flag
 * @throws IOException if reading fails
 */
protected TextReadResult readWithLimit(final Reader reader, final long maxTextLength) throws IOException {
    final BufferedReader source;
    if (reader instanceof BufferedReader) {
        source = (BufferedReader) reader;
    } else {
        source = new BufferedReader(reader);
    }

    final boolean limited = maxTextLength > 0;
    final StringBuilder text = new StringBuilder();
    final char[] chunk = new char[READ_BUFFER_SIZE];
    long consumed = 0;

    for (int count = source.read(chunk); count >= 0; count = source.read(chunk)) {
        final long afterChunk = consumed + count;
        if (!limited || afterChunk <= maxTextLength) {
            // Still within budget: keep the whole chunk and move on.
            text.append(chunk, 0, count);
            consumed = afterChunk;
            continue;
        }

        // This chunk crosses the limit: keep only the part that still fits.
        final int room = (int) (maxTextLength - consumed);
        if (room > 0) {
            text.append(chunk, 0, room);
        }
        // Never end on the first half of a surrogate pair.
        final int lastIndex = text.length() - 1;
        if (lastIndex >= 0 && Character.isHighSurrogate(text.charAt(lastIndex))) {
            text.setLength(lastIndex);
        }
        logger.warn("Extracted content truncated: extractor={} maxTextLength={} totalChars={}", getClass().getSimpleName(),
                maxTextLength, afterChunk);
        return new TextReadResult(text.toString(), true);
    }
    return new TextReadResult(text.toString(), false);
}

/**
* Returns true when the supplied entry name escapes the conceptual
* extraction root via path-traversal segments. The check is performed on
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.regex.Matcher;
Expand All @@ -32,7 +34,6 @@
import org.apache.commons.text.translate.NumericEntityUnescaper;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.codelibs.core.io.InputStreamUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.crawler.Constants;
import org.codelibs.fess.crawler.entity.ExtractData;
Expand All @@ -44,6 +45,14 @@
* Provides common functionality for extracting text content from XML-like documents.
* It handles encoding detection, HTML entity unescaping, and tag-based content extraction.
*
* <p>Features:
* <ul>
* <li>BOM detection for UTF-8, UTF-16 LE/BE, UTF-32 LE/BE, and UTF-7</li>
* <li>Reader-based streaming to avoid loading entire documents into memory</li>
* <li>Optional {@code maxTextLength} cap to bound heap usage on very large inputs</li>
* <li>HTML entity unescaping</li>
* <li>Configurable comment-tag suppression</li>
* </ul>
*/
public abstract class AbstractXmlExtractor extends AbstractExtractor {

Expand All @@ -54,6 +63,8 @@ public abstract class AbstractXmlExtractor extends AbstractExtractor {

/**
 * UTF-7 Byte Order Mark definition: the three bytes {@code 0x2B 0x2F 0x76}
 * (ASCII {@code "+/v"}) associated with the charset name {@code "UTF-7"}.
 * Note: 'UTF-7' is not always provided by the JVM. Reading content declared as UTF-7
 * may fail with {@code UnsupportedEncodingException}.
 */
protected static final ByteOrderMark BOM_UTF_7 = new ByteOrderMark("UTF-7", 0x2B, 0x2F, 0x76);

Expand All @@ -79,6 +90,17 @@ public abstract class AbstractXmlExtractor extends AbstractExtractor {
*/
protected boolean ignoreCommentTag = false;

/**
 * Maximum number of characters to read from the input. The default is
 * {@link Long#MAX_VALUE}, which is effectively unlimited. Values less than
 * or equal to zero explicitly disable the limit.
 *
 * <p>The limit is measured in Java {@code char} units (UTF-16 code units).
 * At the truncation boundary, an unpaired high surrogate is dropped to avoid
 * leaving an invalid string.
 *
 * <p>The value is handed to {@code readWithLimit} by {@code readAsString},
 * so it bounds the raw decoded text rather than the final extracted text.
 */
protected long maxTextLength = Long.MAX_VALUE;

/**
* Constructs a new AbstractXmlExtractor.
*/
Expand All @@ -98,6 +120,24 @@ public AbstractXmlExtractor() {
*/
protected abstract Pattern getTagPattern();

/**
* Extracts text from the supplied XML input stream.
*
* <p>This method detects the character encoding via a leading BOM (UTF-8,
* UTF-16 LE/BE, UTF-32 LE/BE, UTF-7) or from the XML declaration, then
* streams the content through a {@code BufferedReader}. The raw character
* count is bounded by {@link #maxTextLength} before tag-stripping is applied.
* When truncation occurs, a WARN-level log message is emitted and the returned
* {@link ExtractData} carries {@code truncated=true} and
* {@code maxTextLength=<value>} metadata entries. The supplied {@code in} is
* closed by this method.
*
* @param in the XML input stream; must not be {@code null}
* @param params optional extraction parameters (may be {@code null})
* @return the extracted text and optional truncation metadata
* @throws CrawlerSystemException if {@code in} is {@code null}
* @throws ExtractException if reading or decoding fails
*/
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
if (in == null) {
Expand All @@ -106,13 +146,43 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
try {
final BufferedInputStream bis = new BufferedInputStream(in);
final String enc = getEncoding(bis);
final String content = UNESCAPE_HTML4.translate(new String(InputStreamUtil.getBytes(bis), enc));
return createExtractData(content);
// Strip any BOM bytes from the actual stream (the encoding lookup above
// resets the underlying buffer, so the BOM is still present here).
try (BOMInputStream bomStripped = BOMInputStream.builder()
.setInputStream(bis)
.setInclude(false)
.setByteOrderMarks(ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE,
ByteOrderMark.UTF_32BE, BOM_UTF_7)
.get()) {
final TextReadResult result = readAsString(bomStripped, enc);
final String content = UNESCAPE_HTML4.translate(result.content);
final ExtractData extractData = createExtractData(content);
if (result.truncated) {
extractData.putValue("truncated", "true");
extractData.putValue("maxTextLength", Long.toString(maxTextLength));
}
return extractData;
}
} catch (final Exception e) {
throw new ExtractException(e);
}
}

/**
 * Decodes the supplied byte stream with the named charset and collects the
 * characters into a string, keeping at most {@link #maxTextLength} characters.
 *
 * <p>The decoding reader is opened in a try-with-resources block, so it is
 * closed when this method returns or throws.
 *
 * @param in the input stream
 * @param charset the charset name
 * @return a {@link TextReadResult} with the decoded text and a flag telling whether it was truncated
 * @throws IOException if reading fails
 */
protected TextReadResult readAsString(final InputStream in, final String charset) throws IOException {
    try (Reader decoded = new InputStreamReader(in, charset)) {
        final TextReadResult outcome = readWithLimit(decoded, maxTextLength);
        return outcome;
    }
}

/**
* Creates an ExtractData object from the extracted content.
* @param content The extracted content.
Expand All @@ -131,9 +201,15 @@ protected String getEncoding(final BufferedInputStream bis) {
final byte[] b = new byte[preloadSizeForCharset];
try {
bis.mark(preloadSizeForCharset);
// The wrapping BOMInputStream is intentionally not closed here so the
// underlying buffered stream can be reset (see finally) and reused by getText.
@SuppressWarnings("resource")
final BOMInputStream bomIn = new BOMInputStream(bis, false, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE,
ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE, BOM_UTF_7);
final BOMInputStream bomIn = BOMInputStream.builder()
.setInputStream(bis)
.setInclude(false)
.setByteOrderMarks(ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
ByteOrderMark.UTF_32LE, BOM_UTF_7)
.get();
if (bomIn.hasBOM()) {
if (logger.isDebugEnabled()) {
logger.debug("BOM: {}", bomIn.getBOMCharsetName());
Expand Down Expand Up @@ -248,4 +324,29 @@ public void setIgnoreCommentTag(final boolean ignoreCommentTag) {
this.ignoreCommentTag = ignoreCommentTag;
}

/**
 * Reports the current cap on how many characters are read from the input
 * stream before the content is truncated.
 *
 * @return the maximum text length
 */
public long getMaxTextLength() {
    return this.maxTextLength;
}

/**
 * Changes the cap on how many characters are read from the input stream.
 * Unless changed, the cap is {@link Long#MAX_VALUE}, i.e. effectively no
 * cap at all. Passing zero or a negative value switches the cap off
 * explicitly.
 *
 * <p>The cap counts Java {@code char} units (UTF-16 code units). When the
 * text is cut at the cap, a dangling high surrogate at the end is removed
 * so the resulting string stays valid.
 *
 * @param maxTextLength the maximum text length
 */
public void setMaxTextLength(final long maxTextLength) {
    this.maxTextLength = maxTextLength;
}

}
Loading
Loading