diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java index 14759e30..19c71822 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java @@ -18,10 +18,18 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.InvalidPathException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import org.apache.commons.io.IOUtils; import org.codelibs.fess.crawler.container.CrawlerContainer; import org.codelibs.fess.crawler.exception.CrawlerSystemException; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; @@ -48,6 +56,14 @@ */ public abstract class AbstractExtractor implements Extractor { + /** + * Parameter key used to track the recursion depth across nested archive + * extraction. Callers/recursive extractor invocations may set this to + * limit how deeply nested archives are unpacked. The value is parsed as + * an integer; missing or unparseable values are treated as depth 0. + */ + public static final String EXTRACTOR_DEPTH_KEY = "extractorDepth"; + /** The crawler container. */ @Resource protected CrawlerContainer crawlerContainer; @@ -55,6 +71,14 @@ public abstract class AbstractExtractor implements Extractor { /** The weight of this extractor. */ protected int weight = 1; + /** + * Maximum allowed depth for recursive archive extraction. 
When the depth + * value parsed from {@link #EXTRACTOR_DEPTH_KEY} reaches this threshold, + * {@link #checkDepth(Map, int)} aborts further recursion to defend + * against recursion-bomb archives. + */ + protected int maxArchiveDepth = 10; + /** * Constructs a new AbstractExtractor. */ @@ -62,6 +86,74 @@ public AbstractExtractor() { // NOP } + /** + * Sets the maximum allowed recursion depth for nested archive extraction. + * @param maxArchiveDepth the new maximum depth (non-negative) + */ + public void setMaxArchiveDepth(final int maxArchiveDepth) { + this.maxArchiveDepth = maxArchiveDepth; + } + + /** + * Returns the current recursion depth recorded in the extractor params. + * Missing, blank, or unparseable values are treated as {@code 0}. + * + * @param params the extractor parameters (may be {@code null}) + * @return the parsed depth, or {@code 0} if not set + */ + protected int getCurrentDepth(final Map params) { + if (params == null) { + return 0; + } + final String value = params.get(EXTRACTOR_DEPTH_KEY); + if (value == null || value.isBlank()) { + return 0; + } + try { + final int depth = Integer.parseInt(value.trim()); + return depth < 0 ? 0 : depth; + } catch (final NumberFormatException e) { + return 0; + } + } + + /** + * Returns a NEW parameter map (the original is not mutated) with the + * recursion depth incremented by one. Useful when an archive extractor + * recursively delegates to another extractor for a nested archive entry. + * + * @param params the current extractor parameters (may be {@code null}) + * @return a new map containing all original entries plus an incremented + * depth + */ + protected Map incrementDepth(final Map params) { + final Map next = new HashMap<>(); + if (params != null) { + next.putAll(params); + } + next.put(EXTRACTOR_DEPTH_KEY, Integer.toString(getCurrentDepth(params) + 1)); + return next; + } + + /** + * Validates that the recursion depth recorded in {@code params} does not + * meet or exceed {@code maxDepth}. 
Throws {@link MaxLengthExceededException} + * (a {@link org.codelibs.fess.crawler.exception.CrawlingAccessException + * CrawlingAccessException}) when the threshold is reached so that the + * surrounding crawler treats it as a data-driven access failure rather + * than a system error. + * + * @param params the extractor parameters (may be {@code null}) + * @param maxDepth the (exclusive) maximum allowed depth + * @throws MaxLengthExceededException when {@code currentDepth >= maxDepth} + */ + protected void checkDepth(final Map params, final int maxDepth) { + final int current = getCurrentDepth(params); + if (current >= maxDepth) { + throw new MaxLengthExceededException("Archive recursion depth exceeded: depth=" + current + " max=" + maxDepth); + } + } + @Override public int getWeight() { return weight; @@ -142,4 +234,108 @@ protected void validateInputStream(final InputStream in) { throw new CrawlerSystemException("The inputstream is null."); } } + + /** + * Returns true when the supplied entry name escapes the conceptual + * extraction root via path-traversal segments. The check is performed on + * a normalised form of the path and is shared between the archive + * extractors (Zip / Tar / Lha) so the rejection rules stay in lock step. + * + *

+ * An entry is rejected when it is null/empty, when it is rooted at + * {@code /} or {@code \}, when it begins with a Windows drive letter + * (e.g. {@code C:}), when its normalised form contains a {@code ..} + * segment, or when {@link Paths#get} treats it as malformed. + *

+ * + * @param name the entry name as reported by the archive + * @return {@code true} if the name should be rejected + */ + protected static boolean isPathTraversal(final String name) { + if (name == null || name.isEmpty()) { + return true; + } + // Absolute paths (Unix or Windows-style) are unsafe in the + // context of an archive extracted into a sandbox root. + if (name.startsWith("/") || name.startsWith("\\")) { + return true; + } + if (name.length() >= 2 && name.charAt(1) == ':') { + return true; + } + // Normalise backslashes to forward slashes BEFORE calling Paths.get(). + // On Linux (and macOS) a backslash is a literal filename character, so + // Paths.get("a\\..") treats "a\\.." as a SINGLE opaque segment and + // normalize() leaves it unchanged — bypassing the ".." segment check. + // Unifying to "/" first forces the path parser to recognise each + // component correctly on all platforms. + final String unified = name.replace('\\', '/'); + if (unified.startsWith("/")) { + return true; + } + try { + final Path normalised = Paths.get(unified).normalize(); + final String normStr = normalised.toString().replace('\\', '/'); + if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) { + return true; + } + for (final Path part : normalised) { + if ("..".equals(part.toString())) { + return true; + } + } + } catch (final InvalidPathException ipe) { + return true; + } + return false; + } + + /** + * Saturating add: returns {@code value + 1} unless that would overflow + * {@code Long.MAX_VALUE}, in which case {@code Long.MAX_VALUE} is returned. + * Used when computing a read limit that is one byte beyond a cap so that + * the caller can detect "exactly at the cap" vs "exceeds the cap" without + * silently wrapping to a negative limit and reading nothing. 
+ * + * @param value the value to increment (must be non-negative) + * @return {@code value + 1} or {@code Long.MAX_VALUE} if already at the + * maximum + */ + protected static long addOneSaturating(final long value) { + return value >= Long.MAX_VALUE ? Long.MAX_VALUE : value + 1L; + } + + /** + * Copies up to {@code limit} bytes from {@code in} to {@code out}, returning + * the actual number of bytes copied. Used by archive extractors to bound + * the amount of memory consumed when buffering an entry's uncompressed + * payload. + * + * @param in the source stream + * @param out the sink stream + * @param limit the maximum number of bytes to copy (inclusive). Must be + * non-negative; a negative value throws + * {@link IllegalArgumentException} so misconfiguration is + * surfaced immediately rather than silently reading nothing. + * @return the number of bytes actually copied + * @throws IllegalArgumentException if {@code limit} is negative + * @throws IOException if reading from {@code in} or writing to {@code out} + * fails + */ + protected static long copyBounded(final InputStream in, final OutputStream out, final long limit) throws IOException { + if (limit < 0) { + throw new IllegalArgumentException("copyBounded: limit must be non-negative, got " + limit); + } + if (limit == 0) { + return 0; + } + final byte[] buffer = new byte[8192]; + long total = 0; + int read; + while (total < limit && (read = in.read(buffer, 0, (int) Math.min(buffer.length, limit - total))) != IOUtils.EOF) { + out.write(buffer, 0, read); + total += read; + } + return total; + } } diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java index 7a5ff29a..5938f857 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java @@ -15,18 
+15,18 @@ */ package org.codelibs.fess.crawler.extractor.impl; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.Enumeration; -import java.util.HashMap; import java.util.Map; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.codelibs.core.io.CloseableUtil; -import org.codelibs.core.io.CopyUtil; import org.codelibs.core.io.FileUtil; import org.codelibs.fess.crawler.entity.ExtractData; import org.codelibs.fess.crawler.exception.CrawlerSystemException; @@ -35,7 +35,6 @@ import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; -import org.codelibs.fess.crawler.util.IgnoreCloseInputStream; import jp.gr.java_conf.dangan.util.lha.LhaFile; import jp.gr.java_conf.dangan.util.lha.LhaHeader; @@ -45,15 +44,59 @@ * This extractor can extract text content from files within LHA archives * by using appropriate extractors for each contained file type. * + *

+ * Defends against decompression / many-entry / recursion bombs and Zip Slip + * style path traversal in entry names. + *

+ * * @author shinsuke */ public class LhaExtractor extends AbstractExtractor { /** Logger for this class. */ private static final Logger logger = LogManager.getLogger(LhaExtractor.class); - /** Maximum content size for extraction. -1 means no limit. */ + /** + * Legacy total cap on uncompressed bytes actually buffered from + * supported entries. The cap is also folded into the read budget so a + * single oversized entry cannot be buffered up to + * {@link #maxBytesPerEntry} when the user only asked for a much smaller + * total. Set to {@code -1} to disable. + */ protected long maxContentSize = -1; + /** + * Maximum total uncompressed bytes that may be read from all entries + * combined. Defaults to 2 GiB. Set to {@code -1} to disable. Only bytes + * from entries that have a registered {@link Extractor} contribute to + * this total — unsupported entries are skipped without buffering. + */ + protected long maxBytes = 1L << 31; + + /** + * Maximum uncompressed bytes that may be buffered for a SINGLE entry. + * Enforced against the actual bytes read from the entry stream (NOT the + * header-reported size, which is attacker-controlled). Defaults to + * 256 MiB. Set to {@code -1} to disable. Enforced independently of + * {@link #maxBytes}. Only applies to entries that have a registered + * {@link Extractor}; an unsupported entry is never buffered. + */ + protected long maxBytesPerEntry = 256L * 1024L * 1024L; + + /** + * Maximum bytes copied from the input stream to the local temporary file + * before {@link LhaFile} is opened. The LHA library requires a seekable + * file, so the entire archive must be staged on disk; this cap prevents a + * hostile producer from filling local storage. Defaults to 1 GiB. Set to + * {@code -1} to disable. + */ + protected long maxInputBytes = 1L << 30; + + /** + * Maximum allowed number of entries to iterate. Defaults to 100,000. + * Set to {@code -1} to disable. 
+ */ + protected int maxEntries = 100_000; + /** * Creates a new LhaExtractor instance. */ @@ -73,54 +116,141 @@ public LhaExtractor() { */ @Override public ExtractData getText(final InputStream in, final Map params) { - if (in == null) { - throw new CrawlerSystemException("LHA archive input stream is null. Cannot extract text from null input."); - } + validateInputStream(in); + checkDepth(params, maxArchiveDepth); final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper(); final ExtractorFactory extractorFactory = getExtractorFactory(); final StringBuilder buf = new StringBuilder(1000); + int processedEntries = 0; + int failedEntries = 0; File tempFile = null; LhaFile lhaFile = null; try { tempFile = createTempFile("crawler-", ".lzh", null); try (FileOutputStream fos = new FileOutputStream(tempFile)) { - CopyUtil.copy(in, fos); + // Stage the (untrusted) archive bytes to disk under a hard + // cap so a hostile producer cannot exhaust local storage by + // streaming an arbitrarily large body. + final long inputReadLimit = maxInputBytes > 0 ? 
addOneSaturating(maxInputBytes) : Long.MAX_VALUE; + final long staged = copyBounded(in, fos, inputReadLimit); + if (maxInputBytes > 0 && staged > maxInputBytes) { + throw new MaxLengthExceededException("lha input size exceeded: bytes=" + staged + " max=" + maxInputBytes); + } } lhaFile = new LhaFile(tempFile); @SuppressWarnings("unchecked") final Enumeration entries = lhaFile.entries(); - long contentSize = 0; + long totalBytes = 0; + int entryCount = 0; while (entries.hasMoreElements()) { final LhaHeader head = entries.nextElement(); - contentSize += head.getOriginalSize(); - if (maxContentSize != -1 && contentSize > maxContentSize) { - throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize); + entryCount++; + if (maxEntries > 0 && entryCount > maxEntries) { + throw new MaxLengthExceededException("lha entry count exceeded: count=" + entryCount + " max=" + maxEntries); } final String filename = head.getPath(); + if (isPathTraversal(filename)) { + logger.warn("lha entry rejected: name={} reason=path-traversal", filename); + continue; + } + + // Decide MIME / extractor up front so an unsupported entry + // is skipped without opening its decompressor at all. This + // mirrors the legacy behaviour and keeps a large irrelevant + // entry from consuming the per-entry / total caps reserved + // for entries the crawler actually wants to extract. 
final String mimeType = mimeTypeHelper.getContentType(null, filename); - if (mimeType != null) { - final Extractor extractor = extractorFactory.getExtractor(mimeType); - if (extractor != null) { - InputStream is = null; - try { - is = lhaFile.getInputStream(head); - final Map map = new HashMap<>(); - map.put(ExtractData.RESOURCE_NAME_KEY, filename); - buf.append(extractor.getText(new IgnoreCloseInputStream(is), map).getContent()); - buf.append('\n'); - } catch (final Exception e) { - if (logger.isDebugEnabled()) { - logger.debug("Exception in an internal extractor.", e); - } - } finally { - CloseableUtil.closeQuietly(is); - } + final Extractor extractor = mimeType != null ? extractorFactory.getExtractor(mimeType) : null; + if (extractor == null) { + continue; + } + + // Read the entry payload through copyBounded so the cap is + // enforced against bytes actually decompressed, not the + // header-reported size (which is attacker-controlled). + final long actualBytes; + final byte[] entryBytes; + InputStream is = null; + try { + // getInputStream(LhaHeader) can return null when the header + // is not found in the archive index, and can throw + // RuntimeException (e.g. IllegalArgumentException from + // CompressMethod.getCore) for unknown/corrupt compression + // methods — both must be handled before touching `is`. + is = lhaFile.getInputStream(head); + if (is == null) { + logger.warn("lha entry stream is null: name={}", filename); + failedEntries++; + continue; + } + final long totalReadLimit; + if (maxBytes > 0) { + totalReadLimit = addOneSaturating(Math.max(0L, maxBytes - totalBytes)); + } else { + totalReadLimit = Long.MAX_VALUE; + } + // Fold maxContentSize into the read budget so a small + // legacy cap is honoured before a large per-entry cap + // can buffer hundreds of MiB into memory. 
+ final long contentReadLimit; + if (maxContentSize >= 0) { + contentReadLimit = addOneSaturating(Math.max(0L, maxContentSize - totalBytes)); + } else { + contentReadLimit = Long.MAX_VALUE; + } + final long perEntryReadLimit = maxBytesPerEntry > 0 ? addOneSaturating(maxBytesPerEntry) : Long.MAX_VALUE; + final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit); + final ByteArrayOutputStream out = new ByteArrayOutputStream(); + actualBytes = copyBounded(is, out, readLimit); + entryBytes = out.toByteArray(); + } catch (final IOException | RuntimeException ioe) { + // IOException: normal read failure. + // RuntimeException: includes IllegalArgumentException thrown + // by CompressMethod.getCore() for corrupt/unknown compression + // methods in the dangan LHA library (getInputStream does not + // declare checked exceptions for these cases). + logger.warn("Failed to read lha entry: name={}", filename, ioe); + failedEntries++; + continue; + } finally { + CloseableUtil.closeQuietly(is); + } + + if (maxBytesPerEntry > 0 && actualBytes > maxBytesPerEntry) { + throw new MaxLengthExceededException( + "lha per-entry size exceeded: name=" + filename + " size=" + actualBytes + " max=" + maxBytesPerEntry); + } + + totalBytes += actualBytes; + if (maxBytes > 0 && totalBytes > maxBytes) { + throw new MaxLengthExceededException("lha uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes); + } + if (maxContentSize >= 0 && totalBytes > maxContentSize) { + throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize); + } + + try { + final Map map = incrementDepth(params); + map.put(ExtractData.RESOURCE_NAME_KEY, filename); + buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent()); + buf.append('\n'); + processedEntries++; + } catch (final MaxLengthExceededException e) { + throw e; + } catch (final Exception e) { + failedEntries++; + if (logger.isDebugEnabled()) { + 
logger.debug("Exception in an internal extractor: name={}", filename, e); } } } + // Summary warn when the loop completed normally but some entries failed. + if (failedEntries > 0) { + logger.warn("LHA archive partially processed: processed={} failed={}", processedEntries, failedEntries); + } } catch (final MaxLengthExceededException e) { throw e; } catch (final Exception e) { @@ -130,7 +260,7 @@ public ExtractData getText(final InputStream in, final Map param try { lhaFile.close(); } catch (final IOException e) { - // ignore + logger.warn("Failed to close LHA file. tempFile={}", tempFile, e); } } FileUtil.deleteInBackground(tempFile); @@ -147,4 +277,41 @@ public ExtractData getText(final InputStream in, final Map param public void setMaxContentSize(final long maxContentSize) { this.maxContentSize = maxContentSize; } + + /** + * Sets the cap on total uncompressed bytes read from all entries. + * @param maxBytes the maximum total bytes (use {@code -1} to disable) + */ + public void setMaxBytes(final long maxBytes) { + this.maxBytes = maxBytes; + } + + /** + * Sets the per-entry cap on uncompressed bytes buffered in memory. The + * cap is enforced against bytes actually decompressed (not the + * header-reported size). Set to {@code -1} to disable. + * + * @param maxBytesPerEntry the per-entry maximum + */ + public void setMaxBytesPerEntry(final long maxBytesPerEntry) { + this.maxBytesPerEntry = maxBytesPerEntry; + } + + /** + * Sets the cap on the number of input bytes staged to a temporary file + * before {@link LhaFile} is opened. Set to {@code -1} to disable. + * + * @param maxInputBytes the input-stage maximum + */ + public void setMaxInputBytes(final long maxInputBytes) { + this.maxInputBytes = maxInputBytes; + } + + /** + * Sets the maximum number of entries that may be iterated. 
+ * @param maxEntries the maximum entry count (use {@code -1} to disable) + */ + public void setMaxEntries(final int maxEntries) { + this.maxEntries = maxEntries; + } } diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java index b5bb0238..0bebefc7 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java @@ -15,8 +15,10 @@ */ package org.codelibs.fess.crawler.extractor.impl; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; import java.io.InputStream; -import java.util.HashMap; import java.util.Map; import org.apache.commons.compress.archivers.ArchiveInputStream; @@ -30,12 +32,17 @@ import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; -import org.codelibs.fess.crawler.util.IgnoreCloseInputStream; import jakarta.annotation.Resource; /** * Extracts text content from TAR archives. + * + *

+ * Defends against decompression / many-entry / recursion bombs and Zip Slip + * style path traversal. Symbolic and hard link entries are skipped because + * they can reference files outside the archive sandbox. + *

*/ public class TarExtractor extends AbstractExtractor { private static final Logger logger = LogManager.getLogger(TarExtractor.class); @@ -47,10 +54,38 @@ public class TarExtractor extends AbstractExtractor { protected ArchiveStreamFactory archiveStreamFactory; /** - * Maximum content size. + * Legacy total cap on uncompressed bytes actually buffered from + * supported entries. The cap is also folded into the read budget so a + * single oversized entry cannot be buffered up to + * {@link #maxBytesPerEntry} when the user only asked for a much smaller + * total. Set to {@code -1} to disable. */ protected long maxContentSize = -1; + /** + * Maximum total uncompressed bytes that may be read from all entries + * combined. Defaults to 2 GiB. Set to {@code -1} to disable. Only bytes + * from entries that have a registered {@link Extractor} contribute to + * this total — unsupported entries are skipped without buffering. + */ + protected long maxBytes = 1L << 31; + + /** + * Maximum uncompressed bytes that may be buffered for a SINGLE entry. + * Guards against an oversized entry exhausting the JVM heap when + * buffered into memory. Defaults to 256 MiB. Set to {@code -1} to + * disable. Enforced independently of {@link #maxBytes}. Only applies to + * entries that have a registered {@link Extractor}; an unsupported + * entry is never buffered, so this cap is irrelevant for it. + */ + protected long maxBytesPerEntry = 256L * 1024L * 1024L; + + /** + * Maximum allowed number of entries to iterate. Defaults to 100,000. + * Set to {@code -1} to disable. + */ + protected int maxEntries = 100_000; + /** * Creates a new TarExtractor instance. 
*/ @@ -61,10 +96,11 @@ public TarExtractor() { @Override public ExtractData getText(final InputStream in, final Map params) { validateInputStream(in); + checkDepth(params, maxArchiveDepth); final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper(); final ExtractorFactory extractorFactory = getExtractorFactory(); - return new ExtractData(getTextInternal(in, mimeTypeHelper, extractorFactory)); + return new ExtractData(getTextInternal(in, mimeTypeHelper, extractorFactory, params)); } /** @@ -73,40 +109,117 @@ public ExtractData getText(final InputStream in, final Map param * @param in The input stream. * @param mimeTypeHelper The mime type helper. * @param extractorFactory The extractor factory. + * @param params Extractor parameters used to track recursion depth. * @return A text. */ - protected String getTextInternal(final InputStream in, final MimeTypeHelper mimeTypeHelper, final ExtractorFactory extractorFactory) { + protected String getTextInternal(final InputStream in, final MimeTypeHelper mimeTypeHelper, final ExtractorFactory extractorFactory, + final Map params) { final StringBuilder buf = new StringBuilder(1000); int processedEntries = 0; int failedEntries = 0; try (final ArchiveInputStream ais = archiveStreamFactory.createArchiveInputStream("tar", in)) { - TarArchiveEntry entry = null; - long contentSize = 0; + TarArchiveEntry entry; + long totalBytes = 0; + int entryCount = 0; while ((entry = (TarArchiveEntry) ais.getNextEntry()) != null) { - contentSize += entry.getSize(); - if (maxContentSize != -1 && contentSize > maxContentSize) { - throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize); + // PAX extended / global headers are metadata-only pseudo-entries + // that must NOT count toward maxEntries. Commons Compress + // usually absorbs them, but skip defensively for non-standard + // tar dialects that surface them through getNextEntry(). 
+ if (entry.isPaxHeader() || entry.isGlobalPaxHeader()) { + continue; + } + entryCount++; + if (maxEntries > 0 && entryCount > maxEntries) { + throw new MaxLengthExceededException("tar entry count exceeded: count=" + entryCount + " max=" + maxEntries); } final String filename = entry.getName(); + if (entry.isDirectory()) { + continue; + } + if (entry.isSymbolicLink() || entry.isLink()) { + logger.warn("tar entry skipped: name={} reason=link link={}", filename, entry.getLinkName()); + continue; + } + if (isPathTraversal(filename)) { + logger.warn("tar entry rejected: name={} reason=path-traversal", filename); + continue; + } + + // Decide MIME / extractor up front. An unsupported entry + // (e.g. a video alongside a small .txt) is skipped without + // buffering, so a large irrelevant entry does not consume + // the per-entry / total caps that should be reserved for + // entries the crawler actually wants to extract. final String mimeType = mimeTypeHelper.getContentType(null, filename); - if (mimeType != null) { - final Extractor extractor = extractorFactory.getExtractor(mimeType); - if (extractor != null) { - try { - final Map map = new HashMap<>(); - map.put(ExtractData.RESOURCE_NAME_KEY, filename); - buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent()); - buf.append('\n'); - processedEntries++; - } catch (final Exception e) { - failedEntries++; - if (logger.isDebugEnabled()) { - logger.debug("Failed to extract content from archive entry: {}", filename, e); - } - } + final Extractor extractor = mimeType != null ? 
extractorFactory.getExtractor(mimeType) : null; + if (extractor == null) { + continue; + } + + final long actualBytes; + final byte[] entryBytes; + try { + final long totalReadLimit; + if (maxBytes > 0) { + totalReadLimit = addOneSaturating(Math.max(0L, maxBytes - totalBytes)); + } else { + totalReadLimit = Long.MAX_VALUE; + } + // Fold maxContentSize into the read budget so a small + // legacy cap is honoured before a large per-entry cap + // can buffer hundreds of MiB into memory. + final long contentReadLimit; + if (maxContentSize >= 0) { + contentReadLimit = addOneSaturating(Math.max(0L, maxContentSize - totalBytes)); + } else { + contentReadLimit = Long.MAX_VALUE; + } + final long perEntryReadLimit = maxBytesPerEntry > 0 ? addOneSaturating(maxBytesPerEntry) : Long.MAX_VALUE; + final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit); + final ByteArrayOutputStream out = new ByteArrayOutputStream(); + actualBytes = copyBounded(ais, out, readLimit); + entryBytes = out.toByteArray(); + } catch (final IOException ioe) { + failedEntries++; + if (logger.isDebugEnabled()) { + logger.debug("Failed to read tar entry: name={}", filename, ioe); } + continue; + } + + if (maxBytesPerEntry > 0 && actualBytes > maxBytesPerEntry) { + throw new MaxLengthExceededException( + "tar per-entry size exceeded: name=" + filename + " size=" + actualBytes + " max=" + maxBytesPerEntry); } + + totalBytes += actualBytes; + if (maxBytes > 0 && totalBytes > maxBytes) { + throw new MaxLengthExceededException("tar uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes); + } + if (maxContentSize >= 0 && totalBytes > maxContentSize) { + throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize); + } + + try { + final Map map = incrementDepth(params); + map.put(ExtractData.RESOURCE_NAME_KEY, filename); + buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent()); + 
buf.append('\n'); + processedEntries++; + } catch (final MaxLengthExceededException e) { + throw e; + } catch (final Exception e) { + failedEntries++; + if (logger.isDebugEnabled()) { + logger.debug("Failed to extract content from archive entry: name={}", filename, e); + } + } + } + // Summary warn when the loop completed normally but some entries failed. + if (failedEntries > 0) { + logger.warn("TAR archive partially processed: processed={} failed={}", processedEntries, failedEntries); } } catch (final MaxLengthExceededException e) { throw e; @@ -115,7 +228,7 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime throw new ExtractException("Failed to extract content from TAR archive. No entries could be processed.", e); } if (logger.isWarnEnabled()) { - logger.warn("Partial extraction from TAR archive. Processed: {}, Failed: {}", processedEntries, failedEntries, e); + logger.warn("Partial extraction from TAR archive. processed={} failed={}", processedEntries, failedEntries, e); } } @@ -129,4 +242,30 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime public void setMaxContentSize(final long maxContentSize) { this.maxContentSize = maxContentSize; } + + /** + * Sets the cap on total uncompressed bytes read from all entries. + * @param maxBytes the maximum total bytes (use {@code -1} to disable) + */ + public void setMaxBytes(final long maxBytes) { + this.maxBytes = maxBytes; + } + + /** + * Sets the per-entry cap on uncompressed bytes buffered in memory. Set + * to {@code -1} to disable. + * + * @param maxBytesPerEntry the per-entry maximum + */ + public void setMaxBytesPerEntry(final long maxBytesPerEntry) { + this.maxBytesPerEntry = maxBytesPerEntry; + } + + /** + * Sets the maximum number of entries that may be iterated. 
+ * @param maxEntries the maximum entry count (use {@code -1} to disable) + */ + public void setMaxEntries(final int maxEntries) { + this.maxEntries = maxEntries; + } } diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java index a543b3a9..ad2f8714 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java @@ -16,13 +16,15 @@ package org.codelibs.fess.crawler.extractor.impl; import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; import java.io.InputStream; -import java.util.HashMap; import java.util.Map; -import org.apache.commons.compress.archivers.ArchiveInputStream; -import org.apache.commons.compress.archivers.ArchiveStreamFactory; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; +import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; +import org.apache.commons.io.input.CountingInputStream; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.codelibs.fess.crawler.entity.ExtractData; @@ -31,26 +33,81 @@ import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; -import org.codelibs.fess.crawler.util.IgnoreCloseInputStream; - -import jakarta.annotation.Resource; /** * Extracts text content from ZIP archives. + * + *

<p>
+ * The extractor defends against several content-driven attack vectors. The + * input stream itself is treated as untrusted, while the {@code params} map is + * assumed to be admin-configured / trusted. Protections include: + * </p>
+ * <ul>
+ * <li>Total uncompressed-size cap ({@link #setMaxBytes(long)})</li>
+ * <li>Maximum number of entries ({@link #setMaxEntries(int)})</li>
+ * <li>Per-entry compression-ratio threshold + * ({@link #setMaxCompressionRatio(long)}) to detect zip bombs</li>
+ * <li>Recursion-depth check (via {@link AbstractExtractor#checkDepth})</li>
+ * <li>Zip Slip path-traversal detection (entry names normalised and + * rejected when they escape the conceptual extraction root)</li>
+ * <li>Configurable filename encoding (e.g. {@code "CP932"} / + * {@code "MS932"} for Japanese filenames)</li>
+ * </ul>
*/ public class ZipExtractor extends AbstractExtractor { private static final Logger logger = LogManager.getLogger(ZipExtractor.class); + /** Threshold below which compression-ratio checks are skipped (bytes). */ + private static final long COMPRESSION_RATIO_MIN_BYTES = 1L << 20; // 1 MiB + + /** + * Legacy total cap on uncompressed bytes actually buffered from + * supported entries. The cap is also folded into the read budget so a + * single oversized entry cannot be buffered up to + * {@link #maxBytesPerEntry} when the user only asked for a much smaller + * total. Set to {@code -1} to disable. + */ + protected long maxContentSize = -1; + /** - * The archive stream factory. + * Maximum total uncompressed bytes that may be read from all entries + * combined. Defaults to 2 GiB. Set to {@code -1} to disable. Only bytes + * from entries that have a registered {@link Extractor} contribute to + * this total — unsupported entries are skipped without buffering or + * draining, mirroring the pre-defence behaviour. */ - @Resource - protected ArchiveStreamFactory archiveStreamFactory; + protected long maxBytes = 1L << 31; /** - * The maximum content size. + * Maximum uncompressed bytes that may be buffered for a SINGLE entry. + * This guards against a legitimate-looking but oversized entry (e.g. a + * 1.9 GiB file inside an otherwise small archive) exhausting the JVM + * heap when buffered into memory. Defaults to 256 MiB. Set to + * {@code -1} to disable. Enforced independently of {@link #maxBytes}. + * Only applies to entries that have a registered {@link Extractor}; an + * unsupported entry is never buffered, so this cap is irrelevant for it. */ - protected long maxContentSize = -1; + protected long maxBytesPerEntry = 256L * 1024L * 1024L; + + /** + * Maximum allowed compression ratio (uncompressed / compressed). Entries + * exceeding this ratio AND larger than 1 MiB are rejected as suspected + * zip bombs. Set to {@code -1} to disable. 
+ */ + protected long maxCompressionRatio = 100L; + + /** + * Maximum allowed number of entries to iterate. Defaults to 100,000. + * Set to {@code -1} to disable. + */ + protected int maxEntries = 100_000; + + /** + * Filename encoding used to decode entry names that lack the UTF-8 flag. + * Defaults to {@code "UTF-8"}; set to {@code "CP932"} or {@code "MS932"} + * for archives created on Japanese Windows systems. + */ + protected String filenameEncoding = "UTF-8"; /** * Creates a new ZipExtractor instance. @@ -62,6 +119,7 @@ public ZipExtractor() { @Override public ExtractData getText(final InputStream in, final Map params) { validateInputStream(in); + checkDepth(params, maxArchiveDepth); final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper(); final ExtractorFactory extractorFactory = getExtractorFactory(); @@ -69,35 +127,177 @@ public ExtractData getText(final InputStream in, final Map param int processedEntries = 0; int failedEntries = 0; - try (final ArchiveInputStream ais = - archiveStreamFactory.createArchiveInputStream(in.markSupported() ? in : new BufferedInputStream(in))) { - ZipArchiveEntry entry = null; - long contentSize = 0; - while ((entry = (ZipArchiveEntry) ais.getNextEntry()) != null) { - contentSize += entry.getSize(); - if (maxContentSize != -1 && contentSize > maxContentSize) { - throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize); + final InputStream wrapped = in.markSupported() ? in : new BufferedInputStream(in); + // Early-validate the ZIP magic so a clearly non-zip blob is reported + // as ExtractException rather than silently returning empty text. 
+ wrapped.mark(4); + try { + final byte[] sig = new byte[4]; + int read = 0; + while (read < 4) { + final int n = wrapped.read(sig, read, 4 - read); + if (n < 0) { + break; + } + read += n; + } + wrapped.reset(); + // ZIP local-file-header: PK\x03\x04 + // ZIP empty-archive EOCD: PK\x05\x06 + // PK\x07\x08 is a data-descriptor signature and must NEVER appear + // at the very start of a valid ZIP; reject it along with anything + // else that is not a recognised opening signature. + if (read < 4 || sig[0] != 'P' || sig[1] != 'K') { + throw new ExtractException("Failed to extract content from ZIP archive. Not a recognised ZIP signature."); + } + if (sig[2] == 0x05 && sig[3] == 0x06) { + // Valid but empty archive — short-circuit immediately. + return new ExtractData(""); + } + if (sig[2] != 0x03 || sig[3] != 0x04) { + throw new ExtractException("Failed to extract content from ZIP archive. Not a recognised ZIP signature."); + } + } catch (final IOException ioe) { + throw new ExtractException("Failed to extract content from ZIP archive. No entries could be processed.", ioe); + } + // CountingInputStream lets us measure the compressed bytes consumed + // from the underlying stream per entry, which is the only reliable + // signal in streaming mode (ZipArchiveEntry#getCompressedSize() is + // often -1 when entries use a data descriptor). + final CountingInputStream counter = new CountingInputStream(wrapped); + // allowStoredEntriesWithDataDescriptor=false (the default) to avoid + // widening the attack surface; the pre-PR factory default was also + // false and no test requires the looser mode. 
+ try (final ZipArchiveInputStream ais = new ZipArchiveInputStream(counter, filenameEncoding, true, false)) { + ZipArchiveEntry entry; + long totalBytes = 0; + long lastCompressedBytes = counter.getByteCount(); + int entryCount = 0; + while ((entry = ais.getNextEntry()) != null) { + entryCount++; + if (maxEntries > 0 && entryCount > maxEntries) { + throw new MaxLengthExceededException("zip entry count exceeded: count=" + entryCount + " max=" + maxEntries); } final String filename = entry.getName(); + if (entry.isDirectory()) { + lastCompressedBytes = counter.getByteCount(); + continue; + } + if (isPathTraversal(filename)) { + logger.warn("zip entry rejected: name={} reason=path-traversal", filename); + // Keep the compressed-bytes anchor in step with the + // stream so the next supported entry's ratio is + // computed against ITS own compressed bytes, not also + // those of the rejected entry. + lastCompressedBytes = counter.getByteCount(); + continue; + } + + // Decide MIME / extractor up front. An unsupported entry + // (e.g. a video alongside a small .txt) is skipped without + // buffering, so a large irrelevant entry does not consume + // the per-entry / total caps that should be reserved for + // entries the crawler actually wants to extract. final String mimeType = mimeTypeHelper.getContentType(null, filename); - if (mimeType != null) { - final Extractor extractor = extractorFactory.getExtractor(mimeType); - if (extractor != null) { - try { - final Map map = new HashMap<>(); - map.put(ExtractData.RESOURCE_NAME_KEY, filename); - buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent()); - buf.append('\n'); - processedEntries++; - } catch (final Exception e) { - failedEntries++; - if (logger.isDebugEnabled()) { - logger.debug("Failed to extract content from archive entry: {}", filename, e); - } - } + final Extractor extractor = mimeType != null ? 
extractorFactory.getExtractor(mimeType) : null; + if (extractor == null) { + lastCompressedBytes = counter.getByteCount(); + continue; + } + + // Read entry into bounded buffer while counting actual bytes. + final long actualBytes; + final byte[] entryBytes; + try { + final long totalReadLimit; + if (maxBytes > 0) { + totalReadLimit = addOneSaturating(Math.max(0L, maxBytes - totalBytes)); + } else { + totalReadLimit = Long.MAX_VALUE; + } + // Fold maxContentSize into the read budget so a small + // legacy cap is honoured before a large per-entry cap + // can buffer hundreds of MiB into memory. + final long contentReadLimit; + if (maxContentSize >= 0) { + contentReadLimit = addOneSaturating(Math.max(0L, maxContentSize - totalBytes)); + } else { + contentReadLimit = Long.MAX_VALUE; + } + // Enforce a per-entry cap independently of the total + // cap so that a single oversized entry cannot exhaust + // the JVM heap. We read one byte beyond the cap so the + // explicit overflow check below can distinguish + // "exactly at the cap" from "exceeds the cap". + final long perEntryReadLimit = maxBytesPerEntry > 0 ? 
addOneSaturating(maxBytesPerEntry) : Long.MAX_VALUE; + final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit); + final ByteArrayOutputStream out = new ByteArrayOutputStream(); + actualBytes = copyBounded(ais, out, readLimit); + entryBytes = out.toByteArray(); + } catch (final IOException ioe) { + failedEntries++; + if (logger.isDebugEnabled()) { + logger.debug("Failed to read zip entry: name={}", filename, ioe); + } + lastCompressedBytes = counter.getByteCount(); + continue; + } + + if (maxBytesPerEntry > 0 && actualBytes > maxBytesPerEntry) { + throw new MaxLengthExceededException( + "zip per-entry size exceeded: name=" + filename + " size=" + actualBytes + " max=" + maxBytesPerEntry); + } + + totalBytes += actualBytes; + if (maxBytes > 0 && totalBytes > maxBytes) { + throw new MaxLengthExceededException("zip uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes); + } + if (maxContentSize >= 0 && totalBytes > maxContentSize) { + throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize); + } + + // Compression-ratio check (only meaningful for non-tiny entries). + // Always measure the bytes actually consumed from the underlying + // stream during this entry's read (this is the only reliable + // signal in streaming mode where data descriptors are used). + // When the header also reports a positive compressed size, take + // the minimum of both so an attacker who inflates the header + // value cannot suppress the ratio check. + final long now = counter.getByteCount(); + final long measuredCompressed = Math.max(0L, now - lastCompressedBytes); + lastCompressedBytes = now; + final long headerCompressed = entry.getCompressedSize(); + final long compressed = (headerCompressed > 0) ? 
Math.min(headerCompressed, measuredCompressed) : measuredCompressed; + if (maxCompressionRatio > 0 && actualBytes > COMPRESSION_RATIO_MIN_BYTES) { + if (compressed == 0) { + // Zero compressed bytes with substantial output is suspicious; + // log it but let the per-entry cap enforce the real limit. + logger.warn("zip entry has 0 compressed bytes with {} uncompressed bytes: name={}", actualBytes, filename); + } else if (actualBytes / compressed > maxCompressionRatio) { + throw new MaxLengthExceededException("zip compression ratio exceeded: name=" + filename + " ratio=" + + (actualBytes / compressed) + " max=" + maxCompressionRatio); + } + } + + try { + final Map map = incrementDepth(params); + map.put(ExtractData.RESOURCE_NAME_KEY, filename); + buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent()); + buf.append('\n'); + processedEntries++; + } catch (final MaxLengthExceededException e) { + throw e; + } catch (final Exception e) { + failedEntries++; + if (logger.isDebugEnabled()) { + logger.debug("Failed to extract content from archive entry: name={}", filename, e); } } } + // Summary warn when the loop completed normally but some entries failed. + if (failedEntries > 0) { + logger.warn("ZIP archive partially processed: processed={} failed={}", processedEntries, failedEntries); + } } catch (final MaxLengthExceededException e) { throw e; } catch (final Exception e) { @@ -105,7 +305,7 @@ public ExtractData getText(final InputStream in, final Map param throw new ExtractException("Failed to extract content from ZIP archive. No entries could be processed.", e); } if (logger.isWarnEnabled()) { - logger.warn("Partial extraction from ZIP archive. Processed: {}, Failed: {}", processedEntries, failedEntries, e); + logger.warn("Partial extraction from ZIP archive. 
processed={} failed={}", processedEntries, failedEntries, e); } } @@ -119,4 +319,49 @@ public ExtractData getText(final InputStream in, final Map param public void setMaxContentSize(final long maxContentSize) { this.maxContentSize = maxContentSize; } -} \ No newline at end of file + + /** + * Sets the cap on total uncompressed bytes read from all entries. + * @param maxBytes the maximum total bytes (use {@code -1} to disable) + */ + public void setMaxBytes(final long maxBytes) { + this.maxBytes = maxBytes; + } + + /** + * Sets the per-entry cap on uncompressed bytes buffered in memory. Set + * to {@code -1} to disable. + * + * @param maxBytesPerEntry the per-entry maximum + */ + public void setMaxBytesPerEntry(final long maxBytesPerEntry) { + this.maxBytesPerEntry = maxBytesPerEntry; + } + + /** + * Sets the maximum permitted uncompressed/compressed ratio per entry. + * @param maxCompressionRatio the threshold (use {@code -1} to disable) + */ + public void setMaxCompressionRatio(final long maxCompressionRatio) { + this.maxCompressionRatio = maxCompressionRatio; + } + + /** + * Sets the maximum number of entries that may be iterated. + * @param maxEntries the maximum entry count (use {@code -1} to disable) + */ + public void setMaxEntries(final int maxEntries) { + this.maxEntries = maxEntries; + } + + /** + * Sets the filename encoding used to decode entry names that lack the + * UTF-8 flag (e.g. {@code "CP932"} / {@code "MS932"} for Japanese + * archives). 
+ * + * @param filenameEncoding the charset name + */ + public void setFilenameEncoding(final String filenameEncoding) { + this.filenameEncoding = filenameEncoding; + } +} diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java index 30e7db86..4054092b 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java @@ -17,10 +17,12 @@ import java.io.ByteArrayInputStream; import java.io.InputStream; +import java.util.HashMap; import java.util.Map; import org.codelibs.fess.crawler.entity.ExtractData; import org.codelibs.fess.crawler.exception.CrawlerSystemException; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; import org.dbflute.utflute.core.PlainTestCase; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -64,6 +66,28 @@ public void resetTestState() { public void testValidateInputStream(final InputStream in) { validateInputStream(in); } + + // Expose depth helpers for testing. + public int testGetCurrentDepth(final Map params) { + return getCurrentDepth(params); + } + + public Map testIncrementDepth(final Map params) { + return incrementDepth(params); + } + + public void testCheckDepth(final Map params, final int maxDepth) { + checkDepth(params, maxDepth); + } + + // Expose static helpers for testing. + public boolean testIsPathTraversal(final String name) { + return isPathTraversal(name); + } + + public long testAddOneSaturating(final long value) { + return addOneSaturating(value); + } } private TestExtractor extractor; @@ -243,4 +267,185 @@ public void test_validateInputStream_throwsCorrectExceptionType() { fail(); } } + + /** Recursion-depth helper: missing/null params return 0. 
*/ + @Test + public void test_getCurrentDepth_returnsZeroForMissing() { + assertEquals(0, extractor.testGetCurrentDepth(null)); + assertEquals(0, extractor.testGetCurrentDepth(new HashMap<>())); + final Map blank = new HashMap<>(); + blank.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, ""); + assertEquals(0, extractor.testGetCurrentDepth(blank)); + final Map garbage = new HashMap<>(); + garbage.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "not-a-number"); + assertEquals(0, extractor.testGetCurrentDepth(garbage)); + } + + /** Recursion-depth helper: depth value is parsed and clamped to >= 0. */ + @Test + public void test_getCurrentDepth_parsesValidValue() { + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "3"); + assertEquals(3, extractor.testGetCurrentDepth(params)); + + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "-5"); + assertEquals(0, extractor.testGetCurrentDepth(params)); + } + + /** incrementDepth must return a NEW map and not mutate the input. */ + @Test + public void test_incrementDepth_returnsNewMap() { + final Map original = new HashMap<>(); + original.put("foo", "bar"); + final Map next = extractor.testIncrementDepth(original); + + assertFalse(original == next); + // original is unchanged + assertFalse(original.containsKey(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + assertEquals("bar", next.get("foo")); + assertEquals("1", next.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + + final Map after = extractor.testIncrementDepth(next); + assertEquals("2", after.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + // first map still says "1" + assertEquals("1", next.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + } + + /** incrementDepth on null produces depth=1. */ + @Test + public void test_incrementDepth_nullInput() { + final Map next = extractor.testIncrementDepth(null); + assertNotNull(next); + assertEquals("1", next.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + } + + /** checkDepth allows depths below the limit. 
*/ + @Test + public void test_checkDepth_belowLimit_passes() { + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "3"); + extractor.testCheckDepth(params, 10); // no throw + extractor.testCheckDepth(null, 10); + } + + /** checkDepth rejects depths at or above the limit. */ + @Test + public void test_checkDepth_atOrAboveLimit_throws() { + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "10"); + try { + extractor.testCheckDepth(params, 10); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("recursion depth")); + } + + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "99"); + try { + extractor.testCheckDepth(params, 10); + fail(); + } catch (final MaxLengthExceededException e) { + // pass + } + } + + // ----------------------------------------------------------------------- + // isPathTraversal tests (C3 fix validation) + // ----------------------------------------------------------------------- + + /** null and empty are always traversals. */ + @Test + public void test_isPathTraversal_nullAndEmpty() { + assertTrue(extractor.testIsPathTraversal(null)); + assertTrue(extractor.testIsPathTraversal("")); + } + + /** Drive letter prefix is always rejected. */ + @Test + public void test_isPathTraversal_driveLetter() { + assertTrue(extractor.testIsPathTraversal("C:\\foo")); + assertTrue(extractor.testIsPathTraversal("C:foo")); + } + + /** Leading slash (Unix absolute) is rejected. */ + @Test + public void test_isPathTraversal_leadingSlash() { + assertTrue(extractor.testIsPathTraversal("/etc/passwd")); + } + + /** Leading backslash is rejected. */ + @Test + public void test_isPathTraversal_leadingBackslash() { + assertTrue(extractor.testIsPathTraversal("\\foo\\bar")); + } + + /** Lone ".." is rejected. 
*/ + @Test + public void test_isPathTraversal_loneDotDot() { + assertTrue(extractor.testIsPathTraversal("..")); + } + + /** Classic traversal sequences are rejected. */ + @Test + public void test_isPathTraversal_classicTraversal() { + assertTrue(extractor.testIsPathTraversal("../../etc/passwd")); + assertTrue(extractor.testIsPathTraversal("foo/../../etc/passwd")); + } + + /** Safe name that resolves inside the root is allowed. */ + @Test + public void test_isPathTraversal_safeRelativePath() { + assertFalse(extractor.testIsPathTraversal("foo/bar.txt")); + assertFalse(extractor.testIsPathTraversal("foo/../bar.txt")); // normalises to bar.txt + } + + /** + * Single-segment backslash traversal (C3 regression). + * On Linux the path "a\..\..\etc" is a single opaque filename when + * Paths.get() is called without pre-normalisation, so ".." segments are + * not detected. After unifying backslash to forward-slash before + * Paths.get(), "a/../../etc" normalises to "../etc" which starts with + * ".." and is correctly rejected. + * Note: "a\.." unifies to "a/.." which normalises to the empty path + * (current dir, i.e. the archive root) — that is safe and is NOT rejected. + */ + @Test + public void test_isPathTraversal_backslashSingleSegment() { + // "a\..\..\etc" must be caught — escapes the archive root. + assertTrue(extractor.testIsPathTraversal("a\\..\\..\\etc")); + // Three levels up — definitely escapes. + assertTrue(extractor.testIsPathTraversal("a\\..\\..\\..")); // escapes + // "a\.." normalises to the archive root (current dir) — safe. + assertFalse(extractor.testIsPathTraversal("a\\..")); + // A purely safe backslash path: "foo\\bar.txt" → "foo/bar.txt" — safe. + assertFalse(extractor.testIsPathTraversal("foo\\bar.txt")); + } + + /** NUL character in path — should be rejected (InvalidPathException path). 
*/ + @Test + public void test_isPathTraversal_nulCharacter() { + assertTrue(extractor.testIsPathTraversal("a\0b")); + } + + // ----------------------------------------------------------------------- + // addOneSaturating (C2 fix validation) + // ----------------------------------------------------------------------- + + /** addOneSaturating returns value+1 for normal inputs. */ + @Test + public void test_addOneSaturating_normalIncrement() { + assertEquals(1L, extractor.testAddOneSaturating(0L)); + assertEquals(101L, extractor.testAddOneSaturating(100L)); + assertEquals(Long.MAX_VALUE - 1L, extractor.testAddOneSaturating(Long.MAX_VALUE - 2L)); + } + + /** addOneSaturating returns Long.MAX_VALUE when already at max. */ + @Test + public void test_addOneSaturating_saturatesAtMax() { + assertEquals(Long.MAX_VALUE, extractor.testAddOneSaturating(Long.MAX_VALUE)); + // (MAX-1)+1 = MAX naturally — not overflow. + assertEquals(Long.MAX_VALUE, extractor.testAddOneSaturating(Long.MAX_VALUE - 1L)); + // Verify the result is positive (not wrapped to negative). + assertTrue(extractor.testAddOneSaturating(Long.MAX_VALUE) > 0); + } } diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java new file mode 100644 index 00000000..0bab94c4 --- /dev/null +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java @@ -0,0 +1,966 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.fess.crawler.extractor.impl; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.compress.archivers.ArchiveStreamFactory; +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; +import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; +import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.codelibs.fess.crawler.container.StandardCrawlerContainer; +import org.codelibs.fess.crawler.exception.ExtractException; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; +import org.codelibs.fess.crawler.extractor.ExtractorFactory; +import org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl; +import org.dbflute.utflute.core.PlainTestCase; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInfo; + +/** + * Security-oriented tests that validate the archive-bomb / recursion / Zip + * Slip / link-skipping defences added to the Zip / Tar / Lha extractors. + * + *

<p>
+ * Synthetic archives are constructed in-memory with Apache Commons Compress + * so the tests are fully self-contained. + * </p>

+ */ +public class ArchiveExtractorSecurityTest extends PlainTestCase { + + private ZipExtractor zipExtractor; + private TarExtractor tarExtractor; + private LhaExtractor lhaExtractor; + + @Override + protected void setUp(final TestInfo testInfo) throws Exception { + super.setUp(testInfo); + final StandardCrawlerContainer container = new StandardCrawlerContainer(); + container.singleton("archiveStreamFactory", ArchiveStreamFactory.class) + .singleton("compressorStreamFactory", CompressorStreamFactory.class) + .singleton("mimeTypeHelper", MimeTypeHelperImpl.class) + .singleton("textExtractor", TextExtractor.class) + .singleton("zipExtractor", ZipExtractor.class) + .singleton("tarExtractor", TarExtractor.class) + .singleton("lhaExtractor", LhaExtractor.class) + . singleton("extractorFactory", ExtractorFactory.class, factory -> { + final TextExtractor textExtractor = container.getComponent("textExtractor"); + final ZipExtractor zip = container.getComponent("zipExtractor"); + final TarExtractor tar = container.getComponent("tarExtractor"); + final LhaExtractor lha = container.getComponent("lhaExtractor"); + factory.addExtractor("text/plain", textExtractor); + factory.addExtractor("application/zip", zip); + factory.addExtractor("application/x-tar", tar); + factory.addExtractor("application/x-lha", lha); + }); + + zipExtractor = container.getComponent("zipExtractor"); + tarExtractor = container.getComponent("tarExtractor"); + lhaExtractor = container.getComponent("lhaExtractor"); + } + + // --------------------------------------------------------------------- + // Helpers + // --------------------------------------------------------------------- + + private byte[] buildZip(final EntrySpec... specs) throws IOException { + return buildZipWithCharset(StandardCharsets.UTF_8, specs); + } + + private byte[] buildZipWithCharset(final Charset charset, final EntrySpec... 
specs) throws IOException { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) { + zos.setEncoding(charset.name()); + // Disable the UTF-8 flag so the encoding parameter is honoured by + // ZipArchiveInputStream during read. + zos.setUseLanguageEncodingFlag(false); + zos.setCreateUnicodeExtraFields(ZipArchiveOutputStream.UnicodeExtraFieldPolicy.NEVER); + for (final EntrySpec spec : specs) { + final ZipArchiveEntry entry = new ZipArchiveEntry(spec.name); + zos.putArchiveEntry(entry); + if (spec.content != null) { + zos.write(spec.content); + } + zos.closeArchiveEntry(); + } + zos.finish(); + } + return baos.toByteArray(); + } + + private byte[] buildTar(final TarEntrySpec... specs) throws IOException { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (TarArchiveOutputStream tos = new TarArchiveOutputStream(baos)) { + tos.setLongFileMode(TarArchiveOutputStream.LONGFILE_POSIX); + for (final TarEntrySpec spec : specs) { + final TarArchiveEntry entry; + if (spec.linkType != 0) { + entry = new TarArchiveEntry(spec.name, spec.linkType); + if (spec.linkName != null) { + entry.setLinkName(spec.linkName); + } + } else { + entry = new TarArchiveEntry(spec.name); + entry.setSize(spec.content == null ? 
0 : spec.content.length); + } + tos.putArchiveEntry(entry); + if (spec.linkType == 0 && spec.content != null) { + tos.write(spec.content); + } + tos.closeArchiveEntry(); + } + tos.finish(); + } + return baos.toByteArray(); + } + + private static final class EntrySpec { + final String name; + final byte[] content; + + EntrySpec(final String name, final byte[] content) { + this.name = name; + this.content = content; + } + } + + private static final class TarEntrySpec { + final String name; + final byte[] content; + final byte linkType; + final String linkName; + + TarEntrySpec(final String name, final byte[] content) { + this(name, content, (byte) 0, null); + } + + TarEntrySpec(final String name, final byte[] content, final byte linkType, final String linkName) { + this.name = name; + this.content = content; + this.linkType = linkType; + this.linkName = linkName; + } + } + + // --------------------------------------------------------------------- + // Zip — byte-limit bomb + // --------------------------------------------------------------------- + + @Test + public void test_zipBomb_byteLimit() throws Exception { + final byte[] payload = new byte[64 * 1024]; + final byte[] data = buildZip(new EntrySpec("a.txt", payload), new EntrySpec("b.txt", payload), new EntrySpec("c.txt", payload)); + + zipExtractor.setMaxBytes(64 * 1024); // exactly one entry's worth -> 2nd should fail + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("zip uncompressed size exceeded")); + } + } + + // --------------------------------------------------------------------- + // Zip — many-entry bomb + // --------------------------------------------------------------------- + + @Test + public void test_zipBomb_entryLimit() throws Exception { + final EntrySpec[] specs = new EntrySpec[20]; + for (int i = 0; i < specs.length; i++) { + specs[i] = new EntrySpec("e" + i 
+ ".txt", new byte[0]); + } + final byte[] data = buildZip(specs); + + zipExtractor.setMaxEntries(5); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("zip entry count exceeded")); + } + } + + // --------------------------------------------------------------------- + // Zip — Zip Slip path traversal + // --------------------------------------------------------------------- + + @Test + public void test_zipSlip_pathTraversal() throws Exception { + final byte[] data = buildZip(new EntrySpec("../../etc/passwd", "evil".getBytes(StandardCharsets.UTF_8)), + new EntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8))); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, null).getContent(); + // Bad entry must be skipped; good entry must still be processed. + assertFalse(content.contains("evil")); + assertTrue(content.contains("good")); + } + } + + @Test + public void test_zipSlip_absolutePath() throws Exception { + final byte[] data = buildZip(new EntrySpec("/etc/passwd", "evil".getBytes(StandardCharsets.UTF_8)), + new EntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8))); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, null).getContent(); + assertFalse(content.contains("evil")); + assertTrue(content.contains("good")); + } + } + + // --------------------------------------------------------------------- + // Recursion-depth bomb + // --------------------------------------------------------------------- + + @Test + public void test_recursionDepth_exceeded() throws Exception { + final byte[] data = buildZip(new EntrySpec("ok.txt", "hello".getBytes(StandardCharsets.UTF_8))); + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "10"); // == default max + + try (InputStream 
in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, params); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().toLowerCase().contains("recursion")); + } + } + + @Test + public void test_recursionDepth_belowLimit_succeeds() throws Exception { + final byte[] data = buildZip(new EntrySpec("ok.txt", "hello".getBytes(StandardCharsets.UTF_8))); + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "3"); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, params).getContent(); + assertTrue(content.contains("hello")); + } + // Original params must be unchanged. + assertEquals("3", params.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + } + + // --------------------------------------------------------------------- + // CP932 / non-UTF-8 filename encoding + // --------------------------------------------------------------------- + + @Test + public void test_cp932Filename() throws Exception { + final Charset cp932; + try { + cp932 = Charset.forName("MS932"); + } catch (final Exception e) { + // CP932/MS932 not available on this JVM; skip. + return; + } + + final byte[] data = buildZipWithCharset(cp932, new EntrySpec("テスト.txt", "japan".getBytes(StandardCharsets.UTF_8))); + + // Default UTF-8 encoding may mojibake the filename, but once we set + // CP932 the filename should round-trip cleanly. We assert by + // inspecting the entry list directly via the public API: setting the + // proper encoding allows the .txt suffix to be detected and the + // entry's content extracted. 
+ zipExtractor.setFilenameEncoding("MS932"); + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, null).getContent(); + assertTrue(content.contains("japan")); + } + } + + // --------------------------------------------------------------------- + // Tar — symlink / hardlink entries are skipped + // --------------------------------------------------------------------- + + @Test + public void test_tar_symlinkSkipped() throws Exception { + final byte[] data = buildTar(new TarEntrySpec("ok.txt", "regular".getBytes(StandardCharsets.UTF_8)), + new TarEntrySpec("evil.txt", null, TarArchiveEntry.LF_SYMLINK, "/etc/passwd")); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = tarExtractor.getText(in, null).getContent(); + assertTrue(content.contains("regular")); + // Symlink target text must NOT leak into the output. + assertFalse(content.contains("/etc/passwd")); + } + } + + @Test + public void test_tar_hardlinkSkipped() throws Exception { + final byte[] data = buildTar(new TarEntrySpec("ok.txt", "regular".getBytes(StandardCharsets.UTF_8)), + new TarEntrySpec("evil.txt", null, TarArchiveEntry.LF_LINK, "ok.txt")); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = tarExtractor.getText(in, null).getContent(); + assertTrue(content.contains("regular")); + // The hardlink should not have introduced a duplicate of the + // referenced entry's content. 
+ assertEquals(content.indexOf("regular"), content.lastIndexOf("regular")); + } + } + + @Test + public void test_tar_pathTraversal() throws Exception { + final byte[] data = buildTar(new TarEntrySpec("../../etc/passwd", "evil".getBytes(StandardCharsets.UTF_8)), + new TarEntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8))); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = tarExtractor.getText(in, null).getContent(); + assertFalse(content.contains("evil")); + assertTrue(content.contains("good")); + } + } + + // --------------------------------------------------------------------- + // Compression-ratio bomb — produce a highly-compressible big entry + // --------------------------------------------------------------------- + + @Test + public void test_compressionRatioExceeded() throws Exception { + // 2 MiB of zeroes compresses extremely well, well above the 100:1 + // default threshold. Build the entry with explicit method/size/crc so + // the local file header carries the compressed size (otherwise a + // streaming DEFLATED entry uses a data descriptor, leaving + // ZipArchiveEntry#getCompressedSize() as -1 and bypassing the ratio + // check). 
+ final byte[] payload = new byte[2 * 1024 * 1024]; + final java.util.zip.Deflater def = new java.util.zip.Deflater(java.util.zip.Deflater.BEST_COMPRESSION); + def.setInput(payload); + def.finish(); + final ByteArrayOutputStream compBuf = new ByteArrayOutputStream(); + final byte[] tmpBuf = new byte[8192]; + while (!def.finished()) { + final int n = def.deflate(tmpBuf); + compBuf.write(tmpBuf, 0, n); + } + def.end(); + final java.util.zip.CRC32 crc = new java.util.zip.CRC32(); + crc.update(payload); + + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) { + final ZipArchiveEntry entry = new ZipArchiveEntry("zeros.txt"); + entry.setMethod(ZipArchiveEntry.DEFLATED); + entry.setSize(payload.length); + entry.setCompressedSize(compBuf.size()); + entry.setCrc(crc.getValue()); + zos.putArchiveEntry(entry); + zos.write(payload); + zos.closeArchiveEntry(); + zos.finish(); + } + final byte[] data = baos.toByteArray(); + + // Disable the byte cap so the compression-ratio check is the one that + // fires. 
+ zipExtractor.setMaxBytes(-1); + zipExtractor.setMaxContentSize(-1); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("compression ratio") || e.getMessage().contains("uncompressed size")); + } + } + + // --------------------------------------------------------------------- + // Tar byte/entry limits + // --------------------------------------------------------------------- + + @Test + public void test_tarBomb_byteLimit() throws Exception { + final byte[] payload = new byte[64 * 1024]; + final byte[] data = buildTar(new TarEntrySpec("a.txt", payload), new TarEntrySpec("b.txt", payload)); + + tarExtractor.setMaxBytes(64 * 1024); + try (InputStream in = new ByteArrayInputStream(data)) { + tarExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("tar uncompressed size exceeded")); + } + } + + @Test + public void test_tarBomb_entryLimit() throws Exception { + final TarEntrySpec[] specs = new TarEntrySpec[20]; + for (int i = 0; i < specs.length; i++) { + specs[i] = new TarEntrySpec("e" + i + ".txt", new byte[0]); + } + final byte[] data = buildTar(specs); + + tarExtractor.setMaxEntries(5); + try (InputStream in = new ByteArrayInputStream(data)) { + tarExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("tar entry count exceeded")); + } + } + + @Test + public void test_tar_recursionDepth_exceeded() throws Exception { + final byte[] data = buildTar(new TarEntrySpec("ok.txt", "hi".getBytes(StandardCharsets.UTF_8))); + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "10"); + + try (InputStream in = new ByteArrayInputStream(data)) { + tarExtractor.getText(in, params); + fail(); + } catch (final MaxLengthExceededException e) { + 
assertTrue(e.getMessage().toLowerCase().contains("recursion")); + } + } + + // --------------------------------------------------------------------- + // Lha recursion-depth check (uses isPathTraversal helper too) + // --------------------------------------------------------------------- + + // --------------------------------------------------------------------- + // Per-entry size cap — guards against a single oversized entry + // --------------------------------------------------------------------- + + @Test + public void test_perEntryCapEnforced() throws Exception { + // Build a zip whose single SUPPORTED entry exceeds the configured + // per-entry cap. The extractor must trip the cap before buffering + // the whole payload. We use a small cap (1 MiB) and a slightly + // larger payload (2 MiB) so the test stays cheap on parallel / + // low-memory CI. The extension is .txt so the entry routes through + // the registered text/plain extractor — only supported entries are + // buffered (and therefore can hit the per-entry memory cap). 
+ final int perEntryCap = 1024 * 1024; + final int entrySize = 2 * perEntryCap; + final byte[] payload = new byte[entrySize]; + + final java.util.zip.Deflater def = new java.util.zip.Deflater(java.util.zip.Deflater.BEST_COMPRESSION); + def.setInput(payload); + def.finish(); + final ByteArrayOutputStream compBuf = new ByteArrayOutputStream(); + final byte[] tmpBuf = new byte[8192]; + while (!def.finished()) { + final int n = def.deflate(tmpBuf); + compBuf.write(tmpBuf, 0, n); + } + def.end(); + final java.util.zip.CRC32 crc = new java.util.zip.CRC32(); + crc.update(payload); + + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) { + final ZipArchiveEntry entry = new ZipArchiveEntry("big.txt"); + entry.setMethod(ZipArchiveEntry.DEFLATED); + entry.setSize(payload.length); + entry.setCompressedSize(compBuf.size()); + entry.setCrc(crc.getValue()); + zos.putArchiveEntry(entry); + zos.write(payload); + zos.closeArchiveEntry(); + zos.finish(); + } + final byte[] data = baos.toByteArray(); + + // Disable the total-size and ratio checks so only the per-entry cap + // can trigger. + zipExtractor.setMaxBytes(-1); + zipExtractor.setMaxContentSize(-1); + zipExtractor.setMaxCompressionRatio(-1); + zipExtractor.setMaxBytesPerEntry(perEntryCap); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("per-entry size exceeded")); + } + } + + // --------------------------------------------------------------------- + // Unsupported entries must NOT consume the per-entry / total caps — + // they are skipped without buffering so that supported entries + // alongside them still extract successfully (regression for PR #161 + // review feedback). 
+ // --------------------------------------------------------------------- + + @Test + public void test_zip_unsupportedEntryDoesNotConsumeCaps() throws Exception { + // A "big.bin" payload that, were it to be buffered, would exceed + // both the per-entry cap and the total cap. The supported "ok.txt" + // alongside it must still extract because no extractor is + // registered for application/octet-stream. + final byte[] big = new byte[4 * 1024 * 1024]; + final byte[] data = buildZip(new EntrySpec("big.bin", big), new EntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8))); + + zipExtractor.setMaxBytes(64 * 1024); // smaller than big.bin + zipExtractor.setMaxBytesPerEntry(64 * 1024); // also smaller + zipExtractor.setMaxContentSize(64 * 1024); + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, null).getContent(); + assertTrue(content.contains("good")); + } + } + + @Test + public void test_tar_unsupportedEntryDoesNotConsumeCaps() throws Exception { + final byte[] big = new byte[4 * 1024 * 1024]; + final byte[] data = buildTar(new TarEntrySpec("big.bin", big), new TarEntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8))); + + tarExtractor.setMaxBytes(64 * 1024); + tarExtractor.setMaxBytesPerEntry(64 * 1024); + tarExtractor.setMaxContentSize(64 * 1024); + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = tarExtractor.getText(in, null).getContent(); + assertTrue(content.contains("good")); + } + } + + // --------------------------------------------------------------------- + // maxContentSize is folded into the read budget — a small legacy cap + // must trip BEFORE the buffer grows to the much larger per-entry cap + // (regression for PR #161 review feedback). 
+ // --------------------------------------------------------------------- + + @Test + public void test_zip_maxContentSize_capsBufferBeforePerEntryCap() throws Exception { + // 4 MiB supported entry; per-entry cap default is large; legacy + // maxContentSize is small. Without the fix the buffer would grow + // up to maxBytesPerEntry+1 before throwing. With the fix the read + // budget is bounded by maxContentSize+1 so buffering stops early. + final int legacyCap = 64 * 1024; + final byte[] payload = new byte[4 * 1024 * 1024]; + final byte[] data = buildZip(new EntrySpec("big.txt", payload)); + + zipExtractor.setMaxBytes(-1); + zipExtractor.setMaxCompressionRatio(-1); + zipExtractor.setMaxBytesPerEntry(8L * 1024L * 1024L); // intentionally larger than payload + zipExtractor.setMaxContentSize(legacyCap); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("Extracted size is")); + } + } + + @Test + public void test_lha_recursionDepth_exceeded() { + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "10"); + // We pass a tiny non-archive stream; the depth check fires before + // the LHA library is invoked. + try (InputStream in = new ByteArrayInputStream("dummy".getBytes(StandardCharsets.UTF_8))) { + lhaExtractor.getText(in, params); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().toLowerCase().contains("recursion")); + } catch (final IOException e) { + fail(); + } + } + + @Test + public void test_lha_maxInputBytes_capsStaging() { + // Stage cap is enforced during the temp-file copy, before LhaFile + // is opened. Any blob larger than the cap must be rejected — we use + // arbitrary bytes since the failure precedes archive parsing. 
+ lhaExtractor.setMaxInputBytes(1024L); + final byte[] payload = new byte[4 * 1024]; + try (InputStream in = new ByteArrayInputStream(payload)) { + lhaExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("input size exceeded")); + } catch (final IOException e) { + fail(); + } + } + + // --------------------------------------------------------------------- + // ZIP signature checks (M1) + // --------------------------------------------------------------------- + + @Test + public void test_zip_signatureCheck_rejectsDataDescriptorPrefix() throws Exception { + // PK\x07\x08 is a data-descriptor signature and must not appear at + // the start of a valid ZIP; reject it as ExtractException. + final byte[] data = new byte[] { 'P', 'K', 0x07, 0x08, 0, 0, 0, 0 }; + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final ExtractException e) { + assertTrue(e.getMessage().contains("ZIP")); + } + } + + @Test + public void test_zip_signatureCheck_acceptsEmptyArchive() throws Exception { + // PK\x05\x06 is a valid empty-archive EOCD; extractor must return + // empty content rather than throwing. + final byte[] eocd = new byte[22]; + eocd[0] = 'P'; + eocd[1] = 'K'; + eocd[2] = 0x05; + eocd[3] = 0x06; + // Remaining 18 bytes stay 0 (valid minimal EOCD). + try (InputStream in = new ByteArrayInputStream(eocd)) { + final String content = zipExtractor.getText(in, null).getContent(); + assertEquals("", content); + } + } + + @Test + public void test_zip_signatureCheck_rejectsTruncatedStream() throws Exception { + // 2 bytes — not enough for a valid ZIP magic → ExtractException. 
+ final byte[] data = new byte[] { 'P', 'K' }; + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final ExtractException e) { + assertTrue(e.getMessage().contains("ZIP")); + } + } + + @Test + public void test_zip_signatureCheck_rejectsNonZip() throws Exception { + // Completely wrong magic. + final byte[] data = "not a zip file at all".getBytes(StandardCharsets.UTF_8); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final ExtractException e) { + assertTrue(e.getMessage().contains("ZIP")); + } + } + + // --------------------------------------------------------------------- + // Overflow: saturating +1L at Long.MAX_VALUE (C2) + // --------------------------------------------------------------------- + + @Test + public void test_overflow_saturatingAdd_atLongMaxValue() throws Exception { + // With maxBytes=Long.MAX_VALUE a small archive must succeed, not + // silently read 0 bytes due to Long overflow wrapping to negative. + final byte[] payload = "hello world".getBytes(StandardCharsets.UTF_8); + final byte[] data = buildZip(new EntrySpec("ok.txt", payload)); + + zipExtractor.setMaxBytes(Long.MAX_VALUE); + zipExtractor.setMaxBytesPerEntry(Long.MAX_VALUE); + zipExtractor.setMaxContentSize(-1); + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, null).getContent(); + assertTrue(content.contains("hello world")); + } + } + + // --------------------------------------------------------------------- + // Compression ratio — min(header, measured) (M3) + // --------------------------------------------------------------------- + + @Test + public void test_zip_compressionRatio_usesMinOfHeaderAndMeasured() throws Exception { + // Build a zip where the entry header reports a huge compressedSize. + // The ratio check must use the minimum of header vs. 
measured bytes + // so a lying header cannot suppress the check. We build a highly + // compressible 2 MiB entry; the measured compressed bytes will be + // small, making the ratio >> 100 regardless of the header claim. + final byte[] payload = new byte[2 * 1024 * 1024]; // all zeros + + final java.util.zip.Deflater def = new java.util.zip.Deflater(java.util.zip.Deflater.BEST_COMPRESSION); + def.setInput(payload); + def.finish(); + final ByteArrayOutputStream compBuf = new ByteArrayOutputStream(); + final byte[] tmpBuf = new byte[8192]; + while (!def.finished()) { + final int n = def.deflate(tmpBuf); + compBuf.write(tmpBuf, 0, n); + } + def.end(); + final java.util.zip.CRC32 crc = new java.util.zip.CRC32(); + crc.update(payload); + + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) { + final ZipArchiveEntry entry = new ZipArchiveEntry("zeros.txt"); + entry.setMethod(ZipArchiveEntry.DEFLATED); + entry.setSize(payload.length); + // Set a deliberately inflated compressedSize in the header so + // that ratio = uncompressed / fakeCompressed would be < threshold. + // If the code uses min(header, measured) the measured value wins + // and ratio >> threshold => MaxLengthExceededException fires. + entry.setCompressedSize(payload.length); // 1:1 — no bomb per header + entry.setCrc(crc.getValue()); + zos.putArchiveEntry(entry); + zos.write(payload); + zos.closeArchiveEntry(); + zos.finish(); + } + final byte[] data = baos.toByteArray(); + + zipExtractor.setMaxBytes(-1); + zipExtractor.setMaxContentSize(-1); + // Ratio threshold: 100 — actual ratio >> 100 using measured bytes. 
+ zipExtractor.setMaxCompressionRatio(100L); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("compression ratio")); + } + } + + // --------------------------------------------------------------------- + // Nested recursion depth (zip-in-zip) (test 6) + // --------------------------------------------------------------------- + + @Test + public void test_zip_nestedRecursionCountsDepth() throws Exception { + // Build inner zip containing a text file. + final byte[] innerPayload = buildZip(new EntrySpec("inner.txt", "hello".getBytes(StandardCharsets.UTF_8))); + // Build outer zip containing the inner zip. + final byte[] data = buildZip(new EntrySpec("nested.zip", innerPayload)); + + // Allow depth=1 only — outer zip processes ok (depth 0→1), + // inner zip invocation is at depth=1 which == maxArchiveDepth → throws. + zipExtractor.setMaxArchiveDepth(1); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().toLowerCase().contains("recursion")); + } + } + + // --------------------------------------------------------------------- + // Per-entry cap fires when total cap disabled (test 7) + // --------------------------------------------------------------------- + + @Test + public void test_zip_perEntryCap_whenMaxBytesDisabled() throws Exception { + final byte[] payload = new byte[2 * 1024 * 1024]; // 2 MiB + final byte[] data = buildZip(new EntrySpec("big.txt", payload)); + + zipExtractor.setMaxBytes(-1); + zipExtractor.setMaxContentSize(-1); + zipExtractor.setMaxCompressionRatio(-1); + zipExtractor.setMaxBytesPerEntry(1024 * 1024); // 1 MiB cap + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + 
assertTrue(e.getMessage().contains("per-entry size exceeded")); + } + } + + // --------------------------------------------------------------------- + // Total cap fires when per-entry cap disabled (test 8) + // --------------------------------------------------------------------- + + @Test + public void test_zip_maxBytes_whenPerEntryDisabled() throws Exception { + final byte[] payload = new byte[64 * 1024]; // 64 KiB each + final byte[] data = buildZip(new EntrySpec("a.txt", payload), new EntrySpec("b.txt", payload)); + + zipExtractor.setMaxBytesPerEntry(-1); // disable per-entry + zipExtractor.setMaxCompressionRatio(-1); + zipExtractor.setMaxBytes(64 * 1024); // exactly one entry → second exceeds + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("zip uncompressed size exceeded")); + } + } + + // --------------------------------------------------------------------- + // CP932 filename (m2: use Assumptions instead of silent return) + // --------------------------------------------------------------------- + + @Test + public void test_cp932Filename_withAssumption() throws Exception { + // If MS932 is unavailable, skip with Assumptions rather than a + // silent return, so the test result is clearly SKIPPED not PASSED. 
+ org.junit.jupiter.api.Assumptions.assumeTrue(Charset.isSupported("MS932"), "MS932 charset not available on this JVM"); + + final Charset cp932 = Charset.forName("MS932"); + final byte[] data = buildZipWithCharset(cp932, new EntrySpec("テスト.txt", "japan".getBytes(StandardCharsets.UTF_8))); + + zipExtractor.setFilenameEncoding("MS932"); + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, null).getContent(); + assertTrue(content.contains("japan")); + } + } + + // --------------------------------------------------------------------- + // Compression-ratio message tightened (m3) + // --------------------------------------------------------------------- + + @Test + public void test_compressionRatioExceeded_messageContainsRatio() throws Exception { + // Same high-ratio archive as the existing test; assert the message + // specifically contains "compression ratio" (not just "uncompressed + // size"), because maxBytes=-1 means the total cap is disabled. 
+ final byte[] payload = new byte[2 * 1024 * 1024]; + final java.util.zip.Deflater def = new java.util.zip.Deflater(java.util.zip.Deflater.BEST_COMPRESSION); + def.setInput(payload); + def.finish(); + final ByteArrayOutputStream compBuf = new ByteArrayOutputStream(); + final byte[] tmpBuf = new byte[8192]; + while (!def.finished()) { + final int n = def.deflate(tmpBuf); + compBuf.write(tmpBuf, 0, n); + } + def.end(); + final java.util.zip.CRC32 crc = new java.util.zip.CRC32(); + crc.update(payload); + + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) { + final ZipArchiveEntry entry = new ZipArchiveEntry("zeros.txt"); + entry.setMethod(ZipArchiveEntry.DEFLATED); + entry.setSize(payload.length); + entry.setCompressedSize(compBuf.size()); + entry.setCrc(crc.getValue()); + zos.putArchiveEntry(entry); + zos.write(payload); + zos.closeArchiveEntry(); + zos.finish(); + } + final byte[] data = baos.toByteArray(); + + zipExtractor.setMaxBytes(-1); + zipExtractor.setMaxContentSize(-1); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + // With maxBytes=-1, only the ratio check can fire. + assertTrue(e.getMessage().contains("compression ratio")); + } + } + + // --------------------------------------------------------------------- + // Tar PAX global header does not consume entry count (m4) + // --------------------------------------------------------------------- + + @Test + public void test_tar_paxGlobalHeader_doesNotConsumeEntryCount() throws Exception { + // Build a tar that, when read by TarArchiveInputStream, produces a + // PAX global header entry followed by a real text entry. We use a + // long filename (>100 chars) with LONGFILE_POSIX mode, which causes + // Commons Compress to emit a PAX extended header (type 'x') for the + // long name before the real entry. 
isPaxHeader() returns true for + // type 'x', so the fix must skip it without incrementing entryCount. + // With maxEntries=1, if the PAX extension header is counted the real + // entry would push the count to 2 and trigger the cap. + final String longName = "a".repeat(110) + ".txt"; // > 100-char POSIX limit + final byte[] content = "hello".getBytes(StandardCharsets.UTF_8); + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (TarArchiveOutputStream tos = new TarArchiveOutputStream(baos)) { + tos.setLongFileMode(TarArchiveOutputStream.LONGFILE_POSIX); + final TarArchiveEntry real = new TarArchiveEntry(longName); + real.setSize(content.length); + tos.putArchiveEntry(real); + tos.write(content); + tos.closeArchiveEntry(); + tos.finish(); + } + final byte[] data = baos.toByteArray(); + + tarExtractor.setMaxEntries(1); // only 1 real entry allowed + try (InputStream in = new ByteArrayInputStream(data)) { + final String text = tarExtractor.getText(in, null).getContent(); + assertTrue(text.contains("hello")); + } + } + + // --------------------------------------------------------------------- + // Tar per-entry cap enforced (test 13) + // --------------------------------------------------------------------- + + @Test + public void test_tar_perEntryCapEnforced() throws Exception { + final byte[] payload = new byte[2 * 1024 * 1024]; // 2 MiB + final byte[] data = buildTar(new TarEntrySpec("big.txt", payload)); + + tarExtractor.setMaxBytes(-1); + tarExtractor.setMaxContentSize(-1); + tarExtractor.setMaxBytesPerEntry(1024 * 1024); // 1 MiB cap + try (InputStream in = new ByteArrayInputStream(data)) { + tarExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("tar per-entry size exceeded")); + } + } + + // --------------------------------------------------------------------- + // Tar symlink skip now at WARN (m9) — verify it does not throw + // 
// ---------------------------------------------------------------------

    @Test
    public void test_tar_symlinkSkipped_doesNotThrow() throws Exception {
        // Already covered by test_tar_symlinkSkipped; this confirms the
        // behaviour is unchanged after upgrading the log level to WARN.
        // The archive holds one regular file plus a symlink entry whose
        // link name is "/etc/passwd".
        final byte[] data = buildTar(new TarEntrySpec("ok.txt", "regular".getBytes(StandardCharsets.UTF_8)),
                new TarEntrySpec("evil.txt", null, TarArchiveEntry.LF_SYMLINK, "/etc/passwd"));

        try (InputStream in = new ByteArrayInputStream(data)) {
            final String content = tarExtractor.getText(in, null).getContent();
            // The regular entry is extracted; the link target must not
            // appear anywhere in the output.
            assertTrue(content.contains("regular"));
            assertFalse(content.contains("passwd"));
        }
    }

    // ---------------------------------------------------------------------
    // setMaxArchiveDepth changes threshold (test 16)
    // ---------------------------------------------------------------------

    @Test
    public void test_setMaxArchiveDepth_changesThreshold() throws Exception {
        final byte[] data = buildZip(new EntrySpec("ok.txt", "hi".getBytes(StandardCharsets.UTF_8)));

        // depth=3 at maxArchiveDepth=3 → throws
        zipExtractor.setMaxArchiveDepth(3);
        final Map<String, String> params3 = new HashMap<>();
        params3.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "3");
        try (InputStream in = new ByteArrayInputStream(data)) {
            zipExtractor.getText(in, params3);
            fail();
        } catch (final MaxLengthExceededException e) {
            // Locale.ROOT keeps the case folding independent of the JVM locale.
            assertTrue(e.getMessage().toLowerCase(java.util.Locale.ROOT).contains("recursion"));
        }

        // depth=11 at maxArchiveDepth=20 → passes
        zipExtractor.setMaxArchiveDepth(20);
        final Map<String, String> params11 = new HashMap<>();
        params11.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "11");
        try (InputStream in = new ByteArrayInputStream(data)) {
            final String content = zipExtractor.getText(in, params11).getContent();
            assertTrue(content.contains("hi"));
        }
    }
}