Fix charset decoding for server.request.body.files_content in commons-fileupload (#11212)

jandro996 · devflow.devflow-routing-intake · web-flow · commit 9d737609c41e · 2026-04-28T12:33:12.000Z
fix(appsec): use per-part charset for files_content in commons-fileupload

fix(appsec): use CharsetDecoder with REPORT for charset fallback in FileItemContentReader

refactor(appsec): extract MultipartContentDecoder to internal-api for reuse across integrations

test(appsec): add missing corner cases to MultipartContentDecoderTest

test(appsec): trim redundant decoder tests from FileItemContentReaderTest

Charset fallback scenarios are covered by MultipartContentDecoderTest.
One integration test is kept to verify that getContentType() is passed through.

fix(appsec): use machine default charset as fallback in MultipartContentDecoder

Replaces hardcoded UTF-8 (no-charset default) and ISO-8859-1 (fallback)
with Charset.defaultCharset() in both cases, per reviewer feedback.

test(appsec): migrate MultipartContentDecoderTest from Groovy/Spock to Java/JUnit 5

fix(appsec): strip surrounding quotes from charset parameter in MultipartContentDecoder

RFC 2045 allows quoted parameter values (charset="UTF-8"). Without stripping
the quotes, Charset.forName rejects the name and decodeBytes falls back to the
JVM default instead of the declared charset.

fix(appsec): replace String#split with char loop in MultipartContentDecoder

String#split is forbidden (it uses regex internally). Replace it with an explicit
char scan that finds the first ';', ',' or space after charset=.

Merge branch 'master' into alejandro.gonzalez/APPSEC-61875-files-content-encoding

refactor(appsec): avoid toLowerCase allocation in MultipartContentDecoder.extractCharset

Replace toLowerCase(Locale.ROOT).indexOf with an inline ASCII case-insensitive
scan to avoid allocating a full lowercase copy of the Content-Type string.
Also use the already-computed end variable as the loop bound.

chore: add CODEOWNERS entry for internal-api/datadog/trace/api/http

All files in this package (StoredByteBody, StoredBodySupplier,
MultipartContentDecoder, etc.) are AppSec HTTP body inspection
infrastructure.

Use REPLACE for malformed bytes so truncation preserves declared charset

When FileItemContentReader truncates at MAX_CONTENT_BYTES, a cut in the
middle of a multibyte character no longer triggers the fallback path.
REPLACE substitutes the incomplete sequence with U+FFFD using the
declared charset; with REPORT the decoder threw and the whole string was
silently decoded with the JVM default charset instead.

Remove dead catch block from MultipartContentDecoder.decodeBytes

With CodingErrorAction.REPLACE the decoder never throws
CharacterCodingException, making the catch branch unreachable.

Fix charset parameter name matching to require exact boundary

Substring search could match 'xcharset=' as 'charset=', allowing
a client-controlled decoy parameter to override the real charset.
Now requires the match to be at position 0 or preceded by ';' or ' '.

Restore required catch for checked CharacterCodingException

CharsetDecoder.decode(ByteBuffer) is declared to throw CharacterCodingException.
Even though CodingErrorAction.REPLACE makes that unreachable, the compiler
still requires the checked exception to be caught or declared.

Co-authored-by: devflow.devflow-routing-intake <devflow.devflow-routing-intake@kubernetes.us1.ddbuild.io>
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -88,6 +88,7 @@
 /dd-trace-api/src/main/java/datadog/trace/api/aiguard/          @DataDog/asm-java
 /dd-trace-api/src/main/java/datadog/trace/api/EventTracker.java @DataDog/asm-java
 /internal-api/src/main/java/datadog/trace/api/gateway/          @DataDog/asm-java
+/internal-api/src/main/java/datadog/trace/api/http/             @DataDog/asm-java
 **/appsec/                                                      @DataDog/asm-java
 **/*CallSite*.java                                              @DataDog/asm-java
 **/*CallSite*.groovy                                            @DataDog/asm-java
diff --git a/dd-java-agent/instrumentation/commons-fileupload-1.5/src/main/java/datadog/trace/instrumentation/commons/fileupload/FileItemContentReader.java b/dd-java-agent/instrumentation/commons-fileupload-1.5/src/main/java/datadog/trace/instrumentation/commons/fileupload/FileItemContentReader.java
@@ -1,8 +1,8 @@
 package datadog.trace.instrumentation.commons.fileupload;
 
+import datadog.trace.api.http.MultipartContentDecoder;
 import java.io.IOException;
 import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;
 import org.apache.commons.fileupload.FileItem;
@@ -35,7 +35,7 @@ public static String readContent(FileItem fileItem) {
           && (n = is.read(buf, total, MAX_CONTENT_BYTES - total)) != -1) {
         total += n;
       }
-      return new String(buf, 0, total, StandardCharsets.ISO_8859_1);
+      return MultipartContentDecoder.decodeBytes(buf, total, fileItem.getContentType());
     } catch (IOException ignored) {
       return "";
     }
diff --git a/dd-java-agent/instrumentation/commons-fileupload-1.5/src/test/groovy/FileItemContentReaderTest.groovy b/dd-java-agent/instrumentation/commons-fileupload-1.5/src/test/groovy/FileItemContentReaderTest.groovy
@@ -39,6 +39,15 @@ class FileItemContentReaderTest extends Specification {
     FileItemContentReader.readContent(item) == ''
   }
 
+  void 'readContent uses Content-Type from file item for charset decoding'() {
+    given:
+    def text = 'héllo wörld'
+    def item = fileItemFromBytes(text.getBytes('UTF-8'), 'file.txt', 'text/plain; charset=UTF-8')
+
+    expect:
+    FileItemContentReader.readContent(item) == text
+  }
+
   void 'readContents returns content for each non-form file with a name'() {
     given:
     def items = [fileItem('content-a', 'file-a.txt'), fileItem('content-b', 'file-b.txt'),]
@@ -101,10 +110,19 @@ class FileItemContentReaderTest extends Specification {
   }
 
   private FileItem fileItem(String content, String name) {
+    fileItem(content, name, null)
+  }
+
+  private FileItem fileItem(String content, String name, String contentType) {
+    fileItemFromBytes((content ?: '').getBytes('ISO-8859-1'), name, contentType)
+  }
+
+  private FileItem fileItemFromBytes(byte[] bytes, String name, String contentType) {
     FileItem item = Stub(FileItem)
     item.isFormField() >> false
     item.getName() >> name
-    item.getInputStream() >> new ByteArrayInputStream((content ?: '').getBytes('ISO-8859-1'))
+    item.getContentType() >> contentType
+    item.getInputStream() >> new ByteArrayInputStream(bytes)
     return item
   }
 }
diff --git a/internal-api/src/main/java/datadog/trace/api/http/MultipartContentDecoder.java b/internal-api/src/main/java/datadog/trace/api/http/MultipartContentDecoder.java
@@ -0,0 +1,74 @@
+package datadog.trace.api.http;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CodingErrorAction;
+
+/** Decodes multipart file content bytes to String using the per-part Content-Type charset. */
+public final class MultipartContentDecoder {
+
+  public static String decodeBytes(byte[] buf, int length, String contentType) {
+    Charset charset = extractCharset(contentType);
+    if (charset == null) charset = Charset.defaultCharset();
+    try {
+      return charset
+          .newDecoder()
+          .onMalformedInput(CodingErrorAction.REPLACE)
+          .onUnmappableCharacter(CodingErrorAction.REPLACE)
+          .decode(ByteBuffer.wrap(buf, 0, length))
+          .toString();
+    } catch (CharacterCodingException e) {
+      // unreachable: CodingErrorAction.REPLACE never throws CharacterCodingException
+      throw new IllegalStateException(e);
+    }
+  }
+
+  public static Charset extractCharset(String contentType) {
+    if (contentType == null) return null;
+    int searchFrom = 0;
+    while (true) {
+      int idx = indexOfIgnoreAsciiCase(contentType, "charset=", searchFrom);
+      if (idx < 0) return null;
+      // Require a parameter boundary before "charset=" so "xcharset=..." is not matched
+      if (idx == 0 || contentType.charAt(idx - 1) == ';' || contentType.charAt(idx - 1) == ' ') {
+        int nameStart = idx + 8;
+        int end = contentType.length();
+        for (int i = nameStart; i < end; i++) {
+          char c = contentType.charAt(i);
+          if (c == ';' || c == ',' || c == ' ') {
+            end = i;
+            break;
+          }
+        }
+        String name = contentType.substring(nameStart, end).trim();
+        if (name.length() > 1 && name.charAt(0) == '"' && name.charAt(name.length() - 1) == '"') {
+          name = name.substring(1, name.length() - 1);
+        }
+        try {
+          return Charset.forName(name);
+        } catch (IllegalArgumentException e) {
+          return null;
+        }
+      }
+      searchFrom = idx + 1;
+    }
+  }
+
+  private static int indexOfIgnoreAsciiCase(String s, String needle, int fromIndex) {
+    int sLen = s.length();
+    int nLen = needle.length();
+    outer:
+    for (int i = fromIndex, max = sLen - nLen; i <= max; i++) {
+      for (int j = 0; j < nLen; j++) {
+        if (Character.toLowerCase(s.charAt(i + j)) != needle.charAt(j)) {
+          continue outer;
+        }
+      }
+      return i;
+    }
+    return -1;
+  }
+
+  private MultipartContentDecoder() {}
+}
diff --git a/internal-api/src/test/java/datadog/trace/api/http/MultipartContentDecoderTest.java b/internal-api/src/test/java/datadog/trace/api/http/MultipartContentDecoderTest.java
@@ -0,0 +1,151 @@
+package datadog.trace.api.http;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.nio.charset.StandardCharsets;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.CsvSource;
+import org.junit.jupiter.params.provider.NullAndEmptySource;
+import org.junit.jupiter.params.provider.ValueSource;
+
+public class MultipartContentDecoderTest {
+
+  @Test
+  void decodeBytesUsesDeclaredUtf8Charset() {
+    String text = "héllo wörld";
+    byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
+    assertEquals(
+        text,
+        MultipartContentDecoder.decodeBytes(bytes, bytes.length, "text/plain; charset=UTF-8"));
+  }
+
+  @Test
+  void decodeBytesUsesDeclaredIso88591Charset() {
+    String text = "café";
+    byte[] bytes = text.getBytes(StandardCharsets.ISO_8859_1);
+    assertEquals(
+        text,
+        MultipartContentDecoder.decodeBytes(bytes, bytes.length, "text/plain; charset=ISO-8859-1"));
+  }
+
+  @Test
+  void decodeBytesDefaultsToMachineDefaultWhenNoCharset() {
+    String text = "hello world";
+    byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
+    assertEquals(text, MultipartContentDecoder.decodeBytes(bytes, bytes.length, "text/plain"));
+  }
+
+  @Test
+  void decodeBytesDefaultsToMachineDefaultWhenNullContentType() {
+    String text = "hello world";
+    byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
+    assertEquals(text, MultipartContentDecoder.decodeBytes(bytes, bytes.length, null));
+  }
+
+  @Test
+  void decodeBytesRespectsLengthParameter() {
+    byte[] bytes = "hello world".getBytes(StandardCharsets.UTF_8);
+    assertEquals("hello", MultipartContentDecoder.decodeBytes(bytes, 5, null));
+  }
+
+  @Test
+  void decodeBytesReturnsEmptyStringForZeroLength() {
+    assertEquals("", MultipartContentDecoder.decodeBytes(new byte[16], 0, null));
+  }
+
+  @Test
+  void decodeBytesReplacesMalformedBytesWithReplacementCharacterUsingDeclaredCharset() {
+    // 0xE9 (ISO-8859-1 'é') is not valid UTF-8; REPLACE substitutes U+FFFD
+    byte[] bytes = "café".getBytes(StandardCharsets.ISO_8859_1);
+    assertEquals(
+        "caf�",
+        MultipartContentDecoder.decodeBytes(bytes, bytes.length, "text/plain; charset=UTF-8"));
+  }
+
+  @Test
+  void decodeBytesHandlesTruncationAtMultibyteCharacterBoundary() {
+    // "€" encodes as 3 bytes in UTF-8: E2 82 AC
+    byte[] complete = "hello€".getBytes(StandardCharsets.UTF_8); // 8 bytes
+    // Pass only 6 bytes: "hello" + first byte of "€" (incomplete sequence)
+    String result = MultipartContentDecoder.decodeBytes(complete, 6, "text/plain; charset=UTF-8");
+    // Incomplete sequence → U+FFFD with declared charset, not fallback to JVM default
+    assertEquals("hello�", result);
+  }
+
+  @ParameterizedTest
+  @NullAndEmptySource
+  void extractCharsetReturnsNullForNullOrEmptyContentType(String contentType) {
+    assertNull(MultipartContentDecoder.extractCharset(contentType));
+  }
+
+  @ParameterizedTest
+  @ValueSource(strings = {"text/plain", "image/jpeg", "application/octet-stream"})
+  void extractCharsetReturnsNullForContentTypeWithoutCharset(String contentType) {
+    assertNull(MultipartContentDecoder.extractCharset(contentType));
+  }
+
+  @Test
+  void extractCharsetReturnsNullForInvalidCharsetName() {
+    assertNull(MultipartContentDecoder.extractCharset("text/plain; charset=NOTACHARSET"));
+  }
+
+  @ParameterizedTest
+  @ValueSource(
+      strings = {
+        "text/plain; CHARSET=UTF-8",
+        "text/plain; Charset=UTF-8",
+        "text/plain; charset=utf-8"
+      })
+  void extractCharsetIsCaseInsensitive(String contentType) {
+    assertEquals("UTF-8", MultipartContentDecoder.extractCharset(contentType).name());
+  }
+
+  @ParameterizedTest
+  @CsvSource({"text/plain; charset=UTF-8, UTF-8", "text/xml; charset=ISO-8859-1, ISO-8859-1"})
+  void extractCharsetFromStandardContentType(String contentType, String expectedCharset) {
+    assertEquals(expectedCharset, MultipartContentDecoder.extractCharset(contentType).name());
+  }
+
+  @Test
+  void extractCharsetIgnoresSubstringMatchInParameterName() {
+    // "xcharset=UTF-16" must not match; the real "charset=UTF-8" that follows must be used
+    assertEquals(
+        "UTF-8",
+        MultipartContentDecoder.extractCharset("text/plain; xcharset=UTF-16; charset=UTF-8")
+            .name());
+  }
+
+  @Test
+  void extractCharsetReturnsNullWhenOnlySubstringMatchExists() {
+    assertNull(MultipartContentDecoder.extractCharset("text/plain; xcharset=UTF-8"));
+  }
+
+  @Test
+  void extractCharsetHandlesAdditionalParameters() {
+    assertEquals(
+        "UTF-8",
+        MultipartContentDecoder.extractCharset("text/plain; charset=UTF-8; boundary=something")
+            .name());
+  }
+
+  @ParameterizedTest
+  @CsvSource({
+    "text/plain; charset=\"UTF-8\", UTF-8",
+    "text/xml; charset=\"ISO-8859-1\", ISO-8859-1"
+  })
+  void extractCharsetHandlesQuotedCharsetValue(String contentType, String expectedCharset) {
+    assertEquals(expectedCharset, MultipartContentDecoder.extractCharset(contentType).name());
+  }
+
+  @Test
+  void decodeBytesUsesQuotedDeclaredCharset() {
+    String text = "café";
+    byte[] bytes = text.getBytes(StandardCharsets.ISO_8859_1);
+    assertEquals(
+        text,
+        MultipartContentDecoder.decodeBytes(
+            bytes, bytes.length, "text/plain; charset=\"ISO-8859-1\""));
+  }
+}