DataDog · gh-worker-dd-mergequeue-cf854d · Apr 28, 2026 · Apr 27, 2026 · Apr 27, 2026 · Apr 27, 2026
@@ -1,8 +1,8 @@
 package datadog.trace.instrumentation.commons.fileupload;
 
+import datadog.trace.api.http.MultipartContentDecoder;
 import java.io.IOException;
 import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;
 import org.apache.commons.fileupload.FileItem;
@@ -35,7 +35,7 @@ public static String readContent(FileItem fileItem) {
           && (n = is.read(buf, total, MAX_CONTENT_BYTES - total)) != -1) {
         total += n;
       }
-      return new String(buf, 0, total, StandardCharsets.ISO_8859_1);
+      return MultipartContentDecoder.decodeBytes(buf, total, fileItem.getContentType());
     } catch (IOException ignored) {
       return "";
     }

@@ -39,6 +39,15 @@ class FileItemContentReaderTest extends Specification {
     FileItemContentReader.readContent(item) == ''
   }
 
+  void 'readContent uses Content-Type from file item for charset decoding'() {
+    given: 'a UTF-8 encoded part whose Content-Type declares that charset'
+    String expected = 'héllo wörld'
+    FileItem utf8Item = fileItemFromBytes(expected.getBytes('UTF-8'), 'file.txt', 'text/plain; charset=UTF-8')
+
+    expect: 'the non-ASCII characters come back unchanged'
+    FileItemContentReader.readContent(utf8Item) == expected
+  }
+
   void 'readContents returns content for each non-form file with a name'() {
     given:
     def items = [fileItem('content-a', 'file-a.txt'), fileItem('content-b', 'file-b.txt'),]
@@ -101,10 +110,19 @@ class FileItemContentReaderTest extends Specification {
   }
 
   private FileItem fileItem(String content, String name) {
+    fileItem(content, name, null) // delegate with no Content-Type declared
+  }
+
+  private FileItem fileItem(String content, String name, String contentType) {
+    fileItemFromBytes((content ?: '').getBytes('ISO-8859-1'), name, contentType) // null content becomes an empty body
+  }
+
+  private FileItem fileItemFromBytes(byte[] bytes, String name, String contentType) { // non-form-field FileItem stub over raw bytes
     FileItem item = Stub(FileItem)
     item.isFormField() >> false
     item.getName() >> name
-    item.getInputStream() >> new ByteArrayInputStream((content ?: '').getBytes('ISO-8859-1'))
+    item.getContentType() >> contentType // null when the caller declares no Content-Type
+    item.getInputStream() >> new ByteArrayInputStream(bytes) // stream yields exactly the given bytes
     return item
   }
 }
@@ -0,0 +1,89 @@
+package datadog.trace.api.http;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
+import java.util.Locale;
+
+/** Decodes multipart file content bytes to String using the per-part Content-Type charset. */
+public final class MultipartContentDecoder {
+
+  private static final String CHARSET_PARAM = "charset=";
+
+  /**
+   * Decodes the first {@code length} bytes of {@code buf} to a String.
+   *
+   * <p>The charset extracted from {@code contentType} is used, or UTF-8 when none is extracted.
+   * Decoding is strict: if the bytes are malformed or unmappable in that charset, the same bytes
+   * are decoded as ISO-8859-1 instead.
+   *
+   * @param buf buffer holding the content bytes
+   * @param length number of bytes to decode, counted from index 0
+   * @param contentType Content-Type value of the part, may be {@code null}
+   * @return the decoded text
+   */
+  public static String decodeBytes(byte[] buf, int length, String contentType) {
+    Charset charset = extractCharset(contentType);
+    if (charset == null) {
+      charset = StandardCharsets.UTF_8;
+    }
+    try {
+      return charset
+          .newDecoder()
+          .onMalformedInput(CodingErrorAction.REPORT)
+          .onUnmappableCharacter(CodingErrorAction.REPORT)
+          .decode(ByteBuffer.wrap(buf, 0, length))
+          .toString();
+    } catch (CharacterCodingException e) {
+      // REPORT makes bad input throw instead of being replaced; retry with ISO-8859-1.
+      return new String(buf, 0, length, StandardCharsets.ISO_8859_1);
+    }
+  }
+
+  /**
+   * Extracts the charset named by the {@code charset=} parameter of a Content-Type value.
+   *
+   * <p>The parameter name is matched case-insensitively and the value ends at the first {@code ;},
+   * {@code ,} or space. A value wrapped in double quotes is unquoted.
+   *
+   * @param contentType Content-Type value, may be {@code null}
+   * @return the charset, or {@code null} if there is no charset parameter, its value is empty, or
+   *     the name is illegal or not supported by this JVM
+   */
+  public static Charset extractCharset(String contentType) {
+    if (contentType == null) {
+      return null;
+    }
+    // Search and slice the same lower-cased copy so the indexes always line up; charset names
+    // are not case-sensitive, so nothing is lost.
+    String lower = contentType.toLowerCase(Locale.ROOT);
+    int idx = lower.indexOf(CHARSET_PARAM);
+    if (idx < 0) {
+      return null;
+    }
+    int start = idx + CHARSET_PARAM.length();
+    int end = start;
+    // Manual scan instead of split(...)[0]: split returns an empty array for values such as
+    // "charset=;" and indexing it would throw ArrayIndexOutOfBoundsException.
+    while (end < lower.length() && ";, ".indexOf(lower.charAt(end)) < 0) {
+      end++;
+    }
+    String name = lower.substring(start, end).trim();
+    if (name.length() >= 2 && name.charAt(0) == '"' && name.charAt(name.length() - 1) == '"') {
+      name = name.substring(1, name.length() - 1).trim();
+    }
+    if (name.isEmpty()) {
+      return null;
+    }
+    try {
+      return Charset.forName(name);
+    } catch (IllegalArgumentException e) {
+      // Covers both illegal and unsupported charset names.
+      return null;
+    }
+  }
+
+  private MultipartContentDecoder() {}
+}
diff --git a/internal-api/src/test/groovy/datadog/trace/api/http/MultipartContentDecoderTest.groovy b/internal-api/src/test/groovy/datadog/trace/api/http/MultipartContentDecoderTest.groovy
@@ -0,0 +1,116 @@
+package datadog.trace.api.http
+
+import spock.lang.Specification
+
+/** Tests for {@link MultipartContentDecoder}: charset extraction and byte decoding with fallbacks. */
+class MultipartContentDecoderTest extends Specification {
+
+  void 'decodeBytes uses declared UTF-8 charset'() {
+    given:
+    def text = 'héllo wörld'
+    byte[] bytes = text.getBytes('UTF-8')
+
+    expect:
+    MultipartContentDecoder.decodeBytes(bytes, bytes.length, 'text/plain; charset=UTF-8') == text
+  }
+
+  void 'decodeBytes falls back to UTF-8 when Content-Type has no charset'() {
+    given:
+    def text = 'hello world'
+    byte[] bytes = text.getBytes('UTF-8')
+
+    expect:
+    MultipartContentDecoder.decodeBytes(bytes, bytes.length, 'text/plain') == text
+  }
+
+  void 'decodeBytes falls back to UTF-8 when Content-Type is null'() {
+    given:
+    def text = 'hello world'
+    byte[] bytes = text.getBytes('UTF-8')
+
+    expect:
+    MultipartContentDecoder.decodeBytes(bytes, bytes.length, null) == text
+  }
+
+  void 'decodeBytes falls back to ISO-8859-1 when bytes are invalid for the default UTF-8 charset'() {
+    given:
+    // no Content-Type is passed, so UTF-8 is tried first; 0xE9 is 'é' in ISO-8859-1 but an invalid lone UTF-8 byte
+    byte[] bytes = 'café'.getBytes('ISO-8859-1')
+
+    expect:
+    MultipartContentDecoder.decodeBytes(bytes, bytes.length, null) == 'café'
+  }
+
+  void 'decodeBytes uses declared ISO-8859-1 charset'() {
+    given:
+    def text = 'café'
+    byte[] bytes = text.getBytes('ISO-8859-1')
+
+    expect:
+    MultipartContentDecoder.decodeBytes(bytes, bytes.length, 'text/plain; charset=ISO-8859-1') == text
+  }
+
+  void 'decodeBytes respects length parameter'() {
+    given:
+    // the buffer holds more bytes than the length passed below
+    byte[] bytes = 'hello world'.getBytes('UTF-8')
+
+    expect:
+    MultipartContentDecoder.decodeBytes(bytes, 5, null) == 'hello'
+  }
+
+  void 'decodeBytes returns empty string for zero length'() {
+    expect:
+    // a non-empty buffer with length 0 must not contribute any characters
+    MultipartContentDecoder.decodeBytes(new byte[16], 0, null) == ''
+  }
+
+  void 'decodeBytes falls back to ISO-8859-1 when declared charset cannot decode the bytes'() {
+    given:
+    // bytes are ISO-8859-1 encoded but Content-Type explicitly declares UTF-8
+    byte[] bytes = 'café'.getBytes('ISO-8859-1')
+
+    expect:
+    MultipartContentDecoder.decodeBytes(bytes, bytes.length, 'text/plain; charset=UTF-8') == 'café'
+  }
+
+  void 'extractCharset returns null for null contentType'() {
+    expect:
+    MultipartContentDecoder.extractCharset(null) == null
+  }
+
+  void 'extractCharset returns null for empty contentType'() {
+    expect:
+    MultipartContentDecoder.extractCharset('') == null
+  }
+
+  void 'extractCharset returns null for contentType without charset'() {
+    expect:
+    MultipartContentDecoder.extractCharset('text/plain') == null
+    MultipartContentDecoder.extractCharset('image/jpeg') == null
+    MultipartContentDecoder.extractCharset('application/octet-stream') == null
+  }
+
+  void 'extractCharset returns null for invalid charset name'() {
+    expect:
+    MultipartContentDecoder.extractCharset('text/plain; charset=NOTACHARSET') == null
+  }
+
+  void 'extractCharset extracts charset case-insensitively'() {
+    expect:
+    MultipartContentDecoder.extractCharset('text/plain; CHARSET=UTF-8').name() == 'UTF-8'
+    MultipartContentDecoder.extractCharset('text/plain; Charset=UTF-8').name() == 'UTF-8'
+    MultipartContentDecoder.extractCharset('text/plain; charset=utf-8').name() == 'UTF-8'
+  }
+
+  void 'extractCharset extracts charset from standard Content-Type'() {
+    expect:
+    MultipartContentDecoder.extractCharset('text/plain; charset=UTF-8').name() == 'UTF-8'
+    MultipartContentDecoder.extractCharset('text/xml; charset=ISO-8859-1').name() == 'ISO-8859-1'
+  }
+
+  void 'extractCharset extracts charset when followed by additional parameters'() {
+    expect:
+    MultipartContentDecoder.extractCharset('text/plain; charset=UTF-8; boundary=something').name() == 'UTF-8'
+  }
+}