Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
045ea63
fix(appsec): use per-part charset for files_content in commons-fileup…
jandro996 Apr 27, 2026
733393c
fix(appsec): use CharsetDecoder with REPORT for charset fallback in F…
jandro996 Apr 27, 2026
4738eed
refactor(appsec): extract MultipartContentDecoder to internal-api for…
jandro996 Apr 27, 2026
21b3820
test(appsec): add missing corner cases to MultipartContentDecoderTest
jandro996 Apr 27, 2026
b7a1a33
test(appsec): trim redundant decoder tests from FileItemContentReader…
jandro996 Apr 27, 2026
e74778b
fix(appsec): use machine default charset as fallback in MultipartCont…
jandro996 Apr 28, 2026
55044ff
test(appsec): migrate MultipartContentDecoderTest from Groovy/Spock t…
jandro996 Apr 28, 2026
6f0a3ec
fix(appsec): strip surrounding quotes from charset parameter in Multi…
jandro996 Apr 28, 2026
beea424
fix(appsec): replace String#split with char loop in MultipartContentD…
jandro996 Apr 28, 2026
228e1c4
Merge branch 'master' into alejandro.gonzalez/APPSEC-61875-files-cont…
jandro996 Apr 28, 2026
aa1b0b6
refactor(appsec): avoid toLowerCase allocation in MultipartContentDec…
jandro996 Apr 28, 2026
fd0ed70
chore: add CODEOWNERS entry for internal-api/datadog/trace/api/http
jandro996 Apr 28, 2026
22536b1
Use REPLACE for malformed bytes so truncation preserves declared charset
jandro996 Apr 28, 2026
bbaf10e
Remove dead catch block from MultipartContentDecoder.decodeBytes
jandro996 Apr 28, 2026
5d471a6
Fix charset parameter name matching to require exact boundary
jandro996 Apr 28, 2026
33f9eb1
Restore required catch for checked CharacterCodingException
jandro996 Apr 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
/dd-trace-api/src/main/java/datadog/trace/api/aiguard/ @DataDog/asm-java
/dd-trace-api/src/main/java/datadog/trace/api/EventTracker.java @DataDog/asm-java
/internal-api/src/main/java/datadog/trace/api/gateway/ @DataDog/asm-java
/internal-api/src/main/java/datadog/trace/api/http/ @DataDog/asm-java
**/appsec/ @DataDog/asm-java
**/*CallSite*.java @DataDog/asm-java
**/*CallSite*.groovy @DataDog/asm-java
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package datadog.trace.instrumentation.commons.fileupload;

import datadog.trace.api.http.MultipartContentDecoder;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.fileupload.FileItem;
Expand Down Expand Up @@ -35,7 +35,7 @@ public static String readContent(FileItem fileItem) {
&& (n = is.read(buf, total, MAX_CONTENT_BYTES - total)) != -1) {
total += n;
}
return new String(buf, 0, total, StandardCharsets.ISO_8859_1);
return MultipartContentDecoder.decodeBytes(buf, total, fileItem.getContentType());
} catch (IOException ignored) {
return "";
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,15 @@ class FileItemContentReaderTest extends Specification {
FileItemContentReader.readContent(item) == ''
}

void 'readContent uses Content-Type from file item for charset decoding'() {
given:
def text = 'héllo wörld'
def item = fileItemFromBytes(text.getBytes('UTF-8'), 'file.txt', 'text/plain; charset=UTF-8')

expect:
FileItemContentReader.readContent(item) == text
}

void 'readContents returns content for each non-form file with a name'() {
given:
def items = [fileItem('content-a', 'file-a.txt'), fileItem('content-b', 'file-b.txt'),]
Expand Down Expand Up @@ -101,10 +110,19 @@ class FileItemContentReaderTest extends Specification {
}

private FileItem fileItem(String content, String name) {
fileItem(content, name, null)
}

private FileItem fileItem(String content, String name, String contentType) {
fileItemFromBytes((content ?: '').getBytes('ISO-8859-1'), name, contentType)
}

private FileItem fileItemFromBytes(byte[] bytes, String name, String contentType) {
FileItem item = Stub(FileItem)
item.isFormField() >> false
item.getName() >> name
item.getInputStream() >> new ByteArrayInputStream((content ?: '').getBytes('ISO-8859-1'))
item.getContentType() >> contentType
item.getInputStream() >> new ByteArrayInputStream(bytes)
return item
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package datadog.trace.api.http;

import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CodingErrorAction;

/** Decodes multipart file content bytes to String using the per-part Content-Type charset. */
public final class MultipartContentDecoder {

public static String decodeBytes(byte[] buf, int length, String contentType) {
Charset charset = extractCharset(contentType);
if (charset == null) charset = Charset.defaultCharset();
try {
return charset
.newDecoder()
.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE)
.decode(ByteBuffer.wrap(buf, 0, length))
.toString();
} catch (CharacterCodingException e) {
// unreachable: CodingErrorAction.REPLACE never throws CharacterCodingException
throw new IllegalStateException(e);
}
}

public static Charset extractCharset(String contentType) {
if (contentType == null) return null;
int searchFrom = 0;
while (true) {
int idx = indexOfIgnoreAsciiCase(contentType, "charset=", searchFrom);
if (idx < 0) return null;
// Require a parameter boundary before "charset=" so "xcharset=..." is not matched
if (idx == 0 || contentType.charAt(idx - 1) == ';' || contentType.charAt(idx - 1) == ' ') {
int nameStart = idx + 8;
int end = contentType.length();
for (int i = nameStart; i < end; i++) {
char c = contentType.charAt(i);
if (c == ';' || c == ',' || c == ' ') {
end = i;
break;
}
}
String name = contentType.substring(nameStart, end).trim();
if (name.length() > 1 && name.charAt(0) == '"' && name.charAt(name.length() - 1) == '"') {
name = name.substring(1, name.length() - 1);
}
try {
return Charset.forName(name);
} catch (IllegalArgumentException e) {
return null;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this risk a NullPointerException elsewhere?
Should we fallback to a default Charset instead?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right now it's only used in decodeBytes method, where that is done
if (charset == null) charset = Charset.defaultCharset();

Probably It makes more sense to return it where you mention, not sure if in the future it will be necessary to know exactly if we are not able to get the charset

I could change it in my next PR related with this topic if you want :)

}
}
searchFrom = idx + 1;
}
}

private static int indexOfIgnoreAsciiCase(String s, String needle, int fromIndex) {
int sLen = s.length();
int nLen = needle.length();
outer:
for (int i = fromIndex, max = sLen - nLen; i <= max; i++) {
for (int j = 0; j < nLen; j++) {
if (Character.toLowerCase(s.charAt(i + j)) != needle.charAt(j)) {
continue outer;
}
}
return i;
}
return -1;
}

private MultipartContentDecoder() {}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
package datadog.trace.api.http;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;

import java.nio.charset.StandardCharsets;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;
import org.junit.jupiter.params.provider.NullAndEmptySource;
import org.junit.jupiter.params.provider.ValueSource;

public class MultipartContentDecoderTest {

@Test
void decodeBytesUsesDeclaredUtf8Charset() {
String text = "héllo wörld";
byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
assertEquals(
text,
MultipartContentDecoder.decodeBytes(bytes, bytes.length, "text/plain; charset=UTF-8"));
}

@Test
void decodeBytesUsesDeclaredIso88591Charset() {
String text = "café";
byte[] bytes = text.getBytes(StandardCharsets.ISO_8859_1);
assertEquals(
text,
MultipartContentDecoder.decodeBytes(bytes, bytes.length, "text/plain; charset=ISO-8859-1"));
}

@Test
void decodeBytesDefaultsToMachineDefaultWhenNoCharset() {
String text = "hello world";
byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
assertEquals(text, MultipartContentDecoder.decodeBytes(bytes, bytes.length, "text/plain"));
}

@Test
void decodeBytesDefaultsToMachineDefaultWhenNullContentType() {
String text = "hello world";
byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
assertEquals(text, MultipartContentDecoder.decodeBytes(bytes, bytes.length, null));
}

@Test
void decodeBytesRespectsLengthParameter() {
byte[] bytes = "hello world".getBytes(StandardCharsets.UTF_8);
assertEquals("hello", MultipartContentDecoder.decodeBytes(bytes, 5, null));
}

@Test
void decodeBytesReturnsEmptyStringForZeroLength() {
assertEquals("", MultipartContentDecoder.decodeBytes(new byte[16], 0, null));
}

@Test
void decodeBytesReplacesMalformedBytesWithReplacementCharacterUsingDeclaredCharset() {
// 0xE9 (ISO-8859-1 'é') is not valid UTF-8; REPLACE substitutes U+FFFD
byte[] bytes = "café".getBytes(StandardCharsets.ISO_8859_1);
assertEquals(
"caf�",
MultipartContentDecoder.decodeBytes(bytes, bytes.length, "text/plain; charset=UTF-8"));
}

@Test
void decodeBytesHandlesTruncationAtMultibyteCharacterBoundary() {
// "€" encodes as 3 bytes in UTF-8: E2 82 AC
byte[] complete = "hello€".getBytes(StandardCharsets.UTF_8); // 8 bytes
// Pass only 6 bytes: "hello" + first byte of "€" (incomplete sequence)
String result = MultipartContentDecoder.decodeBytes(complete, 6, "text/plain; charset=UTF-8");
// Incomplete sequence → U+FFFD with declared charset, not fallback to JVM default
assertEquals("hello�", result);
}

@ParameterizedTest
@NullAndEmptySource
void extractCharsetReturnsNullForNullOrEmptyContentType(String contentType) {
assertNull(MultipartContentDecoder.extractCharset(contentType));
}

@ParameterizedTest
@ValueSource(strings = {"text/plain", "image/jpeg", "application/octet-stream"})
void extractCharsetReturnsNullForContentTypeWithoutCharset(String contentType) {
assertNull(MultipartContentDecoder.extractCharset(contentType));
}

@Test
void extractCharsetReturnsNullForInvalidCharsetName() {
assertNull(MultipartContentDecoder.extractCharset("text/plain; charset=NOTACHARSET"));
}

@ParameterizedTest
@ValueSource(
strings = {
"text/plain; CHARSET=UTF-8",
"text/plain; Charset=UTF-8",
"text/plain; charset=utf-8"
})
void extractCharsetIsCaseInsensitive(String contentType) {
assertEquals("UTF-8", MultipartContentDecoder.extractCharset(contentType).name());
}

@ParameterizedTest
@CsvSource({"text/plain; charset=UTF-8, UTF-8", "text/xml; charset=ISO-8859-1, ISO-8859-1"})
void extractCharsetFromStandardContentType(String contentType, String expectedCharset) {
assertEquals(expectedCharset, MultipartContentDecoder.extractCharset(contentType).name());
}

@Test
void extractCharsetIgnoresSubstringMatchInParameterName() {
// "xcharset=UTF-16" must not match; the real "charset=UTF-8" that follows must be used
assertEquals(
"UTF-8",
MultipartContentDecoder.extractCharset("text/plain; xcharset=UTF-16; charset=UTF-8")
.name());
}

@Test
void extractCharsetReturnsNullWhenOnlySubstringMatchExists() {
assertNull(MultipartContentDecoder.extractCharset("text/plain; xcharset=UTF-8"));
}

@Test
void extractCharsetHandlesAdditionalParameters() {
assertEquals(
"UTF-8",
MultipartContentDecoder.extractCharset("text/plain; charset=UTF-8; boundary=something")
.name());
}

@ParameterizedTest
@CsvSource({
"text/plain; charset=\"UTF-8\", UTF-8",
"text/xml; charset=\"ISO-8859-1\", ISO-8859-1"
})
void extractCharsetHandlesQuotedCharsetValue(String contentType, String expectedCharset) {
assertEquals(expectedCharset, MultipartContentDecoder.extractCharset(contentType).name());
}

@Test
void decodeBytesUsesQuotedDeclaredCharset() {
String text = "café";
byte[] bytes = text.getBytes(StandardCharsets.ISO_8859_1);
assertEquals(
text,
MultipartContentDecoder.decodeBytes(
bytes, bytes.length, "text/plain; charset=\"ISO-8859-1\""));
}
}
Loading