Skip to content

Commit 9d73760

Browse files
jandro996devflow.devflow-routing-intake
andauthored
Fix charset decoding for server.request.body.files_content in commons-fileupload (#11212)
fix(appsec): use per-part charset for files_content in commons-fileupload fix(appsec): use CharsetDecoder with REPORT for charset fallback in FileItemContentReader refactor(appsec): extract MultipartContentDecoder to internal-api for reuse across integrations test(appsec): add missing corner cases to MultipartContentDecoderTest test(appsec): trim redundant decoder tests from FileItemContentReaderTest Charset fallback scenarios are covered by MultipartContentDecoderTest. One integration test is kept to verify that getContentType() is passed through. fix(appsec): use machine default charset as fallback in MultipartContentDecoder Replaces hardcoded UTF-8 (no-charset default) and ISO-8859-1 (fallback) with Charset.defaultCharset() in both cases, per reviewer feedback. test(appsec): migrate MultipartContentDecoderTest from Groovy/Spock to Java/JUnit 5 fix(appsec): strip surrounding quotes from charset parameter in MultipartContentDecoder RFC 2045 allows quoted parameter values (charset="UTF-8"). Without stripping the quotes Charset.forName rejects the name and decodeBytes falls back to the JVM default instead of the declared charset. fix(appsec): replace String#split with char loop in MultipartContentDecoder String#split is forbidden (uses regex internally). Replace with an explicit char scan to find the first ; , or space after charset=. Merge branch 'master' into alejandro.gonzalez/APPSEC-61875-files-content-encoding refactor(appsec): avoid toLowerCase allocation in MultipartContentDecoder.extractCharset Replace toLowerCase(Locale.ROOT).indexOf with an inline ASCII case-insensitive scan to avoid allocating a full lowercase copy of the Content-Type string. Also use the already-computed end variable as the loop bound. chore: add CODEOWNERS entry for internal-api/datadog/trace/api/http All files in this package (StoredByteBody, StoredBodySupplier, MultipartContentDecoder, etc.) are AppSec HTTP body inspection infrastructure. Use REPLACE for malformed bytes so truncation preserves declared charset When FileItemContentReader truncates at MAX_CONTENT_BYTES a cut in the middle of a multibyte character no longer triggers the fallback path. REPLACE substitutes the incomplete sequence with U+FFFD using the declared charset; REPORT was throwing and silently switching to the JVM default charset for the whole string. Remove dead catch block from MultipartContentDecoder.decodeBytes With CodingErrorAction.REPLACE the decoder never throws CharacterCodingException, making the catch branch unreachable. Fix charset parameter name matching to require exact boundary Substring search could match 'xcharset=' as 'charset=', allowing a client-controlled decoy parameter to override the real charset. Now requires the match to be at position 0 or preceded by ';' or ' '. Restore required catch for checked CharacterCodingException CharsetDecoder.decode(ByteBuffer) declares throws CharacterCodingException even though CodingErrorAction.REPLACE makes it unreachable; the compiler still requires the exception to be caught or declared. Co-authored-by: devflow.devflow-routing-intake <devflow.devflow-routing-intake@kubernetes.us1.ddbuild.io>
1 parent 5609365 commit 9d73760

5 files changed

Lines changed: 247 additions & 3 deletions

File tree

.github/CODEOWNERS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@
8888
/dd-trace-api/src/main/java/datadog/trace/api/aiguard/ @DataDog/asm-java
8989
/dd-trace-api/src/main/java/datadog/trace/api/EventTracker.java @DataDog/asm-java
9090
/internal-api/src/main/java/datadog/trace/api/gateway/ @DataDog/asm-java
91+
/internal-api/src/main/java/datadog/trace/api/http/ @DataDog/asm-java
9192
**/appsec/ @DataDog/asm-java
9293
**/*CallSite*.java @DataDog/asm-java
9394
**/*CallSite*.groovy @DataDog/asm-java

dd-java-agent/instrumentation/commons-fileupload-1.5/src/main/java/datadog/trace/instrumentation/commons/fileupload/FileItemContentReader.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
package datadog.trace.instrumentation.commons.fileupload;
22

3+
import datadog.trace.api.http.MultipartContentDecoder;
34
import java.io.IOException;
45
import java.io.InputStream;
5-
import java.nio.charset.StandardCharsets;
66
import java.util.ArrayList;
77
import java.util.List;
88
import org.apache.commons.fileupload.FileItem;
@@ -35,7 +35,7 @@ public static String readContent(FileItem fileItem) {
3535
&& (n = is.read(buf, total, MAX_CONTENT_BYTES - total)) != -1) {
3636
total += n;
3737
}
38-
return new String(buf, 0, total, StandardCharsets.ISO_8859_1);
38+
return MultipartContentDecoder.decodeBytes(buf, total, fileItem.getContentType());
3939
} catch (IOException ignored) {
4040
return "";
4141
}

dd-java-agent/instrumentation/commons-fileupload-1.5/src/test/groovy/FileItemContentReaderTest.groovy

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,15 @@ class FileItemContentReaderTest extends Specification {
3939
FileItemContentReader.readContent(item) == ''
4040
}
4141

42+
void 'readContent uses Content-Type from file item for charset decoding'() {
43+
given:
44+
def text = 'héllo wörld'
45+
def item = fileItemFromBytes(text.getBytes('UTF-8'), 'file.txt', 'text/plain; charset=UTF-8')
46+
47+
expect:
48+
FileItemContentReader.readContent(item) == text
49+
}
50+
4251
void 'readContents returns content for each non-form file with a name'() {
4352
given:
4453
def items = [fileItem('content-a', 'file-a.txt'), fileItem('content-b', 'file-b.txt'),]
@@ -101,10 +110,19 @@ class FileItemContentReaderTest extends Specification {
101110
}
102111

103112
private FileItem fileItem(String content, String name) {
113+
fileItem(content, name, null)
114+
}
115+
116+
private FileItem fileItem(String content, String name, String contentType) {
117+
fileItemFromBytes((content ?: '').getBytes('ISO-8859-1'), name, contentType)
118+
}
119+
120+
private FileItem fileItemFromBytes(byte[] bytes, String name, String contentType) {
104121
FileItem item = Stub(FileItem)
105122
item.isFormField() >> false
106123
item.getName() >> name
107-
item.getInputStream() >> new ByteArrayInputStream((content ?: '').getBytes('ISO-8859-1'))
124+
item.getContentType() >> contentType
125+
item.getInputStream() >> new ByteArrayInputStream(bytes)
108126
return item
109127
}
110128
}
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
package datadog.trace.api.http;
2+
3+
import java.nio.ByteBuffer;
4+
import java.nio.charset.CharacterCodingException;
5+
import java.nio.charset.Charset;
6+
import java.nio.charset.CodingErrorAction;
7+
8+
/** Decodes multipart file content bytes to String using the per-part Content-Type charset. */
9+
public final class MultipartContentDecoder {
10+
11+
public static String decodeBytes(byte[] buf, int length, String contentType) {
12+
Charset charset = extractCharset(contentType);
13+
if (charset == null) charset = Charset.defaultCharset();
14+
try {
15+
return charset
16+
.newDecoder()
17+
.onMalformedInput(CodingErrorAction.REPLACE)
18+
.onUnmappableCharacter(CodingErrorAction.REPLACE)
19+
.decode(ByteBuffer.wrap(buf, 0, length))
20+
.toString();
21+
} catch (CharacterCodingException e) {
22+
// unreachable: CodingErrorAction.REPLACE never throws CharacterCodingException
23+
throw new IllegalStateException(e);
24+
}
25+
}
26+
27+
public static Charset extractCharset(String contentType) {
28+
if (contentType == null) return null;
29+
int searchFrom = 0;
30+
while (true) {
31+
int idx = indexOfIgnoreAsciiCase(contentType, "charset=", searchFrom);
32+
if (idx < 0) return null;
33+
// Require a parameter boundary before "charset=" so "xcharset=..." is not matched
34+
if (idx == 0 || contentType.charAt(idx - 1) == ';' || contentType.charAt(idx - 1) == ' ') {
35+
int nameStart = idx + 8;
36+
int end = contentType.length();
37+
for (int i = nameStart; i < end; i++) {
38+
char c = contentType.charAt(i);
39+
if (c == ';' || c == ',' || c == ' ') {
40+
end = i;
41+
break;
42+
}
43+
}
44+
String name = contentType.substring(nameStart, end).trim();
45+
if (name.length() > 1 && name.charAt(0) == '"' && name.charAt(name.length() - 1) == '"') {
46+
name = name.substring(1, name.length() - 1);
47+
}
48+
try {
49+
return Charset.forName(name);
50+
} catch (IllegalArgumentException e) {
51+
return null;
52+
}
53+
}
54+
searchFrom = idx + 1;
55+
}
56+
}
57+
58+
private static int indexOfIgnoreAsciiCase(String s, String needle, int fromIndex) {
59+
int sLen = s.length();
60+
int nLen = needle.length();
61+
outer:
62+
for (int i = fromIndex, max = sLen - nLen; i <= max; i++) {
63+
for (int j = 0; j < nLen; j++) {
64+
if (Character.toLowerCase(s.charAt(i + j)) != needle.charAt(j)) {
65+
continue outer;
66+
}
67+
}
68+
return i;
69+
}
70+
return -1;
71+
}
72+
73+
private MultipartContentDecoder() {}
74+
}
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
package datadog.trace.api.http;
2+
3+
import static org.junit.jupiter.api.Assertions.assertEquals;
4+
import static org.junit.jupiter.api.Assertions.assertNull;
5+
6+
import java.nio.charset.StandardCharsets;
7+
import org.junit.jupiter.api.Test;
8+
import org.junit.jupiter.params.ParameterizedTest;
9+
import org.junit.jupiter.params.provider.CsvSource;
10+
import org.junit.jupiter.params.provider.NullAndEmptySource;
11+
import org.junit.jupiter.params.provider.ValueSource;
12+
13+
public class MultipartContentDecoderTest {
14+
15+
@Test
16+
void decodeBytesUsesDeclaredUtf8Charset() {
17+
String text = "héllo wörld";
18+
byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
19+
assertEquals(
20+
text,
21+
MultipartContentDecoder.decodeBytes(bytes, bytes.length, "text/plain; charset=UTF-8"));
22+
}
23+
24+
@Test
25+
void decodeBytesUsesDeclaredIso88591Charset() {
26+
String text = "café";
27+
byte[] bytes = text.getBytes(StandardCharsets.ISO_8859_1);
28+
assertEquals(
29+
text,
30+
MultipartContentDecoder.decodeBytes(bytes, bytes.length, "text/plain; charset=ISO-8859-1"));
31+
}
32+
33+
@Test
34+
void decodeBytesDefaultsToMachineDefaultWhenNoCharset() {
35+
String text = "hello world";
36+
byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
37+
assertEquals(text, MultipartContentDecoder.decodeBytes(bytes, bytes.length, "text/plain"));
38+
}
39+
40+
@Test
41+
void decodeBytesDefaultsToMachineDefaultWhenNullContentType() {
42+
String text = "hello world";
43+
byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
44+
assertEquals(text, MultipartContentDecoder.decodeBytes(bytes, bytes.length, null));
45+
}
46+
47+
@Test
48+
void decodeBytesRespectsLengthParameter() {
49+
byte[] bytes = "hello world".getBytes(StandardCharsets.UTF_8);
50+
assertEquals("hello", MultipartContentDecoder.decodeBytes(bytes, 5, null));
51+
}
52+
53+
@Test
54+
void decodeBytesReturnsEmptyStringForZeroLength() {
55+
assertEquals("", MultipartContentDecoder.decodeBytes(new byte[16], 0, null));
56+
}
57+
58+
@Test
59+
void decodeBytesReplacesMalformedBytesWithReplacementCharacterUsingDeclaredCharset() {
60+
// 0xE9 (ISO-8859-1 'é') is not valid UTF-8; REPLACE substitutes U+FFFD
61+
byte[] bytes = "café".getBytes(StandardCharsets.ISO_8859_1);
62+
assertEquals(
63+
"caf�",
64+
MultipartContentDecoder.decodeBytes(bytes, bytes.length, "text/plain; charset=UTF-8"));
65+
}
66+
67+
@Test
68+
void decodeBytesHandlesTruncationAtMultibyteCharacterBoundary() {
69+
// "€" encodes as 3 bytes in UTF-8: E2 82 AC
70+
byte[] complete = "hello€".getBytes(StandardCharsets.UTF_8); // 8 bytes
71+
// Pass only 6 bytes: "hello" + first byte of "€" (incomplete sequence)
72+
String result = MultipartContentDecoder.decodeBytes(complete, 6, "text/plain; charset=UTF-8");
73+
// Incomplete sequence → U+FFFD with declared charset, not fallback to JVM default
74+
assertEquals("hello�", result);
75+
}
76+
77+
@ParameterizedTest
78+
@NullAndEmptySource
79+
void extractCharsetReturnsNullForNullOrEmptyContentType(String contentType) {
80+
assertNull(MultipartContentDecoder.extractCharset(contentType));
81+
}
82+
83+
@ParameterizedTest
84+
@ValueSource(strings = {"text/plain", "image/jpeg", "application/octet-stream"})
85+
void extractCharsetReturnsNullForContentTypeWithoutCharset(String contentType) {
86+
assertNull(MultipartContentDecoder.extractCharset(contentType));
87+
}
88+
89+
@Test
90+
void extractCharsetReturnsNullForInvalidCharsetName() {
91+
assertNull(MultipartContentDecoder.extractCharset("text/plain; charset=NOTACHARSET"));
92+
}
93+
94+
@ParameterizedTest
95+
@ValueSource(
96+
strings = {
97+
"text/plain; CHARSET=UTF-8",
98+
"text/plain; Charset=UTF-8",
99+
"text/plain; charset=utf-8"
100+
})
101+
void extractCharsetIsCaseInsensitive(String contentType) {
102+
assertEquals("UTF-8", MultipartContentDecoder.extractCharset(contentType).name());
103+
}
104+
105+
@ParameterizedTest
106+
@CsvSource({"text/plain; charset=UTF-8, UTF-8", "text/xml; charset=ISO-8859-1, ISO-8859-1"})
107+
void extractCharsetFromStandardContentType(String contentType, String expectedCharset) {
108+
assertEquals(expectedCharset, MultipartContentDecoder.extractCharset(contentType).name());
109+
}
110+
111+
@Test
112+
void extractCharsetIgnoresSubstringMatchInParameterName() {
113+
// "xcharset=UTF-16" must not match; the real "charset=UTF-8" that follows must be used
114+
assertEquals(
115+
"UTF-8",
116+
MultipartContentDecoder.extractCharset("text/plain; xcharset=UTF-16; charset=UTF-8")
117+
.name());
118+
}
119+
120+
@Test
121+
void extractCharsetReturnsNullWhenOnlySubstringMatchExists() {
122+
assertNull(MultipartContentDecoder.extractCharset("text/plain; xcharset=UTF-8"));
123+
}
124+
125+
@Test
126+
void extractCharsetHandlesAdditionalParameters() {
127+
assertEquals(
128+
"UTF-8",
129+
MultipartContentDecoder.extractCharset("text/plain; charset=UTF-8; boundary=something")
130+
.name());
131+
}
132+
133+
@ParameterizedTest
134+
@CsvSource({
135+
"text/plain; charset=\"UTF-8\", UTF-8",
136+
"text/xml; charset=\"ISO-8859-1\", ISO-8859-1"
137+
})
138+
void extractCharsetHandlesQuotedCharsetValue(String contentType, String expectedCharset) {
139+
assertEquals(expectedCharset, MultipartContentDecoder.extractCharset(contentType).name());
140+
}
141+
142+
@Test
143+
void decodeBytesUsesQuotedDeclaredCharset() {
144+
String text = "café";
145+
byte[] bytes = text.getBytes(StandardCharsets.ISO_8859_1);
146+
assertEquals(
147+
text,
148+
MultipartContentDecoder.decodeBytes(
149+
bytes, bytes.length, "text/plain; charset=\"ISO-8859-1\""));
150+
}
151+
}

0 commit comments

Comments
 (0)