Skip to content

Commit 7ed9344

Browse files
perf: improve performance of regexps in IAST and query obfuscator (#11710)
Migrate the IAST evidence-redaction regexps to RE2/J for linear-time matching. RE2/J has no back-references, so the SQL tokenizer is reworked to find Postgres dollar-quoted literals via a precomputed tag index (binary search) and to enumerate Oracle q'...' delimiters explicitly instead of relying on a back-reference. Configured redaction patterns that are valid under java.util.regex but unsupported by RE2/J fall back to the defaults instead of failing to compile. Replace the query obfuscator's `while (matcher.find())` + per-match `Strings.replace` loop (O(N*Q)) with a single appendReplacement / appendTail pass (O(Q)). Add JUnit 5 tests for the tokenizers and the obfuscator, a tokenizer JMH benchmark, and migrate SensitiveHandlerTest from Groovy to JUnit 5. (cherry picked from commit 92ebc2a) Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent a5f984d commit 7ed9344

23 files changed

Lines changed: 949 additions & 105 deletions

File tree

dd-java-agent/agent-iast/build.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ dependencies {
4545
implementation libs.moshi
4646
implementation libs.bundles.asm
4747
implementation libs.instrument.java
48+
implementation libs.re2j
4849

4950
testImplementation project(':utils:test-utils')
5051
testImplementation project(':dd-java-agent:agent-bootstrap')
Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
package com.datadog.iast.sensitive;
2+
3+
import static datadog.trace.api.iast.sink.SqlInjectionModule.DATABASE_PARAMETER;
4+
import static java.util.concurrent.TimeUnit.MICROSECONDS;
5+
import static java.util.concurrent.TimeUnit.MILLISECONDS;
6+
7+
import com.datadog.iast.model.Evidence;
8+
import com.datadog.iast.sensitive.SensitiveHandler.Tokenizer;
9+
import java.util.Arrays;
10+
import org.openjdk.jmh.annotations.Benchmark;
11+
import org.openjdk.jmh.annotations.BenchmarkMode;
12+
import org.openjdk.jmh.annotations.Fork;
13+
import org.openjdk.jmh.annotations.Level;
14+
import org.openjdk.jmh.annotations.Measurement;
15+
import org.openjdk.jmh.annotations.Mode;
16+
import org.openjdk.jmh.annotations.OutputTimeUnit;
17+
import org.openjdk.jmh.annotations.Param;
18+
import org.openjdk.jmh.annotations.Scope;
19+
import org.openjdk.jmh.annotations.Setup;
20+
import org.openjdk.jmh.annotations.State;
21+
import org.openjdk.jmh.annotations.Warmup;
22+
23+
/** Tracks the cost of the IAST evidence-redaction "sensitive analyzer" tokenizers. */
24+
@Warmup(iterations = 2, time = 250, timeUnit = MILLISECONDS)
25+
@Measurement(iterations = 3, time = 250, timeUnit = MILLISECONDS)
26+
@Fork(1)
27+
@OutputTimeUnit(MICROSECONDS)
28+
@BenchmarkMode(Mode.AverageTime)
29+
@State(Scope.Benchmark)
30+
public class SensitiveTokenizerBenchmark {
31+
32+
/** Each scenario pairs a malformed payload shape with the tokenizer that processes it. */
33+
public enum Scenario {
34+
/** LDAP filter opened, never closed, packed with operators — quadratic: {@code "(" + "="*(n-1)}. */
35+
LDAP_UNCLOSED_FILTER {
36+
@Override
37+
String payload(final int n) {
38+
return "(" + repeat('=', n - 1);
39+
}
40+
41+
@Override
42+
Tokenizer tokenizer(final String payload) {
43+
return new LdapRegexTokenizer(new Evidence(payload));
44+
}
45+
},
46+
/** Repeated open-group + operator — CUBIC, the worst found: {@code "(="*n}. */
47+
LDAP_NESTED_OPEN_EQ {
48+
@Override
49+
String payload(final int n) {
50+
return repeatUnit("(=", n);
51+
}
52+
53+
@Override
54+
Tokenizer tokenizer(final String payload) {
55+
return new LdapRegexTokenizer(new Evidence(payload));
56+
}
57+
},
58+
/** ANSI SQL string literal opened but never closed — stack overflow: {@code "'" + "a"*(n-1)}. */
59+
SQL_ANSI_UNTERMINATED_STRING {
60+
@Override
61+
String payload(final int n) {
62+
return "'" + repeat('a', n - 1);
63+
}
64+
65+
@Override
66+
Tokenizer tokenizer(final String payload) {
67+
return sql(payload, null);
68+
}
69+
},
70+
/** Oracle {@code q'<delim> ...} escaped literal with no matching close — stack overflow. */
71+
SQL_ORACLE_ESCAPED_LITERAL {
72+
@Override
73+
String payload(final int n) {
74+
return "q'~" + repeat('a', n - 3);
75+
}
76+
77+
@Override
78+
Tokenizer tokenizer(final String payload) {
79+
return sql(payload, "oracle");
80+
}
81+
},
82+
/** MySQL double-quoted string literal opened but never closed — stack overflow. */
83+
SQL_MYSQL_UNTERMINATED_STRING {
84+
@Override
85+
String payload(final int n) {
86+
return "\"" + repeat('a', n - 1);
87+
}
88+
89+
@Override
90+
Tokenizer tokenizer(final String payload) {
91+
return sql(payload, "mysql");
92+
}
93+
},
94+
/** URL query separator + long key, no {@code =} value — linear baseline. */
95+
URL_QUERY {
96+
@Override
97+
String payload(final int n) {
98+
return "http://h/p?" + repeat('a', n - 11);
99+
}
100+
101+
@Override
102+
Tokenizer tokenizer(final String payload) {
103+
return new UrlRegexpTokenizer(new Evidence(payload));
104+
}
105+
},
106+
/** Run of {@code ?} (also matched by {@code [^=&;]}) — quadratic: {@code "?"*n}. */
107+
URL_QUESTION_RUN {
108+
@Override
109+
String payload(final int n) {
110+
return repeat('?', n);
111+
}
112+
113+
@Override
114+
Tokenizer tokenizer(final String payload) {
115+
return new UrlRegexpTokenizer(new Evidence(payload));
116+
}
117+
},
118+
/** URL authority started with {@code //}, no {@code @} terminator — linear baseline. */
119+
URL_AUTHORITY {
120+
@Override
121+
String payload(final int n) {
122+
return "//" + repeat('a', n - 2);
123+
}
124+
125+
@Override
126+
Tokenizer tokenizer(final String payload) {
127+
return new UrlRegexpTokenizer(new Evidence(payload));
128+
}
129+
},
130+
/** Single command + long argument — linear baseline. */
131+
COMMAND_SINGLE_TOKEN {
132+
@Override
133+
String payload(final int n) {
134+
return "cmd " + repeat('a', n - 4);
135+
}
136+
137+
@Override
138+
Tokenizer tokenizer(final String payload) {
139+
return new CommandRegexpTokenizer(new Evidence(payload));
140+
}
141+
},
142+
/**
143+
* Blank lines exploit MULTILINE {@code ^} + {@code \s*} backtracking — quadratic: {@code
144+
* "\n"*n}.
145+
*/
146+
COMMAND_BLANK_LINES {
147+
@Override
148+
String payload(final int n) {
149+
return repeat('\n', n);
150+
}
151+
152+
@Override
153+
Tokenizer tokenizer(final String payload) {
154+
return new CommandRegexpTokenizer(new Evidence(payload));
155+
}
156+
};
157+
158+
abstract String payload(int sizeBytes);
159+
160+
abstract Tokenizer tokenizer(String payload);
161+
162+
static Tokenizer sql(final String payload, final String dialect) {
163+
final Evidence evidence = new Evidence(payload);
164+
if (dialect != null) {
165+
evidence.getContext().put(DATABASE_PARAMETER, dialect);
166+
}
167+
return new SqlRegexpTokenizer(evidence);
168+
}
169+
170+
static String repeat(final char c, final int count) {
171+
final int n = Math.max(count, 0);
172+
final char[] chars = new char[n];
173+
Arrays.fill(chars, c);
174+
return new String(chars);
175+
}
176+
177+
static String repeatUnit(final String unit, final int totalLen) {
178+
final int n = Math.max(totalLen, 0);
179+
final StringBuilder sb = new StringBuilder(n);
180+
while (sb.length() < n) {
181+
sb.append(unit);
182+
}
183+
sb.setLength(n);
184+
return sb.toString();
185+
}
186+
}
187+
188+
@Param({
189+
"LDAP_UNCLOSED_FILTER",
190+
"LDAP_NESTED_OPEN_EQ",
191+
"SQL_ANSI_UNTERMINATED_STRING",
192+
"SQL_ORACLE_ESCAPED_LITERAL",
193+
"SQL_MYSQL_UNTERMINATED_STRING",
194+
"URL_QUERY",
195+
"URL_QUESTION_RUN",
196+
"URL_AUTHORITY",
197+
"COMMAND_SINGLE_TOKEN",
198+
"COMMAND_BLANK_LINES"
199+
})
200+
Scenario scenario;
201+
202+
@Param({"512", "1024", "2048"})
203+
int sizeBytes;
204+
205+
private String payload;
206+
207+
@Setup(Level.Trial)
208+
public void setup() {
209+
payload = scenario.payload(sizeBytes);
210+
}
211+
212+
/**
213+
* Builds the tokenizer and fully drains it, exactly as evidence redaction does. Returns the
214+
* number of tokens (consumed by JMH). A pathological pattern may overflow the stack; we catch it
215+
* so the run stays stable and report {@code -1}.
216+
*/
217+
@Benchmark
218+
public long tokenize() {
219+
try {
220+
final Tokenizer tokenizer = scenario.tokenizer(payload);
221+
long count = 0;
222+
while (tokenizer.next()) {
223+
tokenizer.current();
224+
count++;
225+
}
226+
return count;
227+
} catch (final Throwable pathological) {
228+
return -1;
229+
}
230+
}
231+
}

dd-java-agent/agent-iast/src/main/java/com/datadog/iast/model/json/EvidenceAdapter.java

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -446,23 +446,26 @@ public void write(final Context ctx, final JsonWriter writer) throws IOException
446446

447447
private List<ValuePart> split(final RedactionContext redaction) {
448448
final List<ValuePart> parts = new ArrayList<>();
449+
// Identical sensitive chunks redact to the same pattern (the first occurrence in the source),
450+
// so cache chunk -> offset to avoid an O(sourceLength) indexOf per repeated occurrence.
451+
final Map<String, Integer> matchingOffsets = new HashMap<>();
449452
if (redaction.isSensitive()) {
450453
// redact the full tainted value as the source is sensitive (password, certificate, ...)
451-
addValuePart(0, value.length(), redaction, true, parts);
454+
addValuePart(0, value.length(), redaction, matchingOffsets, true, parts);
452455
} else {
453456
// redact only sensitive parts
454457
int index = 0;
455458
for (final Ranged sensitive : this.sensitiveRanges) {
456459
final int start = sensitive.getStart();
457460
final int end = sensitive.getStart() + sensitive.getLength();
458461
// append previous tainted chunk (if any)
459-
addValuePart(index, start, redaction, false, parts);
462+
addValuePart(index, start, redaction, matchingOffsets, false, parts);
460463
// append current sensitive tainted chunk
461-
addValuePart(start, end, redaction, true, parts);
464+
addValuePart(start, end, redaction, matchingOffsets, true, parts);
462465
index = end;
463466
}
464467
// append last tainted chunk (if any)
465-
addValuePart(index, value.length(), redaction, false, parts);
468+
addValuePart(index, value.length(), redaction, matchingOffsets, false, parts);
466469
}
467470
return parts;
468471
}
@@ -471,6 +474,7 @@ private void addValuePart(
471474
final int start,
472475
final int end,
473476
final RedactionContext ctx,
477+
final Map<String, Integer> matchingOffsets,
474478
final boolean redact,
475479
final List<ValuePart> valueParts) {
476480
if (start < end) {
@@ -484,7 +488,9 @@ private void addValuePart(
484488
final int length = chunk.length();
485489
final String sourceValue = source.getValue();
486490
final String redactedValue = ctx.getRedactedValue();
487-
final int matching = (sourceValue == null) ? -1 : sourceValue.indexOf(chunk);
491+
final int matching =
492+
matchingOffsets.computeIfAbsent(
493+
chunk, c -> sourceValue == null ? -1 : sourceValue.indexOf(c));
488494
final String pattern;
489495
if (matching >= 0 && redactedValue != null) {
490496
// if matches append the matching part from the redacted value

dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/AbstractRegexTokenizer.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
package com.datadog.iast.sensitive;
22

33
import com.datadog.iast.util.Ranged;
4+
import com.google.re2j.Matcher;
5+
import com.google.re2j.Pattern;
46
import java.util.NoSuchElementException;
5-
import java.util.regex.Matcher;
6-
import java.util.regex.Pattern;
77
import javax.annotation.Nullable;
88

99
public abstract class AbstractRegexTokenizer implements SensitiveHandler.Tokenizer {

dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/CommandRegexpTokenizer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import com.datadog.iast.model.Evidence;
44
import com.datadog.iast.util.Ranged;
5-
import java.util.regex.Pattern;
5+
import com.google.re2j.Pattern;
66

77
public class CommandRegexpTokenizer extends AbstractRegexTokenizer {
88

dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/HeaderRegexpTokenizer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22

33
import com.datadog.iast.model.Evidence;
44
import com.datadog.iast.util.Ranged;
5+
import com.google.re2j.Pattern;
56
import java.util.NoSuchElementException;
6-
import java.util.regex.Pattern;
77
import javax.annotation.Nullable;
88

99
public class HeaderRegexpTokenizer implements SensitiveHandler.Tokenizer {

dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/LdapRegexTokenizer.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import com.datadog.iast.model.Evidence;
44
import com.datadog.iast.util.Ranged;
5-
import java.util.regex.Pattern;
5+
import com.google.re2j.Pattern;
66

77
/**
88
* @see <a href="https://docs.ldap.com/specs/rfc4515.txt">Lightweight Directory Access Protocol
@@ -14,7 +14,7 @@ public class LdapRegexTokenizer extends AbstractRegexTokenizer {
1414

1515
private static final Pattern LDAP_PATTERN =
1616
Pattern.compile(
17-
String.format("\\(.*?(?:~=|=|<=|>=)(?<%s>[^)]+)\\)", LITERAL_GROUP), Pattern.MULTILINE);
17+
String.format("\\(.*?(?:~=|=|<=|>=)(?P<%s>[^)]+)\\)", LITERAL_GROUP), Pattern.MULTILINE);
1818

1919
public LdapRegexTokenizer(final Evidence evidence) {
2020
super(LDAP_PATTERN, evidence.getValue());

0 commit comments

Comments
 (0)