Skip to content

Commit e729013

Browse files
committed
Adding to StringUtils
- fast replaceAll for a fixed string & replacement: 3x the throughput and 1/2x the allocation of regex-based solutions - added SubSequence, which provides a view into a subsequence of a String without incurring extra allocation - Strings.split returns an Iterable<SubSequence> that can be used to do lightweight processing of a String
1 parent 2fa3c0c commit e729013

7 files changed

Lines changed: 724 additions & 0 deletions

File tree

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
package datadog.trace.util;
2+
3+
import java.util.regex.Pattern;
4+
import org.openjdk.jmh.annotations.Benchmark;
5+
import org.openjdk.jmh.annotations.Fork;
6+
import org.openjdk.jmh.annotations.Measurement;
7+
import org.openjdk.jmh.annotations.Threads;
8+
import org.openjdk.jmh.annotations.Warmup;
9+
10+
/**
11+
* For simple replacements, Strings.replaceAll out performs String.replaceAll and
12+
* regex.Matcher.replaceAll by 3x. Strings.replaceAll also requires less allocation.
13+
*
14+
* <p>When pattern matching is needed, compiling the regex to Pattern slightly improves overhead,
15+
* but dramatically reduces memory allocation to 1/4x of String.replaceAll. <code>
16+
* MacBook M1 with 8 threads (Java 21)
17+
*
18+
* Benchmark Mode Cnt Score Error Units
19+
* StringReplacementBenchmark.regex_replaceAll thrpt 6 13795837.811 ± 3635087.691 ops/s
20+
* StringReplacementBenchmark.regex_replaceAll:gc.alloc.rate thrpt 6 3988.955 ± 1148.316 MB/sec
21+
*
22+
* StringReplacementBenchmark.string_replaceAll thrpt 6 14611046.391 ± 4865682.875 ops/s
23+
* StringReplacementBenchmark.string_replaceAll:gc.alloc.rate thrpt 6 11391.346 ± 3790.917 MB/sec
24+
*
25+
* StringReplacementBenchmark.strings_replaceAll thrpt 6 39514695.575 ± 7169844.210 ops/s
26+
* StringReplacementBenchmark.strings_replaceAll:gc.alloc.rate thrpt 6 2777.083 ± 506.909 MB/sec
27+
* </code>
28+
*/
29+
@Fork(2)
30+
@Warmup(iterations = 2)
31+
@Measurement(iterations = 3)
32+
@Threads(8)
33+
public class StringReplacementBenchmark {
34+
static final String[] INPUTS = {
35+
"foo",
36+
"baz",
37+
"foobar",
38+
"foobaz",
39+
"foo=baz",
40+
"bar=foo",
41+
"foo=foo&bar=foo",
42+
"lorem ipsum",
43+
"datadog"
44+
};
45+
46+
static int sharedInputIndex = 0;
47+
48+
static String nextInput() {
49+
int localIndex = ++sharedInputIndex;
50+
if (localIndex >= INPUTS.length) {
51+
sharedInputIndex = localIndex = 0;
52+
}
53+
return INPUTS[localIndex];
54+
}
55+
56+
@Benchmark
57+
public String string_replaceAll() {
58+
return _string_replaceAll(nextInput());
59+
}
60+
61+
static String _string_replaceAll(String input) {
62+
// Underneath, this does Pattern.compile("foo").matcher(str).replaceAll()
63+
return input.replaceAll("foo", "*redacted*");
64+
}
65+
66+
static final Pattern REGEX_COMPILED = Pattern.compile("foo");
67+
68+
@Benchmark
69+
public String regex_replaceAll() {
70+
return _regex_replaceAll(nextInput());
71+
}
72+
73+
static String _regex_replaceAll(String input) {
74+
return REGEX_COMPILED.matcher(input).replaceAll("*redcated*");
75+
}
76+
77+
@Benchmark
78+
public String strings_replaceAll() {
79+
return _strings_replaceAll(nextInput());
80+
}
81+
82+
static String _strings_replaceAll(String input) {
83+
return Strings.replaceAll(input, "foo", "*redacted*");
84+
}
85+
}
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
package datadog.trace.util;
2+
3+
import java.util.regex.Pattern;
4+
import org.openjdk.jmh.annotations.Benchmark;
5+
import org.openjdk.jmh.annotations.Fork;
6+
import org.openjdk.jmh.annotations.Measurement;
7+
import org.openjdk.jmh.annotations.Param;
8+
import org.openjdk.jmh.annotations.Scope;
9+
import org.openjdk.jmh.annotations.State;
10+
import org.openjdk.jmh.annotations.Threads;
11+
import org.openjdk.jmh.annotations.Warmup;
12+
import org.openjdk.jmh.infra.Blackhole;
13+
14+
/**
15+
* Strings.split is generally faster for String processing, since it create SubSequences that are
16+
* view into the backing String rather than new String objects.
17+
*
18+
* <p>Benchmark (testStr) Mode Cnt Score Error Units StringSplitBenchmark.pattern_split EMPTY thrpt
19+
* 6 291274421.621 ± 14834420.899 ops/s StringSplitBenchmark.string_split EMPTY thrpt 6
20+
* 1035461179.368 ± 60212686.921 ops/s StringSplitBenchmark.strings_split EMPTY thrpt 6
21+
* 8161781738.019 ± 178530888.497 ops/s
22+
*
23+
* <p>StringSplitBenchmark.pattern_split TRIVIAL thrpt 6 83982270.075 ± 10250565.633 ops/s
24+
* StringSplitBenchmark.string_split TRIVIAL thrpt 6 848615850.339 ± 42453569.634 ops/s
25+
* StringSplitBenchmark.strings_split TRIVIAL thrpt 6 1765290890.948 ± 160053487.111 ops/s
26+
*
27+
* <p>StringSplitBenchmark.pattern_split SMALL thrpt 6 27383819.756 ± 5454020.100 ops/s
28+
* StringSplitBenchmark.string_split SMALL thrpt 6 149047480.037 ± 6124271.615 ops/s
29+
* StringSplitBenchmark.strings_split SMALL thrpt 6 564058097.162 ± 49305418.971 ops/s
30+
*
31+
* <p>StringSplitBenchmark.pattern_split MEDIUM thrpt 6 14879131.729 ± 1981850.920 ops/s
32+
* StringSplitBenchmark.string_split MEDIUM thrpt 6 51237769.598 ± 1808521.138 ops/s
33+
* StringSplitBenchmark.strings_split MEDIUM thrpt 6 176976970.705 ± 6813886.658 ops/s
34+
*
35+
* <p>StringSplitBenchmark.pattern_split LARGE thrpt 6 482340.838 ± 24903.187 ops/s
36+
* StringSplitBenchmark.string_split LARGE thrpt 6 2460212.879 ± 86911.652 ops/s
37+
* StringSplitBenchmark.strings_split LARGE thrpt 6 4023658.103 ± 30305.699 ops/s
38+
*/
39+
@Fork(2)
40+
@Warmup(iterations = 2)
41+
@Measurement(iterations = 3)
42+
@Threads(8)
43+
@State(Scope.Benchmark)
44+
public class StringSplitBenchmark {
45+
public enum TestString {
46+
EMPTY(""),
47+
TRIVIAL("app_key=1111"),
48+
SMALL("app_key=1111&foo=bar&baz=quux"),
49+
MEDIUM(repeat("app_key=1111", '&', 100)),
50+
LARGE(repeat("app_key=1111&application_key=2222&token=0894-4832", '&', 4096));
51+
52+
final String str;
53+
54+
TestString(String str) {
55+
this.str = str;
56+
}
57+
};
58+
59+
@Param TestString testStr;
60+
61+
static final String repeat(String repeat, char separator, int length) {
62+
StringBuilder builder = new StringBuilder(length);
63+
builder.append(repeat);
64+
while (builder.length() + repeat.length() + 1 < length) {
65+
builder.append(separator).append(repeat);
66+
}
67+
return builder.toString();
68+
}
69+
70+
@Benchmark
71+
public void string_split(Blackhole bh) {
72+
for (String substr : this.testStr.str.split("\\&")) {
73+
bh.consume(substr);
74+
}
75+
}
76+
77+
static final Pattern PATTERN = Pattern.compile("\\&");
78+
79+
@Benchmark
80+
public void pattern_split(Blackhole bh) {
81+
for (String str : PATTERN.split(this.testStr.str)) {
82+
bh.consume(str);
83+
}
84+
}
85+
86+
@Benchmark
87+
public void strings_split(Blackhole bh) {
88+
for (SubSequence subSeq : Strings.split(this.testStr.str, '&')) {
89+
bh.consume(subSeq);
90+
}
91+
}
92+
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
package datadog.trace.util;
2+
3+
import org.openjdk.jmh.annotations.Benchmark;
4+
import org.openjdk.jmh.annotations.Fork;
5+
import org.openjdk.jmh.annotations.Measurement;
6+
import org.openjdk.jmh.annotations.Threads;
7+
import org.openjdk.jmh.annotations.Warmup;
8+
import org.openjdk.jmh.infra.Blackhole;
9+
10+
/**
11+
* Strings.substring has 5x throughput. This is primarily achieved through less allocation. NOTE:
12+
* The higher allocation rate is misleading because 5x the work was performed. After accounting for
13+
* the 5x, the actual allocation rate is 0.25x that of String.substring or String.subSequence.
14+
*
15+
* <p>Benchmark Mode Cnt Score Error Units StringSubSequenceBenchmark.string_subSequence thrpt 6
16+
* 140369998.493 ± 4387855.861 ops/s StringSubSequenceBenchmark.string_subSequence:gc.alloc.rate
17+
* thrpt 6 88880.463 ± 2778.032 MB/sec
18+
*
19+
* <p>StringSubSequenceBenchmark.string_substring thrpt 6 136916708.207 ± 12299226.575 ops/s
20+
* StringSubSequenceBenchmark.string_substring:gc.alloc.rate thrpt 6 86689.852 ± 7777.642 MB/sec
21+
*
22+
* <p>StringSubSequenceBenchmark.strings_substring thrpt 6 679669385.260 ± 7194043.619 ops/s
23+
* StringSubSequenceBenchmark.strings_substring:gc.alloc.rate thrpt 6 103702.745 ± 1095.741 MB/sec
24+
*/
25+
@Fork(2)
26+
@Warmup(iterations = 2)
27+
@Measurement(iterations = 3)
28+
@Threads(8)
29+
public class StringSubSequenceBenchmark {
30+
static final String LOREM_IPSUM =
31+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.";
32+
33+
@Benchmark
34+
public void string_substring(Blackhole bh) {
35+
String str = LOREM_IPSUM;
36+
int len = str.length();
37+
38+
for (int i = 0; i < str.length(); i += 100) {
39+
bh.consume(str.substring(i, Math.min(i + 100, len)));
40+
}
41+
}
42+
43+
@Benchmark
44+
public void string_subSequence(Blackhole bh) {
45+
String str = LOREM_IPSUM;
46+
int len = str.length();
47+
48+
for (int i = 0; i < str.length(); i += 100) {
49+
bh.consume(str.subSequence(i, Math.min(i + 100, len)));
50+
}
51+
}
52+
53+
@Benchmark
54+
public void strings_substring(Blackhole bh) {
55+
String str = LOREM_IPSUM;
56+
int len = str.length();
57+
58+
for (int i = 0; i < str.length(); i += 100) {
59+
bh.consume(SubSequence.of(str, i, Math.min(i + 100, len)));
60+
}
61+
}
62+
}

internal-api/src/main/java/datadog/trace/util/Strings.java

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
import java.nio.charset.StandardCharsets;
66
import java.security.MessageDigest;
77
import java.security.NoSuchAlgorithmException;
8+
import java.util.Collections;
9+
import java.util.Iterator;
10+
import java.util.NoSuchElementException;
811
import java.util.concurrent.ThreadLocalRandom;
912
import javax.annotation.Nullable;
1013

@@ -180,4 +183,143 @@ public static String coalesce(@Nullable final String first, @Nullable final Stri
180183
return null;
181184
}
182185
}
186+
187+
/** Low overhead replaceAll */
188+
public static String replaceAll(String input, String needle, String replacement) {
189+
int index = input.indexOf(needle);
190+
if (index == -1) return input;
191+
192+
int needleLen = needle.length();
193+
194+
StringBuilder builder = new StringBuilder(input.length() + 10);
195+
builder.append(input, 0, index);
196+
builder.append(replacement);
197+
198+
int prevIndex = index;
199+
index = input.indexOf(needle, index + needleLen);
200+
for (; index != -1; prevIndex = index, index = input.indexOf(needle, index + needleLen)) {
201+
builder.append(input, prevIndex + needleLen, index);
202+
builder.append(replacement);
203+
}
204+
builder.append(input, prevIndex + needleLen, input.length());
205+
206+
return builder.toString();
207+
}
208+
209+
/**
210+
* Provides a SubSequence which a view into the provided String Unlike String.subSequence (which
211+
* is usually just a wrapper around String.substring), this routine doesn't allocate a new String
212+
* or byte[]/char[].
213+
*/
214+
public static final SubSequence subSequence(String str, int beginIndex) {
215+
return new SubSequence(str, beginIndex, str.length());
216+
}
217+
218+
/**
219+
* Provides a SubSequence which a view into the provided String Unlike String.subSequence (which
220+
* is usually just a wrapper around String.substring), this routine doesn't allocate a new <code>
221+
* String</code> or <code>byte[]</code> / <code>char[]</code>.
222+
*/
223+
public static final SubSequence subSequence(String str, int beginIndex, int endIndex) {
224+
return new SubSequence(str, beginIndex, endIndex);
225+
}
226+
227+
/**
228+
* Provides an Iterable<SubSequence> where the sub-sequences are separated by <code>splitChar
229+
* </code>. Unlike other approaches to splitting, this routine doesn't allocate any new <code>
230+
* String</code> or <code>byte[]</code> / <code>char[]</code>
231+
*/
232+
public static final Iterable<SubSequence> split(String str, char splitChar) {
233+
if (str.isEmpty()) {
234+
return Collections.emptyList();
235+
}
236+
237+
int firstIndex = str.indexOf(splitChar);
238+
if (firstIndex == -1) {
239+
return Collections.singletonList(subSequence(str, 0));
240+
}
241+
242+
return new SplitIterable(str, splitChar, firstIndex);
243+
}
244+
245+
static final class SplitIterable implements Iterable<SubSequence> {
246+
private final String str;
247+
private final int len;
248+
private final char splitChar;
249+
private final int firstIndex;
250+
251+
SplitIterable(String str, char splitChar, int firstIndex) {
252+
this.str = str;
253+
this.len = str.length();
254+
this.splitChar = splitChar;
255+
this.firstIndex = firstIndex;
256+
}
257+
258+
@Override
259+
public SplitIterator iterator() {
260+
return new SplitIterator(this.str, this.len, this.splitChar, this.firstIndex);
261+
}
262+
}
263+
264+
static final class SplitIterator implements Iterator<SubSequence> {
265+
private final String str;
266+
private final int len;
267+
private final char splitChar;
268+
269+
private int curIndex;
270+
private int nextIndex;
271+
272+
SplitIterator(String str, int len, char splitChar, int firstIndex) {
273+
this.str = str;
274+
this.len = len;
275+
this.splitChar = splitChar;
276+
277+
this.curIndex = 0;
278+
this.nextIndex = firstIndex == -1 ? len : firstIndex;
279+
}
280+
281+
@Override
282+
public boolean hasNext() {
283+
return (this.curIndex <= this.len);
284+
}
285+
286+
@Override
287+
public SubSequence next() {
288+
int curIndex = this.curIndex;
289+
int len = this.len;
290+
291+
if (curIndex > len) throw new NoSuchElementException();
292+
293+
SubSequence subSeq;
294+
295+
int nextIndex = this.nextIndex;
296+
if (nextIndex == len - 1) {
297+
// Handles the case where there's a trailing separator,
298+
// curIndex is moved to len to represent the empty string
299+
// after the trailing separator
300+
301+
// Next call then goes into the special case below
302+
subSeq = new SubSequence(this.str, curIndex, nextIndex);
303+
this.curIndex = len;
304+
this.nextIndex = len;
305+
} else if (curIndex == len) {
306+
// Handles the empty string after the trailing separator
307+
// curIndex is given the terminating value `len + 1`
308+
309+
// Don't use SubSequence.EMPTY because it wouldn't have
310+
// the correct beginIndex
311+
subSeq = new SubSequence(this.str, len, len);
312+
this.curIndex = len + 1;
313+
} else {
314+
subSeq = new SubSequence(this.str, curIndex, nextIndex);
315+
316+
// core advancing logic
317+
this.curIndex = nextIndex + 1;
318+
int searchIndex = this.str.indexOf(this.splitChar, nextIndex + 1);
319+
this.nextIndex = (searchIndex == -1) ? len : searchIndex;
320+
}
321+
322+
return subSeq;
323+
}
324+
}
183325
}

0 commit comments

Comments
 (0)