Skip to content

Commit 9130ea9

Browse files
committed
Push the validateUTF8String into the StringScanner to reduce the work being done if UTF-8 validation is disabled.
1 parent 54ce667 commit 9130ea9

4 files changed

Lines changed: 68 additions & 15 deletions

File tree

java/src/json/ext/Parser.java

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

java/src/json/ext/StringDecoder.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,8 @@ ByteList decode(ThreadContext context, ByteList src, int start, int end) {
5252
}
5353
}
5454

55-
// Decodes ASCII-only strings with no UTF-8 validation. It is assumed this is only called when
56-
// the StringScanner#scan returned a result with the ASCII_BIT bit set.
55+
// Decodes strings with no UTF-8 validation. It is assumed the string is either ASCII-only
56+
// or UTF-8 validation has been disabled.
5757
ByteList decodeNoValidate(ThreadContext context, ByteList src, int start, int end, ByteBuffer chunks) {
5858
try {
5959
init(src, start, end);

java/src/json/ext/StringScanner.java

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ static StringScanner getInstance() {
6969
* {@link #ASCII_BIT} is set when the body contains no non-ASCII byte
7070
* (it may still hold escapes or control characters)
7171
*/
72-
long scan(byte[] data, ByteBuffer chunks, int start, int end) {
72+
long scan(byte[] data, ByteBuffer chunks, int start, int end, boolean validateUtf8) {
7373
int p = start;
7474
boolean plain = true;
7575
boolean ascii = true;
@@ -86,9 +86,20 @@ long scan(byte[] data, ByteBuffer chunks, int start, int end) {
8686
// clears ASCII_BIT
8787
// non-plain, !ASCII -> quote and backslash only; multi-byte
8888
// UTF-8 is skipped eight bytes at a time
89-
long m = plain ? stringScanMask(x)
90-
: ascii ? quoteBackslashHighMask(x)
91-
: quoteBackslashMask(x);
89+
//
90+
// When UTF-8 validation is disabled, non-ASCII bytes are copied
91+
// verbatim, so they stay on the plain fast path and the high-bit
92+
// term drops out of every mask: plain scans for control, quote
93+
// and backslash; non-plain scans for quote and backslash only.
94+
long m;
95+
if (validateUtf8) {
96+
m = plain ? stringScanMask(x)
97+
: ascii ? quoteBackslashHighMask(x)
98+
: quoteBackslashMask(x);
99+
} else {
100+
m = plain ? controlQuoteBackslashMask(x)
101+
: quoteBackslashMask(x);
102+
}
92103
if (m == 0) {
93104
p += 8;
94105
} else {
@@ -113,8 +124,10 @@ long scan(byte[] data, ByteBuffer chunks, int start, int end) {
113124
continue outer;
114125
}
115126
if (b >= 0x80) {
116-
plain = false;
117127
ascii = false;
128+
if (validateUtf8) {
129+
plain = false;
130+
}
118131
p++;
119132
continue outer;
120133
}
@@ -184,6 +197,21 @@ private static long backslashControlMask(long x) {
184197
return (control | bslash) & HIGH_BITS;
185198
}
186199

200+
/**
201+
* Like {@link #stringScanMask} but omits the non-ASCII (high-bit) term:
202+
* flags double quotes, backslashes and ASCII control characters (< 0x20)
203+
* only. Used as the starting mask when UTF-8 validation is disabled, where
204+
* non-ASCII bytes are copied verbatim and so stay on the plain fast path.
205+
*/
206+
private static long controlQuoteBackslashMask(long x) {
207+
long control = (x - SPACES) & ~x; // bytes < 0x20 (ASCII)
208+
long q = x ^ DOUBLE_QUOTES;
209+
long quote = (q - ONES) & ~q;
210+
long s = x ^ BACKSLASHES;
211+
long bslash = (s - ONES) & ~s;
212+
return (control | quote | bslash) & HIGH_BITS;
213+
}
214+
187215
/**
188216
* Like {@link #stringScanMask} but only flags double quotes and backslashes.
189217
* Used once a string is known to require the decoder <em>and</em> to already

java/src/json/ext/VectorizedStringScanner.java

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,22 +17,32 @@ final class VectorizedStringScanner extends StringScanner {
1717
private static final ByteVector DQUOTE = ByteVector.broadcast(SP, '"');
1818

1919
@Override
20-
long scan(byte[] data, ByteBuffer chunks, int start, int end) {
20+
long scan(byte[] data, ByteBuffer chunks, int start, int end, boolean validateUtf8) {
2121
final int width = SP.length();
2222
int p = start;
2323
boolean plain = true;
2424
boolean ascii = true;
2525

2626
// The same structure as the StringEncoder. The logic is
2727
// duplicated for maximum inlining.
28+
//
29+
// When UTF-8 validation is disabled, non-ASCII lanes are copied verbatim
30+
// and stay on the plain fast path, so the high-bit term drops out: plain
31+
// scans for control, quote and backslash; non-plain for quote and
32+
// backslash only.
2833
outer:
2934
while (true) {
3035
while (p + width <= end) {
3136
ByteVector chunk = ByteVector.fromArray(SP, data, p);
32-
VectorMask<Byte> interesting =
33-
plain ? interestingLanes(chunk)
34-
: ascii ? quoteBackslashOrHighLanes(chunk)
35-
: quoteOrBackslashLanes(chunk);
37+
VectorMask<Byte> interesting;
38+
if (validateUtf8) {
39+
interesting = plain ? interestingLanes(chunk)
40+
: ascii ? quoteBackslashOrHighLanes(chunk)
41+
: quoteOrBackslashLanes(chunk);
42+
} else {
43+
interesting = plain ? controlQuoteBackslashLanes(chunk)
44+
: quoteOrBackslashLanes(chunk);
45+
}
3646
if (interesting.anyTrue()) {
3747
p += interesting.firstTrue();
3848
break;
@@ -50,8 +60,10 @@ long scan(byte[] data, ByteBuffer chunks, int start, int end) {
5060
continue outer;
5161
}
5262
if (b >= 0x80) {
53-
plain = false;
5463
ascii = false;
64+
if (validateUtf8) {
65+
plain = false;
66+
}
5567
p++;
5668
continue outer;
5769
}
@@ -85,7 +97,9 @@ int scanEscape(byte[] data, ByteBuffer chunks, int start, int end) {
8597
}
8698

8799
private static VectorMask<Byte> escapeOrControlLanes(ByteVector chunk) {
88-
return chunk.lt(SPACE).or(chunk.eq(BACKSLASH));
100+
VectorMask<Byte> negative = chunk.lt(ZERO);
101+
VectorMask<Byte> control = chunk.lt(SPACE).andNot(negative);
102+
return control.or(chunk.eq(BACKSLASH));
89103
}
90104

91105
private static VectorMask<Byte> interestingLanes(ByteVector chunk) {
@@ -100,6 +114,17 @@ private static VectorMask<Byte> quoteOrBackslashLanes(ByteVector chunk) {
100114
return chunk.eq(DQUOTE).or(chunk.eq(BACKSLASH));
101115
}
102116

117+
// Like interestingLanes but without the non-ASCII (high-bit) term: flags
118+
// ASCII control characters, double quotes and backslashes only. Used as the
119+
// plain-path mask when UTF-8 validation is disabled.
120+
private static VectorMask<Byte> controlQuoteBackslashLanes(ByteVector chunk) {
121+
VectorMask<Byte> negative = chunk.lt(ZERO);
122+
VectorMask<Byte> lowOrQuote = chunk.lanewise(VectorOperators.XOR, TWO)
123+
.lt(THIRTY_THREE)
124+
.andNot(negative);
125+
return lowOrQuote.or(chunk.eq(BACKSLASH));
126+
}
127+
103128
private static VectorMask<Byte> quoteBackslashOrHighLanes(ByteVector chunk) {
104129
return chunk.eq(DQUOTE).or(chunk.eq(BACKSLASH)).or(chunk.lt(ZERO));
105130
}

0 commit comments

Comments
 (0)