@@ -17,22 +17,32 @@ final class VectorizedStringScanner extends StringScanner {
1717 private static final ByteVector DQUOTE = ByteVector .broadcast (SP , '"' );
1818
1919 @ Override
20- long scan (byte [] data , ByteBuffer chunks , int start , int end ) {
20+ long scan (byte [] data , ByteBuffer chunks , int start , int end , boolean validateUtf8 ) {
2121 final int width = SP .length ();
2222 int p = start ;
2323 boolean plain = true ;
2424 boolean ascii = true ;
2525
2626 // The same structure as the StringEncoder. The logic is
2727 // duplicated for maximum inlining.
28+ //
29+ // When UTF-8 validation is disabled, non-ASCII lanes are copied verbatim
30+ // and stay on the plain fast path, so the high-bit term drops out: plain
31+ // scans for control, quote and backslash; non-plain for quote and
32+ // backslash only.
2833 outer :
2934 while (true ) {
3035 while (p + width <= end ) {
3136 ByteVector chunk = ByteVector .fromArray (SP , data , p );
32- VectorMask <Byte > interesting =
33- plain ? interestingLanes (chunk )
34- : ascii ? quoteBackslashOrHighLanes (chunk )
35- : quoteOrBackslashLanes (chunk );
37+ VectorMask <Byte > interesting ;
38+ if (validateUtf8 ) {
39+ interesting = plain ? interestingLanes (chunk )
40+ : ascii ? quoteBackslashOrHighLanes (chunk )
41+ : quoteOrBackslashLanes (chunk );
42+ } else {
43+ interesting = plain ? controlQuoteBackslashLanes (chunk )
44+ : quoteOrBackslashLanes (chunk );
45+ }
3646 if (interesting .anyTrue ()) {
3747 p += interesting .firstTrue ();
3848 break ;
@@ -50,8 +60,10 @@ long scan(byte[] data, ByteBuffer chunks, int start, int end) {
5060 continue outer ;
5161 }
5262 if (b >= 0x80 ) {
53- plain = false ;
5463 ascii = false ;
64+ if (validateUtf8 ) {
65+ plain = false ;
66+ }
5567 p ++;
5668 continue outer ;
5769 }
@@ -85,7 +97,9 @@ int scanEscape(byte[] data, ByteBuffer chunks, int start, int end) {
8597 }
8698
// Builds a lane mask selecting backslash lanes plus lanes that compare below
// SPACE. This hunk changes how the "below SPACE" term is computed:
//   - removed line (88-): chunk.lt(SPACE) on its own. ByteVector.lt is a
//     signed comparison, so lanes holding bytes with the high bit set
//     (negative as Java bytes) also satisfied it.
//   - added lines (100+..102+): the lanes that are lt(ZERO) are computed
//     first and subtracted with andNot, so only non-negative lanes below
//     SPACE remain before the backslash lanes are OR-ed in.
// NOTE(review): SPACE, ZERO and BACKSLASH are declared outside this view;
// presumably they broadcast ' ', 0 and '\\' -- confirm against the constants.
8799 private static VectorMask <Byte > escapeOrControlLanes (ByteVector chunk ) {
88- return chunk .lt (SPACE ).or (chunk .eq (BACKSLASH ));
100+ VectorMask <Byte > negative = chunk .lt (ZERO );
101+ VectorMask <Byte > control = chunk .lt (SPACE ).andNot (negative );
102+ return control .or (chunk .eq (BACKSLASH ));
89103 }
90104
91105 private static VectorMask <Byte > interestingLanes (ByteVector chunk ) {
@@ -100,6 +114,17 @@ private static VectorMask<Byte> quoteOrBackslashLanes(ByteVector chunk) {
100114 return chunk .eq (DQUOTE ).or (chunk .eq (BACKSLASH ));
101115 }
102116
117+ // Like interestingLanes but without the non-ASCII (high-bit) term: flags
118+ // ASCII control characters, double quotes and backslashes only. Used as the
119+ // plain-path mask when UTF-8 validation is disabled.
//
// How the mask is built (assuming TWO, THIRTY_THREE and ZERO broadcast 2, 33
// and 0 -- they are declared outside this view, TODO confirm):
//   - XOR with 2 flips only bit 1, so 0x00..0x1F stays inside 0x00..0x1F and
//     '"' (0x22) becomes 0x20, while ' ' (0x20), '!' (0x21) and '#' (0x23)
//     become 0x22, 0x23 and 0x21. A single "< 33" compare therefore selects
//     the control range and the double quote together.
//   - lt is a signed comparison, so negative lanes (bytes with the high bit
//     set) also pass "< 33"; andNot(negative) removes them again.
//   - backslash lanes are OR-ed in with a separate equality test.
120+ private static VectorMask <Byte > controlQuoteBackslashLanes (ByteVector chunk ) {
121+ VectorMask <Byte > negative = chunk .lt (ZERO );
122+ VectorMask <Byte > lowOrQuote = chunk .lanewise (VectorOperators .XOR , TWO )
123+ .lt (THIRTY_THREE )
124+ .andNot (negative );
125+ return lowOrQuote .or (chunk .eq (BACKSLASH ));
126+ }
127+
// Builds a lane mask selecting lanes equal to DQUOTE ('"'), lanes equal to
// BACKSLASH, and lanes that compare below ZERO. scan uses it when UTF-8
// validation is enabled and the string is no longer plain but still ascii.
// NOTE(review): ZERO is declared outside this view; presumably it broadcasts
// 0, so the signed lt(ZERO) term picks bytes with the high bit set -- confirm.
103128 private static VectorMask <Byte > quoteBackslashOrHighLanes (ByteVector chunk ) {
104129 return chunk .eq (DQUOTE ).or (chunk .eq (BACKSLASH )).or (chunk .lt (ZERO ));
105130 }
0 commit comments