@@ -743,8 +743,12 @@ impl std::error::Error for TokenizerError {}
743743
/// Cursor over the text being tokenized.
///
/// Pairs a peekable character iterator with the string it was created from,
/// so helpers can return `&'a str` slices of the input instead of building a
/// new `String` one character at a time.
struct State<'a> {
    /// Character iterator over `source`; consumed as tokenizing advances.
    peekable: Peekable<Chars<'a>>,
    /// Reference to the original source string being tokenized.
    source: &'a str,
    /// Current line, copied into the `Location` built by `location()`.
    pub line: u64,
    /// Current column; advanced by `next()` as characters are consumed.
    pub col: u64,
    /// Byte offset into `source` of the next unconsumed character.
    ///
    /// `next()` adds the UTF-8 length of every character it returns, so this
    /// must start in step with `peekable` (0 for a fresh iterator) for slices
    /// of `source` taken at this offset to be valid.
    pub byte_pos: usize,
}
749753
750754impl State < ' _ > {
@@ -759,6 +763,8 @@ impl State<'_> {
759763 } else {
760764 self . col += 1 ;
761765 }
766+ // Update byte position (characters can be multi-byte in UTF-8)
767+ self . byte_pos += s. len_utf8 ( ) ;
762768 Some ( s)
763769 }
764770 }
@@ -769,6 +775,12 @@ impl State<'_> {
769775 self . peekable . peek ( )
770776 }
771777
778+ /// return the character after the next character (lookahead by 2) without advancing the stream
779+ pub fn peek_next ( & self ) -> Option < char > {
780+ // Use the source and byte_pos instead of cloning the peekable iterator
781+ self . source [ self . byte_pos ..] . chars ( ) . nth ( 1 )
782+ }
783+
772784 pub fn location ( & self ) -> Location {
773785 Location {
774786 line : self . line ,
@@ -893,8 +905,10 @@ impl<'a> Tokenizer<'a> {
893905 ) -> Result < ( ) , TokenizerError > {
894906 let mut state = State {
895907 peekable : self . query . chars ( ) . peekable ( ) ,
908+ source : self . query ,
896909 line : 1 ,
897910 col : 1 ,
911+ byte_pos : 0 ,
898912 } ;
899913
900914 let mut location = state. location ( ) ;
@@ -912,18 +926,21 @@ impl<'a> Tokenizer<'a> {
912926 fn tokenize_identifier_or_keyword (
913927 & self ,
914928 ch : impl IntoIterator < Item = char > ,
915- chars : & mut State ,
929+ chars : & mut State < ' a > ,
916930 ) -> Result < Option < Token > , TokenizerError > {
917931 chars. next ( ) ; // consume the first char
918- let ch: String = ch. into_iter ( ) . collect ( ) ;
919- let word = self . tokenize_word ( ch, chars) ;
932+ // Calculate total byte length without allocating a String
933+ let consumed_byte_len: usize = ch. into_iter ( ) . map ( |c| c. len_utf8 ( ) ) . sum ( ) ;
934+ let word = self . tokenize_word ( consumed_byte_len, chars) ;
920935
921936 // TODO: implement parsing of exponent here
922937 if word. chars ( ) . all ( |x| x. is_ascii_digit ( ) || x == '.' ) {
923938 let mut inner_state = State {
924939 peekable : word. chars ( ) . peekable ( ) ,
940+ source : & word,
925941 line : 0 ,
926942 col : 0 ,
943+ byte_pos : 0 ,
927944 } ;
928945 let mut s = peeking_take_while ( & mut inner_state, |ch| matches ! ( ch, '0' ..='9' | '.' ) ) ;
929946 let s2 = peeking_take_while ( chars, |ch| matches ! ( ch, '0' ..='9' | '.' ) ) ;
@@ -937,7 +954,7 @@ impl<'a> Tokenizer<'a> {
937954 /// Get the next token or return None
938955 fn next_token (
939956 & self ,
940- chars : & mut State ,
957+ chars : & mut State < ' a > ,
941958 prev_token : Option < & Token > ,
942959 ) -> Result < Option < Token > , TokenizerError > {
943960 match chars. peek ( ) {
@@ -988,7 +1005,7 @@ impl<'a> Tokenizer<'a> {
9881005 }
9891006 _ => {
9901007 // regular identifier starting with an "b" or "B"
991- let s = self . tokenize_word ( b, chars) ;
1008+ let s = self . tokenize_word ( b. len_utf8 ( ) , chars) ;
9921009 Ok ( Some ( Token :: make_word ( & s, None ) ) )
9931010 }
9941011 }
@@ -1015,7 +1032,7 @@ impl<'a> Tokenizer<'a> {
10151032 ) ,
10161033 _ => {
10171034 // regular identifier starting with an "r" or "R"
1018- let s = self . tokenize_word ( b, chars) ;
1035+ let s = self . tokenize_word ( b. len_utf8 ( ) , chars) ;
10191036 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10201037 }
10211038 }
@@ -1034,7 +1051,7 @@ impl<'a> Tokenizer<'a> {
10341051 }
10351052 _ => {
10361053 // regular identifier starting with an "N"
1037- let s = self . tokenize_word ( n, chars) ;
1054+ let s = self . tokenize_word ( n. len_utf8 ( ) , chars) ;
10381055 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10391056 }
10401057 }
@@ -1051,7 +1068,7 @@ impl<'a> Tokenizer<'a> {
10511068 }
10521069 _ => {
10531070 // regular identifier starting with an "E" or "e"
1054- let s = self . tokenize_word ( x, chars) ;
1071+ let s = self . tokenize_word ( x. len_utf8 ( ) , chars) ;
10551072 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10561073 }
10571074 }
@@ -1070,7 +1087,7 @@ impl<'a> Tokenizer<'a> {
10701087 }
10711088 }
10721089 // regular identifier starting with an "U" or "u"
1073- let s = self . tokenize_word ( x, chars) ;
1090+ let s = self . tokenize_word ( x. len_utf8 ( ) , chars) ;
10741091 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10751092 }
10761093 // The spec only allows an uppercase 'X' to introduce a hex
@@ -1085,7 +1102,7 @@ impl<'a> Tokenizer<'a> {
10851102 }
10861103 _ => {
10871104 // regular identifier starting with an "X"
1088- let s = self . tokenize_word ( x, chars) ;
1105+ let s = self . tokenize_word ( x. len_utf8 ( ) , chars) ;
10891106 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10901107 }
10911108 }
@@ -1876,13 +1893,29 @@ impl<'a> Tokenizer<'a> {
18761893 comment
18771894 }
18781895
1879- /// Tokenize an identifier or keyword, after the first char is already consumed.
1880- fn tokenize_word ( & self , first_chars : impl Into < String > , chars : & mut State ) -> String {
1881- let mut s = first_chars. into ( ) ;
1882- s. push_str ( & peeking_take_while ( chars, |ch| {
1883- self . dialect . is_identifier_part ( ch)
1884- } ) ) ;
1885- s
1896+ /// Tokenize an identifier or keyword, after the first char(s) have already been consumed.
1897+ /// `consumed_byte_len` is the byte length of the consumed character(s).
1898+ fn tokenize_word ( & self , consumed_byte_len : usize , chars : & mut State < ' a > ) -> String {
1899+ // Calculate where the first character started
1900+ let first_char_byte_pos = chars. byte_pos - consumed_byte_len;
1901+
1902+ // Use the zero-copy version and convert to String
1903+ self . tokenize_word_borrowed ( first_char_byte_pos, chars) . to_string ( )
1904+ }
1905+
1906+ /// Tokenize an identifier or keyword, returning a borrowed slice when possible.
1907+ /// The first character position must be provided (before it was consumed).
1908+ /// Returns a slice with the same lifetime as the State's source.
1909+ fn tokenize_word_borrowed (
1910+ & self ,
1911+ first_char_byte_pos : usize ,
1912+ chars : & mut State < ' a > ,
1913+ ) -> & ' a str {
1914+ // Consume the rest of the word
1915+ borrow_slice_until ( chars, |ch| self . dialect . is_identifier_part ( ch) ) ;
1916+
1917+ // Return a slice from the first char to the current position
1918+ & chars. source [ first_char_byte_pos..chars. byte_pos ]
18861919 }
18871920
18881921 /// Read a quoted identifier
@@ -2176,35 +2209,82 @@ impl<'a> Tokenizer<'a> {
21762209/// Read from `chars` until `predicate` returns `false` or EOF is hit.
21772210/// Return the characters read as String, and keep the first non-matching
21782211/// char available as `chars.next()`.
2179- fn peeking_take_while ( chars : & mut State , mut predicate : impl FnMut ( char ) -> bool ) -> String {
2180- let mut s = String :: new ( ) ;
2212+ fn peeking_take_while ( chars : & mut State , predicate : impl FnMut ( char ) -> bool ) -> String {
2213+ borrow_slice_until ( chars, predicate) . to_string ( )
2214+ }
2215+
2216+ /// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
2217+ ///
2218+ /// # Arguments
2219+ /// * `chars` - The character iterator state (contains reference to original source)
2220+ /// * `predicate` - Function that returns true while we should continue taking characters
2221+ ///
2222+ /// # Returns
2223+ /// A borrowed slice of the source string containing the matched characters
2224+ fn borrow_slice_until < ' a > (
2225+ chars : & mut State < ' a > ,
2226+ mut predicate : impl FnMut ( char ) -> bool ,
2227+ ) -> & ' a str {
2228+ // Record the starting byte position
2229+ let start_pos = chars. byte_pos ;
2230+
2231+ // Consume characters while predicate is true
21812232 while let Some ( & ch) = chars. peek ( ) {
21822233 if predicate ( ch) {
2183- chars. next ( ) ; // consume
2184- s. push ( ch) ;
2234+ chars. next ( ) ; // consume (this updates byte_pos)
21852235 } else {
21862236 break ;
21872237 }
21882238 }
2189- s
2239+
2240+ // Get the ending byte position
2241+ let end_pos = chars. byte_pos ;
2242+
2243+ // Return the slice from the original source
2244+ & chars. source [ start_pos..end_pos]
21902245}
21912246
2192- /// Same as peeking_take_while, but also passes the next character to the predicate.
2193- fn peeking_next_take_while (
2194- chars : & mut State ,
2247+ /// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
2248+ /// This version also passes the next character to the predicate for lookahead.
2249+ /// This is a zero-copy version of `peeking_next_take_while`.
2250+ ///
2251+ /// # Arguments
2252+ /// * `chars` - The character iterator state (contains reference to original source)
2253+ /// * `predicate` - Function that returns true while we should continue taking characters.
2254+ /// Takes current char and optional next char for lookahead.
2255+ ///
2256+ /// # Returns
2257+ /// A borrowed slice of the source string containing the matched characters
2258+ fn borrow_slice_until_next < ' a > (
2259+ chars : & mut State < ' a > ,
21952260 mut predicate : impl FnMut ( char , Option < char > ) -> bool ,
2196- ) -> String {
2197- let mut s = String :: new ( ) ;
2261+ ) -> & ' a str {
2262+ // Record the starting byte position
2263+ let start_pos = chars. byte_pos ;
2264+
2265+ // Consume characters while predicate is true
21982266 while let Some ( & ch) = chars. peek ( ) {
2199- let next_char = chars. peekable . clone ( ) . nth ( 1 ) ;
2267+ let next_char = chars. peek_next ( ) ;
22002268 if predicate ( ch, next_char) {
2201- chars. next ( ) ; // consume
2202- s. push ( ch) ;
2269+ chars. next ( ) ; // consume (this updates byte_pos)
22032270 } else {
22042271 break ;
22052272 }
22062273 }
2207- s
2274+
2275+ // Get the ending byte position
2276+ let end_pos = chars. byte_pos ;
2277+
2278+ // Return the slice from the original source
2279+ & chars. source [ start_pos..end_pos]
2280+ }
2281+
2282+ /// Same as peeking_take_while, but also passes the next character to the predicate.
2283+ fn peeking_next_take_while (
2284+ chars : & mut State ,
2285+ predicate : impl FnMut ( char , Option < char > ) -> bool ,
2286+ ) -> String {
2287+ borrow_slice_until_next ( chars, predicate) . to_string ( )
22082288}
22092289
22102290fn unescape_single_quoted_string ( chars : & mut State < ' _ > ) -> Option < String > {
@@ -3496,8 +3576,10 @@ mod tests {
34963576 let s = format ! ( "'{s}'" ) ;
34973577 let mut state = State {
34983578 peekable : s. chars ( ) . peekable ( ) ,
3579+ source : & s,
34993580 line : 0 ,
35003581 col : 0 ,
3582+ byte_pos : 0 ,
35013583 } ;
35023584
35033585 assert_eq ! (
0 commit comments