Skip to content

Commit ba88d35

Browse files
committed
Use a &str for the input instead of a Vec<char> in the tokenizer
This avoids allocating the Vec and expanding the &str into it. It also makes it possible for the Token to use a &str instead of a String. This improves pattern parsing performance by about 3%.
1 parent f58e042 commit ba88d35

2 files changed

Lines changed: 9 additions & 14 deletions

File tree

src/constructor_parser.rs

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -231,12 +231,7 @@ impl<'a> ConstructorStringParser<'a> {
231231
let token = &self.token_list[self.token_index];
232232
let component_start_index = self.get_safe_token(self.component_start).index;
233233

234-
self
235-
.input
236-
.chars()
237-
.skip(component_start_index)
238-
.take(token.index - component_start_index)
239-
.collect()
234+
self.input[component_start_index..token.index].to_owned()
240235
}
241236

242237
// Ref: https://wicg.github.io/urlpattern/#rewind-and-set-state

src/tokenizer.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,21 +41,22 @@ pub enum TokenizePolicy {
4141
}
4242

4343
// Ref: https://wicg.github.io/urlpattern/#tokenizer
44-
struct Tokenizer {
45-
input: Vec<char>,
44+
struct Tokenizer<'a> {
45+
input: &'a str,
4646
policy: TokenizePolicy,
4747
token_list: Vec<Token>,
4848
index: usize,
4949
next_index: usize,
5050
code_point: Option<char>, // TODO: get rid of Option
5151
}
5252

53-
impl Tokenizer {
53+
impl<'a> Tokenizer<'a> {
5454
// Ref: https://wicg.github.io/urlpattern/#get-the-next-code-point
5555
#[inline]
5656
fn get_next_codepoint(&mut self) {
57-
self.code_point = Some(self.input[self.next_index]);
58-
self.next_index += 1;
57+
let next_char = self.input[self.next_index..].chars().next().unwrap();
58+
self.code_point = Some(next_char);
59+
self.next_index += next_char.len_utf8();
5960
}
6061

6162
// Ref: https://wicg.github.io/urlpattern/#add-a-token-with-default-position-and-length
@@ -85,7 +86,7 @@ impl Tokenizer {
8586
value_len: usize,
8687
) {
8788
let range = value_pos..(value_pos + value_len);
88-
let value = self.input[range].iter().collect::<String>();
89+
let value = self.input[range].to_owned();
8990
self.token_list.push(Token {
9091
kind,
9192
index: self.index,
@@ -127,7 +128,7 @@ pub fn tokenize(
127128
policy: TokenizePolicy,
128129
) -> Result<Vec<Token>, Error> {
129130
let mut tokenizer = Tokenizer {
130-
input: input.chars().collect::<Vec<char>>(),
131+
input,
131132
policy,
132133
token_list: vec![],
133134
index: 0,
@@ -318,7 +319,6 @@ pub fn tokenize(
318319

319320
tokenizer.add_token_with_default_pos_and_len(TokenType::Char);
320321
}
321-
322322
tokenizer.add_token_with_default_len(
323323
TokenType::End,
324324
tokenizer.index,

0 commit comments

Comments
 (0)