Skip to content

Commit 076e47c

Browse files
author
Roman Borschel
committed
Fix tokenization of qualified identifiers with numeric prefix.
Queries containing qualified identifiers in which a part starts with digits fail to parse because they are tokenized incorrectly: "t.123abc" is tokenized as "t" (Word) followed by ".123abc" (Number), rather than as "t" (Word), "." (Period), "123abc" (Word).
1 parent 0d2976d commit 076e47c

File tree

2 files changed

+146
-12
lines changed

2 files changed

+146
-12
lines changed

src/tokenizer.rs

Lines changed: 68 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -895,7 +895,7 @@ impl<'a> Tokenizer<'a> {
895895
};
896896

897897
let mut location = state.location();
898-
while let Some(token) = self.next_token(&mut state)? {
898+
while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
899899
let span = location.span_to(state.location());
900900

901901
buf.push(TokenWithSpan { token, span });
@@ -932,7 +932,7 @@ impl<'a> Tokenizer<'a> {
932932
}
933933

934934
/// Get the next token or return None
935-
fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
935+
fn next_token(&self, chars: &mut State, prev_token: Option<&Token>) -> Result<Option<Token>, TokenizerError> {
936936
match chars.peek() {
937937
Some(&ch) => match ch {
938938
' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
@@ -1211,17 +1211,29 @@ impl<'a> Tokenizer<'a> {
12111211
chars.next();
12121212
}
12131213

1214+
// If the dialect supports identifiers that start with a numeric prefix
1215+
// and we have now consumed a dot, check if the previous token was a Word.
1216+
// If so, what follows is definitely not part of a decimal number and
1217+
// we should yield the dot as a dedicated token so compound identifiers
1218+
// starting with digits can be parsed correctly.
1219+
if s == "." && self.dialect.supports_numeric_prefix() {
1220+
if let Some(Token::Word(_)) = prev_token {
1221+
return Ok(Some(Token::Period));
1222+
}
1223+
}
1224+
1225+
// Consume fractional digits.
12141226
s += &peeking_next_take_while(chars, |ch, next_ch| {
12151227
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
12161228
});
12171229

1218-
// No number -> Token::Period
1230+
// No fraction -> Token::Period
12191231
if s == "." {
12201232
return Ok(Some(Token::Period));
12211233
}
12221234

1223-
let mut exponent_part = String::new();
12241235
// Parse exponent as number
1236+
let mut exponent_part = String::new();
12251237
if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
12261238
let mut char_clone = chars.peekable.clone();
12271239
exponent_part.push(char_clone.next().unwrap());
@@ -1250,14 +1262,23 @@ impl<'a> Tokenizer<'a> {
12501262
}
12511263
}
12521264

1253-
// mysql dialect supports identifiers that start with a numeric prefix,
1254-
// as long as they aren't an exponent number.
1255-
if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() {
1256-
let word =
1257-
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
1258-
1259-
if !word.is_empty() {
1260-
s += word.as_str();
1265+
// If the dialect supports identifiers that start with a numeric prefix,
1266+
// we need to check if the value is in fact an identifier and must thus
1267+
// be tokenized as a word.
1268+
if self.dialect.supports_numeric_prefix() {
1269+
if exponent_part.is_empty() {
1270+
// If it is not a number with an exponent, it may be
1271+
// an unqualified identifier starting with digits.
1272+
let word =
1273+
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
1274+
1275+
if !word.is_empty() {
1276+
s += word.as_str();
1277+
return Ok(Some(Token::make_word(s.as_str(), None)));
1278+
}
1279+
} else if prev_token.map_or(false, |t| t == &Token::Period) {
1280+
// If the previous token was a period, thus not belonging to a number,
1281+
// the value we have is part of an identifier.
12611282
return Ok(Some(Token::make_word(s.as_str(), None)));
12621283
}
12631284
}
@@ -3960,4 +3981,39 @@ mod tests {
39603981
],
39613982
);
39623983
}
3984+
3985+
#[test]
fn test_tokenize_identifiers_numeric_prefix() {
    // Each entry pairs an input string with the exact token stream expected
    // from every dialect that reports `supports_numeric_prefix()`.
    let cases = vec![
        // Digits followed by identifier characters form a single word.
        ("123abc", vec![Token::make_word("123abc", None)]),
        // On its own, a value with an exponent is a number.
        (
            "12e34",
            vec![Token::Number("12e34".to_string(), false)],
        ),
        // After a qualifier and a period, the same characters are a word.
        (
            "t.12e34",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("12e34", None),
            ],
        ),
        // Qualified identifier that starts with a digit and is not a number.
        (
            "t.1two3",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("1two3", None),
            ],
        ),
    ];

    for (sql, expected) in cases {
        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to(sql, expected);
    }
}
39634019
}

tests/sqlparser_mysql.rs

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1926,6 +1926,84 @@ fn parse_select_with_numeric_prefix_column_name() {
19261926
}
19271927
}
19281928

1929+
#[test]
fn parse_qualified_identifiers_with_numeric_prefix() {
    /// Checks that `sql` round-trips through the MySQL dialect, then parses it
    /// again and returns the last projection item of the resulting `SELECT`.
    ///
    /// The same string is used for the round-trip check and for the AST
    /// inspection, so the two checks always cover the same statement.
    ///
    /// Panics if `sql` does not parse to a plain `SELECT` query with at least
    /// one projection item.
    fn last_projection(sql: &str) -> SelectItem {
        mysql().verified_stmt(sql);
        match mysql().parse_sql_statements(sql).unwrap().pop() {
            Some(Statement::Query(q)) => match *q.body {
                SetExpr::Select(mut s) => match s.projection.pop() {
                    Some(item) => item,
                    None => panic!("Unexpected projection: {:?}", None::<SelectItem>),
                },
                body => panic!("Unexpected statement body: {:?}", body),
            },
            stmt => panic!("Unexpected statement: {:?}", stmt),
        }
    }

    // Case 1: Qualified column name that starts with digits.
    match &last_projection("SELECT t.15to29 FROM my_table AS t") {
        SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts)) => {
            assert_eq!(&[Ident::new("t"), Ident::new("15to29")], &parts[..]);
        }
        proj => panic!("Unexpected projection: {:?}", proj),
    }

    // Case 2: Qualified column name that starts with digits and on its own represents a number.
    match &last_projection("SELECT t.15e29 FROM my_table AS t") {
        SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts)) => {
            assert_eq!(&[Ident::new("t"), Ident::new("15e29")], &parts[..]);
        }
        proj => panic!("Unexpected projection: {:?}", proj),
    }

    // Case 3: Unqualified, the same token is parsed as a number.
    match &last_projection("SELECT 15e29 FROM my_table") {
        SelectItem::UnnamedExpr(Expr::Value(ValueWithSpan {
            value: Value::Number(n, _),
            ..
        })) => {
            assert_eq!("15e29", n);
        }
        proj => panic!("Unexpected projection: {:?}", proj),
    }

    // Case 4: Quoted simple identifier.
    match &last_projection("SELECT `15e29` FROM my_table") {
        SelectItem::UnnamedExpr(Expr::Identifier(name)) => {
            assert_eq!(&Ident::with_quote('`', "15e29"), name);
        }
        proj => panic!("Unexpected projection: {:?}", proj),
    }

    // Case 5: Quoted compound identifier. The statement that is round-tripped
    // and the statement whose AST is inspected are now the same string
    // (previously the round-trip check omitted the `AS t` alias).
    match &last_projection("SELECT t.`15e29` FROM my_table AS t") {
        SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts)) => {
            assert_eq!(
                &[Ident::new("t"), Ident::with_quote('`', "15e29")],
                &parts[..]
            );
        }
        proj => panic!("Unexpected projection: {:?}", proj),
    }
}
2006+
19292007
// Don't run with bigdecimal as it fails like this on rust beta:
19302008
//
19312009
// 'parse_select_with_concatenation_of_exp_number_and_numeric_prefix_column'

0 commit comments

Comments
 (0)