Skip to content

Commit 93ea5d2

Browse files
Extended CSV STDIN tests and resolved more corner cases in tokenizer
1 parent b862dc7 commit 93ea5d2

File tree

8 files changed

+834
-775
lines changed

8 files changed

+834
-775
lines changed

src/ast/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4649,7 +4649,7 @@ impl fmt::Display for Statement {
46494649
let data = String::from_utf8(writer.into_inner().map_err(|_| fmt::Error)?)
46504650
.map_err(|_| fmt::Error)?;
46514651
write!(f, "{}", data)?;
4652-
write!(f, "\n\\.")?;
4652+
write!(f, "\\.")?;
46534653
}
46544654
Ok(())
46554655
}

src/dialect/bigquery.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,11 @@ impl Dialect for BigQueryDialect {
8383
}
8484

8585
fn is_identifier_part(&self, ch: char) -> bool {
86-
ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_' || ch == '-'
86+
ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_'
87+
}
88+
89+
fn supports_hyphenated_identifiers(&self) -> bool {
90+
true
8791
}
8892

8993
/// See [doc](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)

src/dialect/mod.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,11 @@ pub trait Dialect: Debug + Any {
178178
/// Determine if a character is a valid unquoted identifier character
179179
fn is_identifier_part(&self, ch: char) -> bool;
180180

181+
/// Returns whether the dialect supports hyphenated identifiers
182+
fn supports_hyphenated_identifiers(&self) -> bool {
183+
false
184+
}
185+
181186
/// Most dialects do not have custom operators. Override this method to provide custom operators.
182187
fn is_custom_operator_part(&self, _ch: char) -> bool {
183188
false

src/parser/mod.rs

Lines changed: 75 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -9539,13 +9539,11 @@ impl<'a> Parser<'a> {
95399539
legacy_options: &[CopyLegacyOption],
95409540
) -> Result<Vec<Vec<Option<String>>>, ParserError> {
95419541
let Token::CopyFromStdin(body) = self.next_token().token else {
9542-
return self.expected(
9543-
"COPY ... FROM STDIN with CSV body",
9544-
self.peek_token(),
9545-
);
9542+
return self.expected("COPY ... FROM STDIN with CSV body", self.peek_token());
95469543
};
95479544

95489545
let mut reader_builder = csv::ReaderBuilder::new();
9546+
reader_builder.has_headers(false);
95499547

95509548
let mut null_symbol = "\\N";
95519549

@@ -11336,80 +11334,69 @@ impl<'a> Parser<'a> {
1133611334
/// Return a tuple of the identifier and a boolean indicating it ends with a period.
1133711335
fn parse_unquoted_hyphenated_identifier(&mut self) -> Result<(Ident, bool), ParserError> {
1133811336
match self.peek_token().token {
11339-
Token::UnquotedDashStringLiteral(lit) => {
11340-
let span = self.next_token().span;
11341-
Ok((
11342-
Ident {
11343-
value: lit,
11344-
quote_style: None,
11345-
span,
11346-
},
11347-
false,
11348-
))
11349-
}
11350-
Token::Word(w) => {
11351-
let quote_style_is_none = w.quote_style.is_none();
11352-
let mut requires_whitespace = false;
11353-
let mut ident = w.into_ident(self.next_token().span);
11354-
if quote_style_is_none {
11355-
while matches!(self.peek_token().token, Token::Minus) {
11356-
unreachable!("Something went wrong in the tokenizer!");
11357-
// self.next_token();
11358-
// ident.value.push('-');
11359-
11360-
// let token = self
11361-
// .next_token_no_skip()
11362-
// .cloned()
11363-
// .unwrap_or(TokenWithSpan::wrap(Token::EOF));
11364-
// requires_whitespace = match token.token {
11365-
// Token::Word(next_word) if next_word.quote_style.is_none() => {
11366-
// ident.value.push_str(&next_word.value);
11367-
// false
11368-
// }
11369-
// Token::Number(s, false) => {
11370-
// // A number token can represent a decimal value ending with a period, e.g., `Number('123.')`.
11371-
// // However, for an [ObjectName], it is part of a hyphenated identifier, e.g., `foo-123.bar`.
11372-
// //
11373-
// // If a number token is followed by a period, it is part of an [ObjectName].
11374-
// // Return the identifier with `true` if the number token is followed by a period, indicating that
11375-
// // parsing should continue for the next part of the hyphenated identifier.
11376-
// if s.ends_with('.') {
11377-
// let Some(s) = s.split('.').next().filter(|s| {
11378-
// !s.is_empty() && s.chars().all(|c| c.is_ascii_digit())
11379-
// }) else {
11380-
// return self.expected(
11381-
// "continuation of hyphenated identifier",
11382-
// TokenWithSpan::new(Token::Number(s, false), token.span),
11383-
// );
11384-
// };
11385-
// ident.value.push_str(s);
11386-
// return Ok((ident, true));
11387-
// } else {
11388-
// ident.value.push_str(&s);
11389-
// }
11390-
// // If next token is period, then it is part of an ObjectName and we don't expect whitespace
11391-
// // after the number.
11392-
// !matches!(self.peek_token().token, Token::Period)
11393-
// }
11394-
// _ => {
11395-
// return self
11396-
// .expected("continuation of hyphenated identifier", token);
11397-
// }
11398-
// }
11399-
}
11400-
11401-
// If the last segment was a number, we must check that it's followed by whitespace,
11402-
// otherwise foo-123a will be parsed as `foo-123` with the alias `a`.
11403-
if requires_whitespace {
11404-
let token = self.next_token();
11405-
if !matches!(token.token, Token::EOF) {
11406-
return self
11407-
.expected("whitespace following hyphenated identifier", token);
11408-
}
11409-
}
11410-
}
11411-
Ok((ident, false))
11412-
}
11337+
// Token::Word(w) => {
11338+
// let quote_style_is_none = w.quote_style.is_none();
11339+
// let mut requires_whitespace = false;
11340+
// let mut ident = w.into_ident(self.next_token().span);
11341+
// if quote_style_is_none {
11342+
// while matches!(self.peek_token().token, Token::Minus) {
11343+
// unreachable!("Something went wrong in the tokenizer!");
11344+
// // self.next_token();
11345+
// // ident.value.push('-');
11346+
11347+
// // let token = self
11348+
// // .next_token_no_skip()
11349+
// // .cloned()
11350+
// // .unwrap_or(TokenWithSpan::wrap(Token::EOF));
11351+
// // requires_whitespace = match token.token {
11352+
// // Token::Word(next_word) if next_word.quote_style.is_none() => {
11353+
// // ident.value.push_str(&next_word.value);
11354+
// // false
11355+
// // }
11356+
// // Token::Number(s, false) => {
11357+
// // // A number token can represent a decimal value ending with a period, e.g., `Number('123.')`.
11358+
// // // However, for an [ObjectName], it is part of a hyphenated identifier, e.g., `foo-123.bar`.
11359+
// // //
11360+
// // // If a number token is followed by a period, it is part of an [ObjectName].
11361+
// // // Return the identifier with `true` if the number token is followed by a period, indicating that
11362+
// // // parsing should continue for the next part of the hyphenated identifier.
11363+
// // if s.ends_with('.') {
11364+
// // let Some(s) = s.split('.').next().filter(|s| {
11365+
// // !s.is_empty() && s.chars().all(|c| c.is_ascii_digit())
11366+
// // }) else {
11367+
// // return self.expected(
11368+
// // "continuation of hyphenated identifier",
11369+
// // TokenWithSpan::new(Token::Number(s, false), token.span),
11370+
// // );
11371+
// // };
11372+
// // ident.value.push_str(s);
11373+
// // return Ok((ident, true));
11374+
// // } else {
11375+
// // ident.value.push_str(&s);
11376+
// // }
11377+
// // // If next token is period, then it is part of an ObjectName and we don't expect whitespace
11378+
// // // after the number.
11379+
// // !matches!(self.peek_token().token, Token::Period)
11380+
// // }
11381+
// // _ => {
11382+
// // return self
11383+
// // .expected("continuation of hyphenated identifier", token);
11384+
// // }
11385+
// // }
11386+
// }
11387+
11388+
// // If the last segment was a number, we must check that it's followed by whitespace,
11389+
// // otherwise foo-123a will be parsed as `foo-123` with the alias `a`.
11390+
// if requires_whitespace {
11391+
// let token = self.next_token();
11392+
// if !matches!(token.token, Token::EOF) {
11393+
// return self
11394+
// .expected("whitespace following hyphenated identifier", token);
11395+
// }
11396+
// }
11397+
// }
11398+
// Ok((ident, false))
11399+
// }
1141311400
_ => Ok((self.parse_identifier()?, false)),
1141411401
}
1141511402
}
@@ -18530,9 +18517,17 @@ mod tests {
1853018517

1853118518
#[test]
1853218519
fn test_placeholder_invalid_whitespace() {
18533-
for w in [" ", " ", "/*invalid*/", "\n", "\t", "\r\n", "--comment\n"] {
18520+
for w in [
18521+
" ",
18522+
"/*invalid*/",
18523+
"\n",
18524+
"\t\t",
18525+
"\r\n",
18526+
"--comment\n",
18527+
"/* multi\nline\ncomment */",
18528+
] {
1853418529
let sql = format!("\nSELECT\n :{w}fooBar");
18535-
assert!(Parser::parse_sql(&GenericDialect, &sql).is_err());
18530+
assert!(Parser::parse_sql(&GenericDialect, &sql).is_err(), "Failed to error on when inserting the whitespace {w:?} within the placeholder SQL: `{sql}`");
1853618531
}
1853718532
}
1853818533
}

src/test_utils.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ impl TestedDialects {
154154
///
155155
/// For multiple statements, use [`statements_parse_to`].
156156
pub fn one_statement_parses_to(&self, sql: &str, canonical: &str) -> Statement {
157+
println!("Testing SQL: {}", sql);
157158
let mut statements = self.parse_sql_statements(sql).expect(sql);
158159
assert_eq!(statements.len(), 1);
159160
if !canonical.is_empty() && sql != canonical {

0 commit comments

Comments
 (0)