From 5663464066ecdae4385be32d4be65993c7e76de3 Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Fri, 8 May 2026 19:14:31 -0400 Subject: [PATCH] parser: validate escape sequences for unicode escape ident --- ...test__unicode_escape_ident_validation.snap | 136 ++++++++++++++++++ crates/squawk_syntax/src/validation.rs | 59 ++++++-- .../validation/unicode_escape_ident.sql | 12 ++ 3 files changed, 197 insertions(+), 10 deletions(-) create mode 100644 crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_ident_validation.snap create mode 100644 crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql diff --git a/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_ident_validation.snap b/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_ident_validation.snap new file mode 100644 index 00000000..a819c9cd --- /dev/null +++ b/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_ident_validation.snap @@ -0,0 +1,136 @@ +--- +source: crates/squawk_syntax/src/test.rs +input_file: crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql +--- +SOURCE_FILE@0..243 + COMMENT@0..5 "-- ok" + WHITESPACE@5..6 "\n" + SELECT@6..32 + SELECT_CLAUSE@6..32 + SELECT_KW@6..12 "select" + WHITESPACE@12..13 " " + TARGET_LIST@13..32 + TARGET@13..32 + NAME_REF@13..32 + IDENT@13..32 "U&\"d\\0061t\\+000061\"" + SEMICOLON@32..33 ";" + WHITESPACE@33..34 "\n" + SELECT@34..47 + SELECT_CLAUSE@34..47 + SELECT_KW@34..40 "select" + WHITESPACE@40..41 " " + TARGET_LIST@41..47 + TARGET@41..47 + NAME_REF@41..47 + IDENT@41..47 "U&\"\\\\\"" + SEMICOLON@47..48 ";" + WHITESPACE@48..49 "\n" + SELECT@49..81 + SELECT_CLAUSE@49..81 + SELECT_KW@49..55 "select" + WHITESPACE@55..56 " " + TARGET_LIST@56..81 + TARGET@56..81 + NAME_REF@56..81 + IDENT@56..69 "U&\"ok: !0061\"" + WHITESPACE@69..70 " " + UESCAPE_KW@70..77 "UESCAPE" + WHITESPACE@77..78 " " + STRING@78..81 "'!'" + SEMICOLON@81..82 ";" + WHITESPACE@82..83 "\n" + 
SELECT@83..108 + SELECT_CLAUSE@83..108 + SELECT_KW@83..89 "select" + WHITESPACE@89..90 " " + TARGET_LIST@90..108 + TARGET@90..108 + NAME_REF@90..108 + IDENT@90..96 "U&\" \\\"" + WHITESPACE@96..97 " " + UESCAPE_KW@97..104 "UESCAPE" + WHITESPACE@104..105 " " + STRING@105..108 "'!'" + SEMICOLON@108..109 ";" + WHITESPACE@109..111 "\n\n" + COMMENT@111..120 "-- errors" + WHITESPACE@120..121 "\n" + SELECT@121..136 + SELECT_CLAUSE@121..136 + SELECT_KW@121..127 "select" + WHITESPACE@127..128 " " + TARGET_LIST@128..136 + TARGET@128..136 + NAME_REF@128..136 + IDENT@128..136 "U&\"\\006\"" + SEMICOLON@136..137 ";" + WHITESPACE@137..138 "\n" + SELECT@138..155 + SELECT_CLAUSE@138..155 + SELECT_KW@138..144 "select" + WHITESPACE@144..145 " " + TARGET_LIST@145..155 + TARGET@145..155 + NAME_REF@145..155 + IDENT@145..155 "U&\"\\+0061\"" + SEMICOLON@155..156 ";" + WHITESPACE@156..157 "\n" + SELECT@157..190 + SELECT_CLAUSE@157..190 + SELECT_KW@157..163 "select" + WHITESPACE@163..164 " " + TARGET_LIST@164..190 + TARGET@164..190 + NAME_REF@164..190 + IDENT@164..178 "U&\"wrong: \\06\"" + WHITESPACE@178..179 " " + UESCAPE_KW@179..186 "UESCAPE" + WHITESPACE@186..187 " " + STRING@187..190 "'\\'" + SEMICOLON@190..191 ";" + WHITESPACE@191..192 "\n" + SELECT@192..226 + SELECT_CLAUSE@192..226 + SELECT_KW@192..198 "select" + WHITESPACE@198..199 " " + TARGET_LIST@199..226 + TARGET@199..226 + NAME_REF@199..226 + IDENT@199..214 "U&\"wrong: !061\"" + WHITESPACE@214..215 " " + UESCAPE_KW@215..222 "UESCAPE" + WHITESPACE@222..223 " " + STRING@223..226 "'!'" + SEMICOLON@226..227 ";" + WHITESPACE@227..228 "\n" + SELECT@228..241 + SELECT_CLAUSE@228..241 + SELECT_KW@228..234 "select" + WHITESPACE@234..235 " " + TARGET_LIST@235..241 + TARGET@235..241 + NAME_REF@235..241 + IDENT@235..241 "U&\" \\\"" + SEMICOLON@241..242 ";" + WHITESPACE@242..243 "\n" + +error[syntax-error]: Unicode escape requires 4 hex digits: \XXXX + ╭▸ +8 │ select U&"\006"; + ╰╴ ━━━━━━━━ +error[syntax-error]: Unicode escape requires 6 hex 
digits: \+XXXXXX + ╭▸ +9 │ select U&"\+0061"; + ╰╴ ━━━━━━━━━━ +error[syntax-error]: Unicode escape requires 4 hex digits: \XXXX + ╭▸ +10 │ select U&"wrong: \06" UESCAPE '\'; + ╰╴ ━━━━━━━━━━━━━━ +error[syntax-error]: Unicode escape requires 4 hex digits: !XXXX + ╭▸ +11 │ select U&"wrong: !061" UESCAPE '!'; + ╰╴ ━━━━━━━━━━━━━━━ +error[syntax-error]: Invalid Unicode escape sequence + ╭▸ +12 │ select U&" \"; + ╰╴ ━━━━━━ diff --git a/crates/squawk_syntax/src/validation.rs b/crates/squawk_syntax/src/validation.rs index 450b5772..0a52ee95 100644 --- a/crates/squawk_syntax/src/validation.rs +++ b/crates/squawk_syntax/src/validation.rs @@ -8,7 +8,7 @@ use std::fmt; use std::ops::RangeInclusive; use crate::ast::AstNode; -use crate::{SyntaxNode, ast, match_ast, syntax_error::SyntaxError}; +use crate::{SyntaxNode, SyntaxToken, ast, match_ast, syntax_error::SyntaxError}; use rowan::{TextRange, TextSize}; use squawk_parser::SyntaxKind::*; pub(crate) fn validate(root: &SyntaxNode, errors: &mut Vec<SyntaxError>) { @@ -30,6 +30,14 @@ pub(crate) fn validate(root: &SyntaxNode, errors: &mut Vec<SyntaxError>) { } } } + for element in root.descendants_with_tokens() { + if let Some(token) = element.into_token() + && token.kind() == IDENT + && let Some(err) = validate_unicode_esc_ident(&token) + { + errors.push(err); + } + } } fn validate_select(it: ast::Select, acc: &mut Vec<SyntaxError>) { @@ -184,15 +192,7 @@ fn validate_unicode_esc_string(lit: &ast::Literal) -> Option<SyntaxError> { UNICODE_ESC_STRING => unicode_esc = Some(token), UESCAPE_KW => seen_uescape = true, STRING if seen_uescape => { - let text = token.text(); - let inner = text - .strip_prefix('\'') - .and_then(|s| s.strip_suffix('\'')) - .unwrap_or(""); - let mut chars = inner.chars(); - if let (Some(c), None) = (chars.next(), chars.next()) { - escape_char = c; - } + escape_char = uescape_char(&token).unwrap_or(escape_char); break; } _ => (), @@ -208,6 +208,45 @@ fn validate_unicode_esc_string(lit: &ast::Literal) -> Option<SyntaxError> { Some(SyntaxError::new(err.to_string(), 
token.text_range())) } +fn validate_unicode_esc_ident(token: &SyntaxToken) -> Option<SyntaxError> { + let text = token.text(); + let inside = text + .strip_prefix("U&\"") + .or_else(|| text.strip_prefix("u&\"")) + .and_then(|s| s.strip_suffix('"'))?; + + let mut escape_char = '\\'; + let mut seen_uescape = false; + let mut next = token.next_sibling_or_token(); + while let Some(element) = next { + match element.kind() { + WHITESPACE | COMMENT => (), + UESCAPE_KW => seen_uescape = true, + STRING if seen_uescape => { + if let Some(string_token) = element.as_token() { + escape_char = uescape_char(string_token).unwrap_or(escape_char); + } + break; + } + _ => break, + } + next = element.next_sibling_or_token(); + } + + let err = check_unicode_esc_str(inside, escape_char)?; + Some(SyntaxError::new(err.to_string(), token.text_range())) +} + +fn uescape_char(string_token: &SyntaxToken) -> Option<char> { + let text = string_token.text(); + let inner = text.strip_prefix('\'')?.strip_suffix('\'')?; + let mut chars = inner.chars(); + match (chars.next(), chars.next()) { + (Some(c), None) => Some(c), + _ => None, + } +} + enum UnicodeEscapeKind { Short, Extended, diff --git a/crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql b/crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql new file mode 100644 index 00000000..6c14611a --- /dev/null +++ b/crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql @@ -0,0 +1,12 @@ +-- ok +select U&"d\0061t\+000061"; +select U&"\\"; +select U&"ok: !0061" UESCAPE '!'; +select U&" \" UESCAPE '!'; + +-- errors +select U&"\006"; +select U&"\+0061"; +select U&"wrong: \06" UESCAPE '\'; +select U&"wrong: !061" UESCAPE '!'; +select U&" \";