From 7e7ee341c7608355dc779137c22ce4ad9fcb30ea Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Fri, 8 May 2026 20:25:12 -0400 Subject: [PATCH] parser: validated unicode uescape character --- ...test__unicode_escape_ident_validation.snap | 110 +++++++++++++++++- ...est__unicode_escape_string_validation.snap | 110 +++++++++++++++++- crates/squawk_syntax/src/validation.rs | 41 +++++-- .../validation/unicode_escape_ident.sql | 6 + .../validation/unicode_escape_string.sql | 6 + crates/xtask/src/sync_pg.rs | 1 + postgres/regression_suite/strings.sql | 2 +- 7 files changed, 266 insertions(+), 10 deletions(-) diff --git a/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_ident_validation.snap b/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_ident_validation.snap index a819c9cd..0cc485d5 100644 --- a/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_ident_validation.snap +++ b/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_ident_validation.snap @@ -2,7 +2,7 @@ source: crates/squawk_syntax/src/test.rs input_file: crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql --- -SOURCE_FILE@0..243 +SOURCE_FILE@0..425 COMMENT@0..5 "-- ok" WHITESPACE@5..6 "\n" SELECT@6..32 @@ -113,6 +113,90 @@ SOURCE_FILE@0..243 IDENT@235..241 "U&\" \\\"" SEMICOLON@241..242 ";" WHITESPACE@242..243 "\n" + SELECT@243..270 + SELECT_CLAUSE@243..270 + SELECT_KW@243..249 "select" + WHITESPACE@249..250 " " + TARGET_LIST@250..270 + TARGET@250..270 + NAME_REF@250..270 + IDENT@250..259 "U&\"error\"" + WHITESPACE@259..260 " " + UESCAPE_KW@260..267 "UESCAPE" + WHITESPACE@267..268 " " + STRING@268..270 "''" + SEMICOLON@270..271 ";" + WHITESPACE@271..272 "\n" + SELECT@272..300 + SELECT_CLAUSE@272..300 + SELECT_KW@272..278 "select" + WHITESPACE@278..279 " " + TARGET_LIST@279..300 + TARGET@279..300 + NAME_REF@279..300 + IDENT@279..288 "U&\"error\"" + WHITESPACE@288..289 " " + UESCAPE_KW@289..296 "UESCAPE" + 
WHITESPACE@296..297 " " + STRING@297..300 "' '" + SEMICOLON@300..301 ";" + WHITESPACE@301..302 "\n" + SELECT@302..330 + SELECT_CLAUSE@302..330 + SELECT_KW@302..308 "select" + WHITESPACE@308..309 " " + TARGET_LIST@309..330 + TARGET@309..330 + NAME_REF@309..330 + IDENT@309..318 "U&\"error\"" + WHITESPACE@318..319 " " + UESCAPE_KW@319..326 "UESCAPE" + WHITESPACE@326..327 " " + STRING@327..330 "'+'" + SEMICOLON@330..331 ";" + WHITESPACE@331..332 "\n" + SELECT@332..360 + SELECT_CLAUSE@332..360 + SELECT_KW@332..338 "select" + WHITESPACE@338..339 " " + TARGET_LIST@339..360 + TARGET@339..360 + NAME_REF@339..360 + IDENT@339..348 "U&\"error\"" + WHITESPACE@348..349 " " + UESCAPE_KW@349..356 "UESCAPE" + WHITESPACE@356..357 " " + STRING@357..360 "'A'" + SEMICOLON@360..361 ";" + WHITESPACE@361..362 "\n" + SELECT@362..391 + SELECT_CLAUSE@362..391 + SELECT_KW@362..368 "select" + WHITESPACE@368..369 " " + TARGET_LIST@369..391 + TARGET@369..391 + NAME_REF@369..391 + IDENT@369..378 "U&\"error\"" + WHITESPACE@378..379 " " + UESCAPE_KW@379..386 "UESCAPE" + WHITESPACE@386..387 " " + STRING@387..391 "'é'" + SEMICOLON@391..392 ";" + WHITESPACE@392..393 "\n" + SELECT@393..423 + SELECT_CLAUSE@393..423 + SELECT_KW@393..399 "select" + WHITESPACE@399..400 " " + TARGET_LIST@400..423 + TARGET@400..423 + NAME_REF@400..423 + IDENT@400..409 "U&\"error\"" + WHITESPACE@409..410 " " + UESCAPE_KW@410..417 "UESCAPE" + WHITESPACE@417..418 " " + STRING@418..423 "'foo'" + SEMICOLON@423..424 ";" + WHITESPACE@424..425 "\n" error[syntax-error]: Unicode escape requires 4 hex digits: \XXXX ╭▸ @@ -134,3 +218,27 @@ error[syntax-error]: Invalid Unicode escape sequence ╭▸ 12 │ select U&" \"; ╰╴ ━━━━━━ +error[syntax-error]: Invalid unicode escape character + ╭▸ +13 │ select U&"error" UESCAPE ''; + ╰╴ ━━ +error[syntax-error]: Invalid unicode escape character + ╭▸ +14 │ select U&"error" UESCAPE ' '; + ╰╴ ━━━ +error[syntax-error]: Invalid unicode escape character + ╭▸ +15 │ select U&"error" UESCAPE '+'; + ╰╴ ━━━ 
+error[syntax-error]: Invalid unicode escape character + ╭▸ +16 │ select U&"error" UESCAPE 'A'; + ╰╴ ━━━ +error[syntax-error]: Invalid unicode escape character + ╭▸ +17 │ select U&"error" UESCAPE 'é'; + ╰╴ ━━━ +error[syntax-error]: Invalid unicode escape character + ╭▸ +18 │ select U&"error" UESCAPE 'foo'; + ╰╴ ━━━━━ diff --git a/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_string_validation.snap b/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_string_validation.snap index 8430a106..e8406329 100644 --- a/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_string_validation.snap +++ b/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_string_validation.snap @@ -2,7 +2,7 @@ source: crates/squawk_syntax/src/test.rs input_file: crates/squawk_syntax/test_data/validation/unicode_escape_string.sql --- -SOURCE_FILE@0..241 +SOURCE_FILE@0..423 COMMENT@0..5 "-- ok" WHITESPACE@5..6 "\n" SELECT@6..30 @@ -113,6 +113,90 @@ SOURCE_FILE@0..241 UNICODE_ESC_STRING@233..239 "U&' \\'" SEMICOLON@239..240 ";" WHITESPACE@240..241 "\n" + SELECT@241..268 + SELECT_CLAUSE@241..268 + SELECT_KW@241..247 "select" + WHITESPACE@247..248 " " + TARGET_LIST@248..268 + TARGET@248..268 + LITERAL@248..268 + UNICODE_ESC_STRING@248..257 "U&'error'" + WHITESPACE@257..258 " " + UESCAPE_KW@258..265 "UESCAPE" + WHITESPACE@265..266 " " + STRING@266..268 "''" + SEMICOLON@268..269 ";" + WHITESPACE@269..270 "\n" + SELECT@270..298 + SELECT_CLAUSE@270..298 + SELECT_KW@270..276 "select" + WHITESPACE@276..277 " " + TARGET_LIST@277..298 + TARGET@277..298 + LITERAL@277..298 + UNICODE_ESC_STRING@277..286 "U&'error'" + WHITESPACE@286..287 " " + UESCAPE_KW@287..294 "UESCAPE" + WHITESPACE@294..295 " " + STRING@295..298 "' '" + SEMICOLON@298..299 ";" + WHITESPACE@299..300 "\n" + SELECT@300..328 + SELECT_CLAUSE@300..328 + SELECT_KW@300..306 "select" + WHITESPACE@306..307 " " + TARGET_LIST@307..328 + TARGET@307..328 + LITERAL@307..328 + 
UNICODE_ESC_STRING@307..316 "U&'error'" + WHITESPACE@316..317 " " + UESCAPE_KW@317..324 "UESCAPE" + WHITESPACE@324..325 " " + STRING@325..328 "'+'" + SEMICOLON@328..329 ";" + WHITESPACE@329..330 "\n" + SELECT@330..358 + SELECT_CLAUSE@330..358 + SELECT_KW@330..336 "select" + WHITESPACE@336..337 " " + TARGET_LIST@337..358 + TARGET@337..358 + LITERAL@337..358 + UNICODE_ESC_STRING@337..346 "U&'error'" + WHITESPACE@346..347 " " + UESCAPE_KW@347..354 "UESCAPE" + WHITESPACE@354..355 " " + STRING@355..358 "'A'" + SEMICOLON@358..359 ";" + WHITESPACE@359..360 "\n" + SELECT@360..389 + SELECT_CLAUSE@360..389 + SELECT_KW@360..366 "select" + WHITESPACE@366..367 " " + TARGET_LIST@367..389 + TARGET@367..389 + LITERAL@367..389 + UNICODE_ESC_STRING@367..376 "U&'error'" + WHITESPACE@376..377 " " + UESCAPE_KW@377..384 "UESCAPE" + WHITESPACE@384..385 " " + STRING@385..389 "'é'" + SEMICOLON@389..390 ";" + WHITESPACE@390..391 "\n" + SELECT@391..421 + SELECT_CLAUSE@391..421 + SELECT_KW@391..397 "select" + WHITESPACE@397..398 " " + TARGET_LIST@398..421 + TARGET@398..421 + LITERAL@398..421 + UNICODE_ESC_STRING@398..407 "U&'error'" + WHITESPACE@407..408 " " + UESCAPE_KW@408..415 "UESCAPE" + WHITESPACE@415..416 " " + STRING@416..421 "'foo'" + SEMICOLON@421..422 ";" + WHITESPACE@422..423 "\n" error[syntax-error]: Unicode escape requires 4 hex digits: \XXXX ╭▸ @@ -134,3 +218,27 @@ error[syntax-error]: Invalid Unicode escape sequence ╭▸ 12 │ select U&' \'; ╰╴ ━━━━━━ +error[syntax-error]: Invalid unicode escape character + ╭▸ +13 │ select U&'error' UESCAPE ''; + ╰╴ ━━ +error[syntax-error]: Invalid unicode escape character + ╭▸ +14 │ select U&'error' UESCAPE ' '; + ╰╴ ━━━ +error[syntax-error]: Invalid unicode escape character + ╭▸ +15 │ select U&'error' UESCAPE '+'; + ╰╴ ━━━ +error[syntax-error]: Invalid unicode escape character + ╭▸ +16 │ select U&'error' UESCAPE 'A'; + ╰╴ ━━━ +error[syntax-error]: Invalid unicode escape character + ╭▸ +17 │ select U&'error' UESCAPE 'é'; + ╰╴ ━━━ 
+error[syntax-error]: Invalid unicode escape character + ╭▸ +18 │ select U&'error' UESCAPE 'foo'; + ╰╴ ━━━━━ diff --git a/crates/squawk_syntax/src/validation.rs b/crates/squawk_syntax/src/validation.rs index 0a52ee95..0ef1d63a 100644 --- a/crates/squawk_syntax/src/validation.rs +++ b/crates/squawk_syntax/src/validation.rs @@ -192,7 +192,15 @@ fn validate_unicode_esc_string(lit: &ast::Literal) -> Option<SyntaxError> { UNICODE_ESC_STRING => unicode_esc = Some(token), UESCAPE_KW => seen_uescape = true, STRING if seen_uescape => { - escape_char = uescape_char(&token).unwrap_or(escape_char); + escape_char = match uescape_char(&token) { + Some(ch) => ch, + None => { + return Some(SyntaxError::new( + "Invalid unicode escape character", + token.text_range(), + )); + } + }; break; } _ => (), @@ -224,7 +232,15 @@ fn validate_unicode_esc_ident(token: &SyntaxToken) -> Option<SyntaxError> { UESCAPE_KW => seen_uescape = true, STRING if seen_uescape => { if let Some(string_token) = element.as_token() { - escape_char = uescape_char(string_token).unwrap_or(escape_char); + escape_char = match uescape_char(string_token) { + Some(ch) => ch, + None => { + return Some(SyntaxError::new( + "Invalid unicode escape character", + string_token.text_range(), + )); + } + }; } break; } @@ -237,14 +253,25 @@ fn validate_unicode_esc_ident(token: &SyntaxToken) -> Option<SyntaxError> { Some(SyntaxError::new(err.to_string(), token.text_range())) } +// https://github.com/postgres/postgres/blob/228a1f9542792c6533ef74c2e7aefad0da1d9a7a/src/backend/parser/parser.c#L350 +const fn is_valid_uescape_char(byte: u8) -> bool { + !byte.is_ascii_hexdigit() + && byte != b'+' + && byte != b'\'' + && byte != b'"' + && !matches!( + byte, + b' ' | b'\t' | b'\n' | b'\r' | /* b'\v' */ 0x0B | /* b'\f' */ 0x0C + ) +} + fn uescape_char(string_token: &SyntaxToken) -> Option<char> { let text = string_token.text(); let inner = text.strip_prefix('\'')?.strip_suffix('\'')?; - let mut chars = inner.chars(); - match (chars.next(), chars.next()) { - (Some(c), None) => 
Some(c), - _ => None, - } + let &[byte] = inner.as_bytes() else { + return None; + }; + is_valid_uescape_char(byte).then(|| char::from(byte)) } enum UnicodeEscapeKind { diff --git a/crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql b/crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql index 6c14611a..572d1322 100644 --- a/crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql +++ b/crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql @@ -10,3 +10,9 @@ select U&"\+0061"; select U&"wrong: \06" UESCAPE '\'; select U&"wrong: !061" UESCAPE '!'; select U&" \"; +select U&"error" UESCAPE ''; +select U&"error" UESCAPE ' '; +select U&"error" UESCAPE '+'; +select U&"error" UESCAPE 'A'; +select U&"error" UESCAPE 'é'; +select U&"error" UESCAPE 'foo'; diff --git a/crates/squawk_syntax/test_data/validation/unicode_escape_string.sql b/crates/squawk_syntax/test_data/validation/unicode_escape_string.sql index 5e05a65d..58966f9e 100644 --- a/crates/squawk_syntax/test_data/validation/unicode_escape_string.sql +++ b/crates/squawk_syntax/test_data/validation/unicode_escape_string.sql @@ -10,3 +10,9 @@ select U&'\+0061'; select U&'wrong: \06' UESCAPE '\'; select U&'wrong: !061' UESCAPE '!'; select U&' \'; +select U&'error' UESCAPE ''; +select U&'error' UESCAPE ' '; +select U&'error' UESCAPE '+'; +select U&'error' UESCAPE 'A'; +select U&'error' UESCAPE 'é'; +select U&'error' UESCAPE 'foo'; diff --git a/crates/xtask/src/sync_pg.rs b/crates/xtask/src/sync_pg.rs index 28b3e124..32f2f072 100644 --- a/crates/xtask/src/sync_pg.rs +++ b/crates/xtask/src/sync_pg.rs @@ -51,6 +51,7 @@ const IGNORED_LINES: &[&str] = &[ "CREATE SUBSCRIPTION regress_testsub CONNECTION 'foo';", "CREATE SUBSCRIPTION regress_testsub PUBLICATION foo;", "SELECT U&'wrong: +0061' UESCAPE +;", + "SELECT U&'wrong: +0061' UESCAPE '+';", "CREATE STATISTICS tst;", "CREATE STATISTICS tst ON a, b;", "CREATE STATISTICS tst ON a FROM (VALUES (x)) AS foo;", diff --git 
a/postgres/regression_suite/strings.sql b/postgres/regression_suite/strings.sql index fde53672..bef269f6 100644 --- a/postgres/regression_suite/strings.sql +++ b/postgres/regression_suite/strings.sql @@ -27,7 +27,7 @@ SELECT 'tricky' AS U&"\" UESCAPE '!'; -- SELECT U&'wrong: \061'; -- SELECT U&'wrong: \+0061'; -- SELECT U&'wrong: +0061' UESCAPE +; -SELECT U&'wrong: +0061' UESCAPE '+'; +-- SELECT U&'wrong: +0061' UESCAPE '+'; -- SELECT U&'wrong: \db99'; -- SELECT U&'wrong: \db99xy';