diff --git a/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_ident_validation.snap b/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_ident_validation.snap index 0cc485d5..d63531e7 100644 --- a/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_ident_validation.snap +++ b/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_ident_validation.snap @@ -2,7 +2,7 @@ source: crates/squawk_syntax/src/test.rs input_file: crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql --- -SOURCE_FILE@0..425 +SOURCE_FILE@0..489 COMMENT@0..5 "-- ok" WHITESPACE@5..6 "\n" SELECT@6..32 @@ -103,142 +103,178 @@ SOURCE_FILE@0..425 STRING@223..226 "'!'" SEMICOLON@226..227 ";" WHITESPACE@227..228 "\n" - SELECT@228..241 - SELECT_CLAUSE@228..241 + SELECT@228..262 + SELECT_CLAUSE@228..262 SELECT_KW@228..234 "select" WHITESPACE@234..235 " " - TARGET_LIST@235..241 - TARGET@235..241 - NAME_REF@235..241 - IDENT@235..241 "U&\" \\\"" - SEMICOLON@241..242 ";" - WHITESPACE@242..243 "\n" - SELECT@243..270 - SELECT_CLAUSE@243..270 - SELECT_KW@243..249 "select" - WHITESPACE@249..250 " " - TARGET_LIST@250..270 - TARGET@250..270 - NAME_REF@250..270 - IDENT@250..259 "U&\"error\"" - WHITESPACE@259..260 " " - UESCAPE_KW@260..267 "UESCAPE" - WHITESPACE@267..268 " " - STRING@268..270 "''" - SEMICOLON@270..271 ";" - WHITESPACE@271..272 "\n" - SELECT@272..300 - SELECT_CLAUSE@272..300 - SELECT_KW@272..278 "select" - WHITESPACE@278..279 " " - TARGET_LIST@279..300 - TARGET@279..300 - NAME_REF@279..300 - IDENT@279..288 "U&\"error\"" - WHITESPACE@288..289 " " - UESCAPE_KW@289..296 "UESCAPE" - WHITESPACE@296..297 " " - STRING@297..300 "' '" - SEMICOLON@300..301 ";" - WHITESPACE@301..302 "\n" - SELECT@302..330 - SELECT_CLAUSE@302..330 - SELECT_KW@302..308 "select" - WHITESPACE@308..309 " " - TARGET_LIST@309..330 - TARGET@309..330 - NAME_REF@309..330 - IDENT@309..318 "U&\"error\"" - WHITESPACE@318..319 " " - UESCAPE_KW@319..326 "UESCAPE" - WHITESPACE@326..327 " " - STRING@327..330 "'+'" - SEMICOLON@330..331 ";" - WHITESPACE@331..332 "\n" - SELECT@332..360 - SELECT_CLAUSE@332..360 - SELECT_KW@332..338 "select" - WHITESPACE@338..339 " " - TARGET_LIST@339..360 - TARGET@339..360 - NAME_REF@339..360 - IDENT@339..348 "U&\"error\"" - WHITESPACE@348..349 " " - UESCAPE_KW@349..356 "UESCAPE" - WHITESPACE@356..357 " " - STRING@357..360 "'A'" - SEMICOLON@360..361 ";" - WHITESPACE@361..362 "\n" - SELECT@362..391 - SELECT_CLAUSE@362..391 - SELECT_KW@362..368 "select" - WHITESPACE@368..369 " " - TARGET_LIST@369..391 - TARGET@369..391 - NAME_REF@369..391 - IDENT@369..378 "U&\"error\"" - WHITESPACE@378..379 " " - UESCAPE_KW@379..386 "UESCAPE" - WHITESPACE@386..387 " " - STRING@387..391 "'é'" - SEMICOLON@391..392 ";" - WHITESPACE@392..393 "\n" - SELECT@393..423 - SELECT_CLAUSE@393..423 - SELECT_KW@393..399 "select" - WHITESPACE@399..400 " " - TARGET_LIST@400..423 - TARGET@400..423 - NAME_REF@400..423 - IDENT@400..409 "U&\"error\"" - WHITESPACE@409..410 " " - UESCAPE_KW@410..417 "UESCAPE" - WHITESPACE@417..418 " " - STRING@418..423 "'foo'" - SEMICOLON@423..424 ";" - WHITESPACE@424..425 "\n" + TARGET_LIST@235..262 + TARGET@235..262 + NAME_REF@235..262 + IDENT@235..262 "U&\"many: \\061 \\+0061 ..." + SEMICOLON@262..263 ";" + WHITESPACE@263..264 "\n" + SELECT@264..290 + SELECT_CLAUSE@264..290 + SELECT_KW@264..270 "select" + WHITESPACE@270..271 " " + TARGET_LIST@271..290 + TARGET@271..290 + NAME_REF@271..290 + IDENT@271..290 "U&\"\\D800\\D801\\DC00\"" + SEMICOLON@290..291 ";" + WHITESPACE@291..292 "\n" + SELECT@292..305 + SELECT_CLAUSE@292..305 + SELECT_KW@292..298 "select" + WHITESPACE@298..299 " " + TARGET_LIST@299..305 + TARGET@299..305 + NAME_REF@299..305 + IDENT@299..305 "U&\" \\\"" + SEMICOLON@305..306 ";" + WHITESPACE@306..307 "\n" + SELECT@307..334 + SELECT_CLAUSE@307..334 + SELECT_KW@307..313 "select" + WHITESPACE@313..314 " " + TARGET_LIST@314..334 + TARGET@314..334 + NAME_REF@314..334 + IDENT@314..323 "U&\"error\"" + WHITESPACE@323..324 " " + UESCAPE_KW@324..331 "UESCAPE" + WHITESPACE@331..332 " " + STRING@332..334 "''" + SEMICOLON@334..335 ";" + WHITESPACE@335..336 "\n" + SELECT@336..364 + SELECT_CLAUSE@336..364 + SELECT_KW@336..342 "select" + WHITESPACE@342..343 " " + TARGET_LIST@343..364 + TARGET@343..364 + NAME_REF@343..364 + IDENT@343..352 "U&\"error\"" + WHITESPACE@352..353 " " + UESCAPE_KW@353..360 "UESCAPE" + WHITESPACE@360..361 " " + STRING@361..364 "' '" + SEMICOLON@364..365 ";" + WHITESPACE@365..366 "\n" + SELECT@366..394 + SELECT_CLAUSE@366..394 + SELECT_KW@366..372 "select" + WHITESPACE@372..373 " " + TARGET_LIST@373..394 + TARGET@373..394 + NAME_REF@373..394 + IDENT@373..382 "U&\"error\"" + WHITESPACE@382..383 " " + UESCAPE_KW@383..390 "UESCAPE" + WHITESPACE@390..391 " " + STRING@391..394 "'+'" + SEMICOLON@394..395 ";" + WHITESPACE@395..396 "\n" + SELECT@396..424 + SELECT_CLAUSE@396..424 + SELECT_KW@396..402 "select" + WHITESPACE@402..403 " " + TARGET_LIST@403..424 + TARGET@403..424 + NAME_REF@403..424 + IDENT@403..412 "U&\"error\"" + WHITESPACE@412..413 " " + UESCAPE_KW@413..420 "UESCAPE" + WHITESPACE@420..421 " " + STRING@421..424 "'A'" + SEMICOLON@424..425 ";" + WHITESPACE@425..426 "\n" + SELECT@426..455 + SELECT_CLAUSE@426..455 + SELECT_KW@426..432 "select" + WHITESPACE@432..433 " " + TARGET_LIST@433..455 + TARGET@433..455 + NAME_REF@433..455 + IDENT@433..442 "U&\"error\"" + WHITESPACE@442..443 " " + UESCAPE_KW@443..450 "UESCAPE" + WHITESPACE@450..451 " " + STRING@451..455 "'é'" + SEMICOLON@455..456 ";" + WHITESPACE@456..457 "\n" + SELECT@457..487 + SELECT_CLAUSE@457..487 + SELECT_KW@457..463 "select" + WHITESPACE@463..464 " " + TARGET_LIST@464..487 + TARGET@464..487 + NAME_REF@464..487 + IDENT@464..473 "U&\"error\"" + WHITESPACE@473..474 " " + UESCAPE_KW@474..481 "UESCAPE" + WHITESPACE@481..482 " " + STRING@482..487 "'foo'" + SEMICOLON@487..488 ";" + WHITESPACE@488..489 "\n" error[syntax-error]: Unicode escape requires 4 hex digits: \XXXX ╭▸ 8 │ select U&"\006"; - ╰╴ ━━━━━━━━ + ╰╴ ━━━━ error[syntax-error]: Unicode escape requires 6 hex digits: \+XXXXXX ╭▸ 9 │ select U&"\+0061"; - ╰╴ ━━━━━━━━━━ + ╰╴ ━━━━━━ error[syntax-error]: Unicode escape requires 4 hex digits: \XXXX ╭▸ 10 │ select U&"wrong: \06" UESCAPE '\'; - ╰╴ ━━━━━━━━━━━━━━ + ╰╴ ━━━ error[syntax-error]: Unicode escape requires 4 hex digits: !XXXX ╭▸ 11 │ select U&"wrong: !061" UESCAPE '!'; - ╰╴ ━━━━━━━━━━━━━━━ + ╰╴ ━━━━ +error[syntax-error]: Unicode escape requires 4 hex digits: \XXXX + ╭▸ +12 │ select U&"many: \061 \+0061 \zzzz"; + ╰╴ ━━━━ +error[syntax-error]: Unicode escape requires 6 hex digits: \+XXXXXX + ╭▸ +12 │ select U&"many: \061 \+0061 \zzzz"; + ╰╴ ━━━━━━ +error[syntax-error]: Invalid Unicode escape sequence + ╭▸ +12 │ select U&"many: \061 \+0061 \zzzz"; + ╰╴ ━━ +error[syntax-error]: Invalid Unicode surrogate pair + ╭▸ +13 │ select U&"\D800\D801\DC00"; + ╰╴ ━━━━━━━━━━ error[syntax-error]: Invalid Unicode escape sequence ╭▸ -12 │ select U&" \"; - ╰╴ ━━━━━━ +14 │ select U&" \"; + ╰╴ ━ error[syntax-error]: Invalid unicode escape character ╭▸ -13 │ select U&"error" UESCAPE ''; +15 │ select U&"error" UESCAPE ''; ╰╴ ━━ error[syntax-error]: Invalid unicode escape character ╭▸ -14 │ select U&"error" UESCAPE ' '; +16 │ select U&"error" UESCAPE ' '; ╰╴ ━━━ error[syntax-error]: Invalid unicode escape character ╭▸ -15 │ select U&"error" UESCAPE '+'; +17 │ select U&"error" UESCAPE '+'; ╰╴ ━━━ error[syntax-error]: Invalid unicode escape character ╭▸ -16 │ select U&"error" UESCAPE 'A'; +18 │ select U&"error" UESCAPE 'A'; ╰╴ ━━━ error[syntax-error]: Invalid unicode escape character ╭▸ -17 │ select U&"error" UESCAPE 'é'; +19 │ select U&"error" UESCAPE 'é'; ╰╴ ━━━ error[syntax-error]: Invalid unicode escape character ╭▸ -18 │ select U&"error" UESCAPE 'foo'; +20 │ select U&"error" UESCAPE 'foo'; ╰╴ ━━━━━ diff --git a/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_string_validation.snap b/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_string_validation.snap index e8406329..b8565310 100644 --- a/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_string_validation.snap +++ b/crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_string_validation.snap @@ -2,7 +2,7 @@ source: crates/squawk_syntax/src/test.rs input_file: crates/squawk_syntax/test_data/validation/unicode_escape_string.sql --- -SOURCE_FILE@0..423 +SOURCE_FILE@0..507 COMMENT@0..5 "-- ok" WHITESPACE@5..6 "\n" SELECT@6..30 @@ -103,142 +103,192 @@ SOURCE_FILE@0..423 STRING@221..224 "'!'" SEMICOLON@224..225 ";" WHITESPACE@225..226 "\n" - SELECT@226..239 - SELECT_CLAUSE@226..239 + SELECT@226..260 + SELECT_CLAUSE@226..260 SELECT_KW@226..232 "select" WHITESPACE@232..233 " " - TARGET_LIST@233..239 - TARGET@233..239 - LITERAL@233..239 - UNICODE_ESC_STRING@233..239 "U&' \\'" - SEMICOLON@239..240 ";" - WHITESPACE@240..241 "\n" - SELECT@241..268 - SELECT_CLAUSE@241..268 - SELECT_KW@241..247 "select" - WHITESPACE@247..248 " " - TARGET_LIST@248..268 - TARGET@248..268 - LITERAL@248..268 - UNICODE_ESC_STRING@248..257 "U&'error'" - WHITESPACE@257..258 " " - UESCAPE_KW@258..265 "UESCAPE" - WHITESPACE@265..266 " " - STRING@266..268 "''" - SEMICOLON@268..269 ";" - WHITESPACE@269..270 "\n" - SELECT@270..298 - SELECT_CLAUSE@270..298 - SELECT_KW@270..276 "select" - WHITESPACE@276..277 " " - TARGET_LIST@277..298 - TARGET@277..298 - LITERAL@277..298 - UNICODE_ESC_STRING@277..286 "U&'error'" - WHITESPACE@286..287 " " - UESCAPE_KW@287..294 "UESCAPE" - WHITESPACE@294..295 " " - STRING@295..298 "' '" - SEMICOLON@298..299 ";" - WHITESPACE@299..300 "\n" - SELECT@300..328 - SELECT_CLAUSE@300..328 - SELECT_KW@300..306 "select" - WHITESPACE@306..307 " " - TARGET_LIST@307..328 - TARGET@307..328 - LITERAL@307..328 - UNICODE_ESC_STRING@307..316 "U&'error'" - WHITESPACE@316..317 " " - UESCAPE_KW@317..324 "UESCAPE" - WHITESPACE@324..325 " " - STRING@325..328 "'+'" - SEMICOLON@328..329 ";" - WHITESPACE@329..330 "\n" - SELECT@330..358 - SELECT_CLAUSE@330..358 - SELECT_KW@330..336 "select" - WHITESPACE@336..337 " " - TARGET_LIST@337..358 - TARGET@337..358 - LITERAL@337..358 - UNICODE_ESC_STRING@337..346 "U&'error'" - WHITESPACE@346..347 " " - UESCAPE_KW@347..354 "UESCAPE" - WHITESPACE@354..355 " " - STRING@355..358 "'A'" - SEMICOLON@358..359 ";" - WHITESPACE@359..360 "\n" - SELECT@360..389 - SELECT_CLAUSE@360..389 - SELECT_KW@360..366 "select" - WHITESPACE@366..367 " " - TARGET_LIST@367..389 - TARGET@367..389 - LITERAL@367..389 - UNICODE_ESC_STRING@367..376 "U&'error'" - WHITESPACE@376..377 " " - UESCAPE_KW@377..384 "UESCAPE" - WHITESPACE@384..385 " " - STRING@385..389 "'é'" - SEMICOLON@389..390 ";" - WHITESPACE@390..391 "\n" - SELECT@391..421 - SELECT_CLAUSE@391..421 - SELECT_KW@391..397 "select" - WHITESPACE@397..398 " " - TARGET_LIST@398..421 - TARGET@398..421 - LITERAL@398..421 - UNICODE_ESC_STRING@398..407 "U&'error'" - WHITESPACE@407..408 " " - UESCAPE_KW@408..415 "UESCAPE" - WHITESPACE@415..416 " " - STRING@416..421 "'foo'" - SEMICOLON@421..422 ";" - WHITESPACE@422..423 "\n" + TARGET_LIST@233..260 + TARGET@233..260 + LITERAL@233..260 + UNICODE_ESC_STRING@233..260 "U&'many: \\061 \\+0061 ..." + SEMICOLON@260..261 ";" + WHITESPACE@261..262 "\n" + SELECT@262..280 + SELECT_CLAUSE@262..280 + SELECT_KW@262..268 "select" + WHITESPACE@268..269 " " + TARGET_LIST@269..280 + TARGET@269..280 + LITERAL@269..280 + UNICODE_ESC_STRING@269..280 "U&'\\D800\\\\'" + SEMICOLON@280..281 ";" + WHITESPACE@281..282 "\n" + SELECT@282..308 + SELECT_CLAUSE@282..308 + SELECT_KW@282..288 "select" + WHITESPACE@288..289 " " + TARGET_LIST@289..308 + TARGET@289..308 + LITERAL@289..308 + UNICODE_ESC_STRING@289..308 "U&'\\D800\\D801\\DC00'" + SEMICOLON@308..309 ";" + WHITESPACE@309..310 "\n" + SELECT@310..323 + SELECT_CLAUSE@310..323 + SELECT_KW@310..316 "select" + WHITESPACE@316..317 " " + TARGET_LIST@317..323 + TARGET@317..323 + LITERAL@317..323 + UNICODE_ESC_STRING@317..323 "U&' \\'" + SEMICOLON@323..324 ";" + WHITESPACE@324..325 "\n" + SELECT@325..352 + SELECT_CLAUSE@325..352 + SELECT_KW@325..331 "select" + WHITESPACE@331..332 " " + TARGET_LIST@332..352 + TARGET@332..352 + LITERAL@332..352 + UNICODE_ESC_STRING@332..341 "U&'error'" + WHITESPACE@341..342 " " + UESCAPE_KW@342..349 "UESCAPE" + WHITESPACE@349..350 " " + STRING@350..352 "''" + SEMICOLON@352..353 ";" + WHITESPACE@353..354 "\n" + SELECT@354..382 + SELECT_CLAUSE@354..382 + SELECT_KW@354..360 "select" + WHITESPACE@360..361 " " + TARGET_LIST@361..382 + TARGET@361..382 + LITERAL@361..382 + UNICODE_ESC_STRING@361..370 "U&'error'" + WHITESPACE@370..371 " " + UESCAPE_KW@371..378 "UESCAPE" + WHITESPACE@378..379 " " + STRING@379..382 "' '" + SEMICOLON@382..383 ";" + WHITESPACE@383..384 "\n" + SELECT@384..412 + SELECT_CLAUSE@384..412 + SELECT_KW@384..390 "select" + WHITESPACE@390..391 " " + TARGET_LIST@391..412 + TARGET@391..412 + LITERAL@391..412 + UNICODE_ESC_STRING@391..400 "U&'error'" + WHITESPACE@400..401 " " + UESCAPE_KW@401..408 "UESCAPE" + WHITESPACE@408..409 " " + STRING@409..412 "'+'" + SEMICOLON@412..413 ";" + WHITESPACE@413..414 "\n" + SELECT@414..442 + SELECT_CLAUSE@414..442 + SELECT_KW@414..420 "select" + WHITESPACE@420..421 " " + TARGET_LIST@421..442 + TARGET@421..442 + LITERAL@421..442 + UNICODE_ESC_STRING@421..430 "U&'error'" + WHITESPACE@430..431 " " + UESCAPE_KW@431..438 "UESCAPE" + WHITESPACE@438..439 " " + STRING@439..442 "'A'" + SEMICOLON@442..443 ";" + WHITESPACE@443..444 "\n" + SELECT@444..473 + SELECT_CLAUSE@444..473 + SELECT_KW@444..450 "select" + WHITESPACE@450..451 " " + TARGET_LIST@451..473 + TARGET@451..473 + LITERAL@451..473 + UNICODE_ESC_STRING@451..460 "U&'error'" + WHITESPACE@460..461 " " + UESCAPE_KW@461..468 "UESCAPE" + WHITESPACE@468..469 " " + STRING@469..473 "'é'" + SEMICOLON@473..474 ";" + WHITESPACE@474..475 "\n" + SELECT@475..505 + SELECT_CLAUSE@475..505 + SELECT_KW@475..481 "select" + WHITESPACE@481..482 " " + TARGET_LIST@482..505 + TARGET@482..505 + LITERAL@482..505 + UNICODE_ESC_STRING@482..491 "U&'error'" + WHITESPACE@491..492 " " + UESCAPE_KW@492..499 "UESCAPE" + WHITESPACE@499..500 " " + STRING@500..505 "'foo'" + SEMICOLON@505..506 ";" + WHITESPACE@506..507 "\n" error[syntax-error]: Unicode escape requires 4 hex digits: \XXXX ╭▸ 8 │ select U&'\006'; - ╰╴ ━━━━━━━━ + ╰╴ ━━━━ error[syntax-error]: Unicode escape requires 6 hex digits: \+XXXXXX ╭▸ 9 │ select U&'\+0061'; - ╰╴ ━━━━━━━━━━ + ╰╴ ━━━━━━ error[syntax-error]: Unicode escape requires 4 hex digits: \XXXX ╭▸ 10 │ select U&'wrong: \06' UESCAPE '\'; - ╰╴ ━━━━━━━━━━━━━━ + ╰╴ ━━━ error[syntax-error]: Unicode escape requires 4 hex digits: !XXXX ╭▸ 11 │ select U&'wrong: !061' UESCAPE '!'; - ╰╴ ━━━━━━━━━━━━━━━ + ╰╴ ━━━━ +error[syntax-error]: Unicode escape requires 4 hex digits: \XXXX + ╭▸ +12 │ select U&'many: \061 \+0061 \zzzz'; + ╰╴ ━━━━ +error[syntax-error]: Unicode escape requires 6 hex digits: \+XXXXXX + ╭▸ +12 │ select U&'many: \061 \+0061 \zzzz'; + ╰╴ ━━━━━━ +error[syntax-error]: Invalid Unicode escape sequence + ╭▸ +12 │ select U&'many: \061 \+0061 \zzzz'; + ╰╴ ━━ +error[syntax-error]: Invalid Unicode surrogate pair + ╭▸ +13 │ select U&'\D800\\'; + ╰╴ ━━━━━ +error[syntax-error]: Invalid Unicode surrogate pair + ╭▸ +14 │ select U&'\D800\D801\DC00'; + ╰╴ ━━━━━━━━━━ error[syntax-error]: Invalid Unicode escape sequence ╭▸ -12 │ select U&' \'; - ╰╴ ━━━━━━ +15 │ select U&' \'; + ╰╴ ━ error[syntax-error]: Invalid unicode escape character ╭▸ -13 │ select U&'error' UESCAPE ''; +16 │ select U&'error' UESCAPE ''; ╰╴ ━━ error[syntax-error]: Invalid unicode escape character ╭▸ -14 │ select U&'error' UESCAPE ' '; +17 │ select U&'error' UESCAPE ' '; ╰╴ ━━━ error[syntax-error]: Invalid unicode escape character ╭▸ -15 │ select U&'error' UESCAPE '+'; +18 │ select U&'error' UESCAPE '+'; ╰╴ ━━━ error[syntax-error]: Invalid unicode escape character ╭▸ -16 │ select U&'error' UESCAPE 'A'; +19 │ select U&'error' UESCAPE 'A'; ╰╴ ━━━ error[syntax-error]: Invalid unicode escape character ╭▸ -17 │ select U&'error' UESCAPE 'é'; +20 │ select U&'error' UESCAPE 'é'; ╰╴ ━━━ error[syntax-error]: Invalid unicode escape character ╭▸ -18 │ select U&'error' UESCAPE 'foo'; +21 │ select U&'error' UESCAPE 'foo'; ╰╴ ━━━━━ diff --git a/crates/squawk_syntax/src/validation.rs b/crates/squawk_syntax/src/validation.rs index 0ef1d63a..c619d0a2 100644 --- a/crates/squawk_syntax/src/validation.rs +++ b/crates/squawk_syntax/src/validation.rs @@ -5,7 +5,7 @@ //! A failed validation emits a diagnostic. use std::fmt; -use std::ops::RangeInclusive; +use std::ops::{Range, RangeInclusive}; use crate::ast::AstNode; use crate::{SyntaxNode, SyntaxToken, ast, match_ast, syntax_error::SyntaxError}; @@ -33,9 +33,8 @@ pub(crate) fn validate(root: &SyntaxNode, errors: &mut Vec) { for element in root.descendants_with_tokens() { if let Some(token) = element.into_token() && token.kind() == IDENT - && let Some(err) = validate_unicode_esc_ident(&token) { - errors.push(err); + validate_unicode_esc_ident(&token, errors); } } } @@ -175,12 +174,10 @@ fn validate_literal(lit: ast::Literal, acc: &mut Vec) { } } - if let Some(err) = validate_unicode_esc_string(&lit) { - acc.push(err); - } + validate_unicode_esc_string(&lit, acc); } -fn validate_unicode_esc_string(lit: &ast::Literal) -> Option { +fn validate_unicode_esc_string(lit: &ast::Literal, acc: &mut Vec) { let mut unicode_esc = None; let mut seen_uescape = false; let mut escape_char = '\\'; @@ -195,10 +192,11 @@ fn validate_unicode_esc_string(lit: &ast::Literal) -> Option { escape_char = match uescape_char(&token) { Some(ch) => ch, None => { - return Some(SyntaxError::new( + acc.push(SyntaxError::new( "Invalid unicode escape character", token.text_range(), )); + return; } }; break; @@ -206,22 +204,35 @@ fn validate_unicode_esc_string(lit: &ast::Literal) -> Option { _ => (), } } - let token = unicode_esc?; + let Some(token) = unicode_esc else { + return; + }; let text = token.text(); - let inside = text + let Some(inside) = text .strip_prefix("U&'") .or_else(|| text.strip_prefix("u&'")) - .and_then(|s| s.strip_suffix('\''))?; - let err = check_unicode_esc_str(inside, escape_char)?; - Some(SyntaxError::new(err.to_string(), token.text_range())) + .and_then(|s| s.strip_suffix('\'')) + else { + return; + }; + let inside_start = token.text_range().start() + TextSize::new(3); + check_unicode_esc_str(inside, escape_char, |range, err| { + acc.push(SyntaxError::new( + err.to_string(), + offset_range(inside_start, range), + )); + }); } -fn validate_unicode_esc_ident(token: &SyntaxToken) -> Option { +fn validate_unicode_esc_ident(token: &SyntaxToken, acc: &mut Vec) { let text = token.text(); - let inside = text + let Some(inside) = text .strip_prefix("U&\"") .or_else(|| text.strip_prefix("u&\"")) - .and_then(|s| s.strip_suffix('"'))?; + .and_then(|s| s.strip_suffix('"')) + else { + return; + }; let mut escape_char = '\\'; let mut seen_uescape = false; @@ -235,10 +246,11 @@ fn validate_unicode_esc_ident(token: &SyntaxToken) -> Option { escape_char = match uescape_char(string_token) { Some(ch) => ch, None => { - return Some(SyntaxError::new( + acc.push(SyntaxError::new( "Invalid unicode escape character", string_token.text_range(), )); + return; } }; } @@ -249,8 +261,19 @@ fn validate_unicode_esc_ident(token: &SyntaxToken) -> Option { next = element.next_sibling_or_token(); } - let err = check_unicode_esc_str(inside, escape_char)?; - Some(SyntaxError::new(err.to_string(), token.text_range())) + let inside_start = token.text_range().start() + TextSize::new(3); + check_unicode_esc_str(inside, escape_char, |range, err| { + acc.push(SyntaxError::new( + err.to_string(), + offset_range(inside_start, range), + )); + }); +} + +fn offset_range(start: TextSize, range: Range) -> TextRange { + let begin = start + TextSize::new(range.start as u32); + let end = start + TextSize::new(range.end as u32); + TextRange::new(begin, end) } // https://github.com/postgres/postgres/blob/228a1f9542792c6533ef74c2e7aefad0da1d9a7a/src/backend/parser/parser.c#L350 @@ -320,53 +343,91 @@ impl fmt::Display for UnicodeEscError { } } -fn check_unicode_esc_str(text: &str, escape_char: char) -> Option { +fn check_unicode_esc_str(text: &str, escape_char: char, mut callback: F) +where + F: FnMut(Range, UnicodeEscError), +{ const HIGH_SURROGATE: RangeInclusive = 0xD800..=0xDBFF; const LOW_SURROGATE: RangeInclusive = 0xDC00..=0xDFFF; const MAX_CODEPOINT: u32 = 0x10FFFF; - let mut chars = text.chars().peekable(); - let mut high_surrogate: Option = None; + let mut chars = text.char_indices().peekable(); + let mut high_surrogate: Option> = None; - while let Some(c) = chars.next() { + while let Some((escape_start, c)) = chars.next() { if c != escape_char { continue; } let kind = match chars.peek() { - Some(&c) if c == escape_char => { + Some(&(_, c)) if c == escape_char => { chars.next(); - high_surrogate = None; + if let Some(hi_range) = high_surrogate.take() { + callback(hi_range, UnicodeEscError::InvalidSurrogatePair); + } continue; } - Some('+') => { + Some(&(_, '+')) => { chars.next(); UnicodeEscapeKind::Extended } - Some(c) if c.is_ascii_hexdigit() => UnicodeEscapeKind::Short, - _ => return Some(UnicodeEscError::InvalidEscape), + Some(&(_, c)) if c.is_ascii_hexdigit() => UnicodeEscapeKind::Short, + _ => { + let end = chars + .peek() + .map(|&(i, c)| i + c.len_utf8()) + .unwrap_or(text.len()); + callback(escape_start..end, UnicodeEscError::InvalidEscape); + continue; + } }; let mut codepoint: u32 = 0; + let mut got_all = true; + let mut last_end = chars.peek().map(|&(i, _)| i).unwrap_or(text.len()); for _ in 0..kind.count() { let radix = 16; - let Some(d) = chars.peek().and_then(|c| c.to_digit(radix)) else { - return Some(UnicodeEscError::RequiresHexDigits { kind, escape_char }); + let Some(&(i, ch)) = chars.peek() else { + got_all = false; + break; + }; + let Some(d) = ch.to_digit(radix) else { + got_all = false; + break; }; chars.next(); codepoint = codepoint * radix + d; + last_end = i + ch.len_utf8(); + } + if !got_all { + callback( + escape_start..last_end, + UnicodeEscError::RequiresHexDigits { kind, escape_char }, + ); + high_surrogate = None; + continue; } - if high_surrogate.take().is_some() { - if !LOW_SURROGATE.contains(&codepoint) { - return Some(UnicodeEscError::InvalidSurrogatePair); + if let Some(hi_range) = high_surrogate.take() { + if LOW_SURROGATE.contains(&codepoint) { + continue; } - } else if codepoint > MAX_CODEPOINT { - return Some(UnicodeEscError::OutOfRange); + callback( + hi_range.start..last_end, + UnicodeEscError::InvalidSurrogatePair, + ); + } + if codepoint > MAX_CODEPOINT { + callback(escape_start..last_end, UnicodeEscError::OutOfRange); } else if HIGH_SURROGATE.contains(&codepoint) { - high_surrogate = Some(codepoint); + high_surrogate = Some(escape_start..last_end); } else if LOW_SURROGATE.contains(&codepoint) { - return Some(UnicodeEscError::InvalidSurrogatePair); + callback( + escape_start..last_end, + UnicodeEscError::InvalidSurrogatePair, + ); } } - high_surrogate.map(|_| UnicodeEscError::InvalidSurrogatePair) + if let Some(range) = high_surrogate { + callback(range, UnicodeEscError::InvalidSurrogatePair); + } } fn validate_join_expr(join_expr: ast::JoinExpr, acc: &mut Vec) { diff --git a/crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql b/crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql index 572d1322..104045a1 100644 --- a/crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql +++ b/crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql @@ -9,6 +9,8 @@ select U&"\006"; select U&"\+0061"; select U&"wrong: \06" UESCAPE '\'; select U&"wrong: !061" UESCAPE '!'; +select U&"many: \061 \+0061 \zzzz"; +select U&"\D800\D801\DC00"; select U&" \"; select U&"error" UESCAPE ''; select U&"error" UESCAPE ' '; diff --git a/crates/squawk_syntax/test_data/validation/unicode_escape_string.sql b/crates/squawk_syntax/test_data/validation/unicode_escape_string.sql index 58966f9e..4cb35317 100644 --- a/crates/squawk_syntax/test_data/validation/unicode_escape_string.sql +++ b/crates/squawk_syntax/test_data/validation/unicode_escape_string.sql @@ -9,6 +9,9 @@ select U&'\006'; select U&'\+0061'; select U&'wrong: \06' UESCAPE '\'; select U&'wrong: !061' UESCAPE '!'; +select U&'many: \061 \+0061 \zzzz'; +select U&'\D800\\'; +select U&'\D800\D801\DC00'; select U&' \'; select U&'error' UESCAPE ''; select U&'error' UESCAPE ' '; diff --git a/crates/xtask/src/sync_pg.rs b/crates/xtask/src/sync_pg.rs index 32f2f072..7223a18d 100644 --- a/crates/xtask/src/sync_pg.rs +++ b/crates/xtask/src/sync_pg.rs @@ -102,6 +102,7 @@ const IGNORED_LINES: &[&str] = &[ r#"SELECT U&'wrong: \db99\0061';"#, r#"SELECT U&'wrong: \+00db99\+000061';"#, r#"SELECT U&'wrong: \+2FFFFF';"#, + r#"SELECT U&'wrong: \db99\\';"#, r#"SELECT x'0 ';"#, r#"SELECT x' 0';"#, r#"SELECT b' 0';"#, diff --git a/postgres/regression_suite/strings.sql b/postgres/regression_suite/strings.sql index bef269f6..256b1748 100644 --- a/postgres/regression_suite/strings.sql +++ b/postgres/regression_suite/strings.sql @@ -31,7 +31,7 @@ SELECT 'tricky' AS U&"\" UESCAPE '!'; -- SELECT U&'wrong: \db99'; -- SELECT U&'wrong: \db99xy'; -SELECT U&'wrong: \db99\\'; +-- SELECT U&'wrong: \db99\\'; -- SELECT U&'wrong: \db99\0061'; -- SELECT U&'wrong: \+00db99\+000061'; -- SELECT U&'wrong: \+2FFFFF';