Skip to content

Commit cd4a0a6

Browse files
authored
parser: validated unicode uescape character (#1125)
1 parent 5c16fa2 commit cd4a0a6

7 files changed

Lines changed: 266 additions & 10 deletions

File tree

crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_ident_validation.snap

Lines changed: 109 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
source: crates/squawk_syntax/src/test.rs
33
input_file: crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql
44
---
5-
SOURCE_FILE@0..243
5+
SOURCE_FILE@0..425
66
COMMENT@0..5 "-- ok"
77
WHITESPACE@5..6 "\n"
88
SELECT@6..32
@@ -113,6 +113,90 @@ SOURCE_FILE@0..243
113113
IDENT@235..241 "U&\" \\\""
114114
SEMICOLON@241..242 ";"
115115
WHITESPACE@242..243 "\n"
116+
SELECT@243..270
117+
SELECT_CLAUSE@243..270
118+
SELECT_KW@243..249 "select"
119+
WHITESPACE@249..250 " "
120+
TARGET_LIST@250..270
121+
TARGET@250..270
122+
NAME_REF@250..270
123+
IDENT@250..259 "U&\"error\""
124+
WHITESPACE@259..260 " "
125+
UESCAPE_KW@260..267 "UESCAPE"
126+
WHITESPACE@267..268 " "
127+
STRING@268..270 "''"
128+
SEMICOLON@270..271 ";"
129+
WHITESPACE@271..272 "\n"
130+
SELECT@272..300
131+
SELECT_CLAUSE@272..300
132+
SELECT_KW@272..278 "select"
133+
WHITESPACE@278..279 " "
134+
TARGET_LIST@279..300
135+
TARGET@279..300
136+
NAME_REF@279..300
137+
IDENT@279..288 "U&\"error\""
138+
WHITESPACE@288..289 " "
139+
UESCAPE_KW@289..296 "UESCAPE"
140+
WHITESPACE@296..297 " "
141+
STRING@297..300 "' '"
142+
SEMICOLON@300..301 ";"
143+
WHITESPACE@301..302 "\n"
144+
SELECT@302..330
145+
SELECT_CLAUSE@302..330
146+
SELECT_KW@302..308 "select"
147+
WHITESPACE@308..309 " "
148+
TARGET_LIST@309..330
149+
TARGET@309..330
150+
NAME_REF@309..330
151+
IDENT@309..318 "U&\"error\""
152+
WHITESPACE@318..319 " "
153+
UESCAPE_KW@319..326 "UESCAPE"
154+
WHITESPACE@326..327 " "
155+
STRING@327..330 "'+'"
156+
SEMICOLON@330..331 ";"
157+
WHITESPACE@331..332 "\n"
158+
SELECT@332..360
159+
SELECT_CLAUSE@332..360
160+
SELECT_KW@332..338 "select"
161+
WHITESPACE@338..339 " "
162+
TARGET_LIST@339..360
163+
TARGET@339..360
164+
NAME_REF@339..360
165+
IDENT@339..348 "U&\"error\""
166+
WHITESPACE@348..349 " "
167+
UESCAPE_KW@349..356 "UESCAPE"
168+
WHITESPACE@356..357 " "
169+
STRING@357..360 "'A'"
170+
SEMICOLON@360..361 ";"
171+
WHITESPACE@361..362 "\n"
172+
SELECT@362..391
173+
SELECT_CLAUSE@362..391
174+
SELECT_KW@362..368 "select"
175+
WHITESPACE@368..369 " "
176+
TARGET_LIST@369..391
177+
TARGET@369..391
178+
NAME_REF@369..391
179+
IDENT@369..378 "U&\"error\""
180+
WHITESPACE@378..379 " "
181+
UESCAPE_KW@379..386 "UESCAPE"
182+
WHITESPACE@386..387 " "
183+
STRING@387..391 "'é'"
184+
SEMICOLON@391..392 ";"
185+
WHITESPACE@392..393 "\n"
186+
SELECT@393..423
187+
SELECT_CLAUSE@393..423
188+
SELECT_KW@393..399 "select"
189+
WHITESPACE@399..400 " "
190+
TARGET_LIST@400..423
191+
TARGET@400..423
192+
NAME_REF@400..423
193+
IDENT@400..409 "U&\"error\""
194+
WHITESPACE@409..410 " "
195+
UESCAPE_KW@410..417 "UESCAPE"
196+
WHITESPACE@417..418 " "
197+
STRING@418..423 "'foo'"
198+
SEMICOLON@423..424 ";"
199+
WHITESPACE@424..425 "\n"
116200

117201
error[syntax-error]: Unicode escape requires 4 hex digits: \XXXX
118202
╭▸
@@ -134,3 +218,27 @@ error[syntax-error]: Invalid Unicode escape sequence
134218
╭▸
135219
12select U&" \";
136220
╰╴ ━━━━━━
221+
error[syntax-error]: Invalid unicode escape character
222+
╭▸
223+
13select U&"error" UESCAPE '';
224+
╰╴ ━━
225+
error[syntax-error]: Invalid unicode escape character
226+
╭▸
227+
14select U&"error" UESCAPE ' ';
228+
╰╴ ━━━
229+
error[syntax-error]: Invalid unicode escape character
230+
╭▸
231+
15select U&"error" UESCAPE '+';
232+
╰╴ ━━━
233+
error[syntax-error]: Invalid unicode escape character
234+
╭▸
235+
16select U&"error" UESCAPE 'A';
236+
╰╴ ━━━
237+
error[syntax-error]: Invalid unicode escape character
238+
╭▸
239+
17select U&"error" UESCAPE 'é';
240+
╰╴ ━━━
241+
error[syntax-error]: Invalid unicode escape character
242+
╭▸
243+
18select U&"error" UESCAPE 'foo';
244+
╰╴ ━━━━━

crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_string_validation.snap

Lines changed: 109 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
source: crates/squawk_syntax/src/test.rs
33
input_file: crates/squawk_syntax/test_data/validation/unicode_escape_string.sql
44
---
5-
SOURCE_FILE@0..241
5+
SOURCE_FILE@0..423
66
COMMENT@0..5 "-- ok"
77
WHITESPACE@5..6 "\n"
88
SELECT@6..30
@@ -113,6 +113,90 @@ SOURCE_FILE@0..241
113113
UNICODE_ESC_STRING@233..239 "U&' \\'"
114114
SEMICOLON@239..240 ";"
115115
WHITESPACE@240..241 "\n"
116+
SELECT@241..268
117+
SELECT_CLAUSE@241..268
118+
SELECT_KW@241..247 "select"
119+
WHITESPACE@247..248 " "
120+
TARGET_LIST@248..268
121+
TARGET@248..268
122+
LITERAL@248..268
123+
UNICODE_ESC_STRING@248..257 "U&'error'"
124+
WHITESPACE@257..258 " "
125+
UESCAPE_KW@258..265 "UESCAPE"
126+
WHITESPACE@265..266 " "
127+
STRING@266..268 "''"
128+
SEMICOLON@268..269 ";"
129+
WHITESPACE@269..270 "\n"
130+
SELECT@270..298
131+
SELECT_CLAUSE@270..298
132+
SELECT_KW@270..276 "select"
133+
WHITESPACE@276..277 " "
134+
TARGET_LIST@277..298
135+
TARGET@277..298
136+
LITERAL@277..298
137+
UNICODE_ESC_STRING@277..286 "U&'error'"
138+
WHITESPACE@286..287 " "
139+
UESCAPE_KW@287..294 "UESCAPE"
140+
WHITESPACE@294..295 " "
141+
STRING@295..298 "' '"
142+
SEMICOLON@298..299 ";"
143+
WHITESPACE@299..300 "\n"
144+
SELECT@300..328
145+
SELECT_CLAUSE@300..328
146+
SELECT_KW@300..306 "select"
147+
WHITESPACE@306..307 " "
148+
TARGET_LIST@307..328
149+
TARGET@307..328
150+
LITERAL@307..328
151+
UNICODE_ESC_STRING@307..316 "U&'error'"
152+
WHITESPACE@316..317 " "
153+
UESCAPE_KW@317..324 "UESCAPE"
154+
WHITESPACE@324..325 " "
155+
STRING@325..328 "'+'"
156+
SEMICOLON@328..329 ";"
157+
WHITESPACE@329..330 "\n"
158+
SELECT@330..358
159+
SELECT_CLAUSE@330..358
160+
SELECT_KW@330..336 "select"
161+
WHITESPACE@336..337 " "
162+
TARGET_LIST@337..358
163+
TARGET@337..358
164+
LITERAL@337..358
165+
UNICODE_ESC_STRING@337..346 "U&'error'"
166+
WHITESPACE@346..347 " "
167+
UESCAPE_KW@347..354 "UESCAPE"
168+
WHITESPACE@354..355 " "
169+
STRING@355..358 "'A'"
170+
SEMICOLON@358..359 ";"
171+
WHITESPACE@359..360 "\n"
172+
SELECT@360..389
173+
SELECT_CLAUSE@360..389
174+
SELECT_KW@360..366 "select"
175+
WHITESPACE@366..367 " "
176+
TARGET_LIST@367..389
177+
TARGET@367..389
178+
LITERAL@367..389
179+
UNICODE_ESC_STRING@367..376 "U&'error'"
180+
WHITESPACE@376..377 " "
181+
UESCAPE_KW@377..384 "UESCAPE"
182+
WHITESPACE@384..385 " "
183+
STRING@385..389 "'é'"
184+
SEMICOLON@389..390 ";"
185+
WHITESPACE@390..391 "\n"
186+
SELECT@391..421
187+
SELECT_CLAUSE@391..421
188+
SELECT_KW@391..397 "select"
189+
WHITESPACE@397..398 " "
190+
TARGET_LIST@398..421
191+
TARGET@398..421
192+
LITERAL@398..421
193+
UNICODE_ESC_STRING@398..407 "U&'error'"
194+
WHITESPACE@407..408 " "
195+
UESCAPE_KW@408..415 "UESCAPE"
196+
WHITESPACE@415..416 " "
197+
STRING@416..421 "'foo'"
198+
SEMICOLON@421..422 ";"
199+
WHITESPACE@422..423 "\n"
116200

117201
error[syntax-error]: Unicode escape requires 4 hex digits: \XXXX
118202
╭▸
@@ -134,3 +218,27 @@ error[syntax-error]: Invalid Unicode escape sequence
134218
╭▸
135219
12select U&' \';
136220
╰╴ ━━━━━━
221+
error[syntax-error]: Invalid unicode escape character
222+
╭▸
223+
13select U&'error' UESCAPE '';
224+
╰╴ ━━
225+
error[syntax-error]: Invalid unicode escape character
226+
╭▸
227+
14select U&'error' UESCAPE ' ';
228+
╰╴ ━━━
229+
error[syntax-error]: Invalid unicode escape character
230+
╭▸
231+
15select U&'error' UESCAPE '+';
232+
╰╴ ━━━
233+
error[syntax-error]: Invalid unicode escape character
234+
╭▸
235+
16select U&'error' UESCAPE 'A';
236+
╰╴ ━━━
237+
error[syntax-error]: Invalid unicode escape character
238+
╭▸
239+
17select U&'error' UESCAPE 'é';
240+
╰╴ ━━━
241+
error[syntax-error]: Invalid unicode escape character
242+
╭▸
243+
18select U&'error' UESCAPE 'foo';
244+
╰╴ ━━━━━

crates/squawk_syntax/src/validation.rs

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,15 @@ fn validate_unicode_esc_string(lit: &ast::Literal) -> Option<SyntaxError> {
192192
UNICODE_ESC_STRING => unicode_esc = Some(token),
193193
UESCAPE_KW => seen_uescape = true,
194194
STRING if seen_uescape => {
195-
escape_char = uescape_char(&token).unwrap_or(escape_char);
195+
escape_char = match uescape_char(&token) {
196+
Some(ch) => ch,
197+
None => {
198+
return Some(SyntaxError::new(
199+
"Invalid unicode escape character",
200+
token.text_range(),
201+
));
202+
}
203+
};
196204
break;
197205
}
198206
_ => (),
@@ -224,7 +232,15 @@ fn validate_unicode_esc_ident(token: &SyntaxToken) -> Option<SyntaxError> {
224232
UESCAPE_KW => seen_uescape = true,
225233
STRING if seen_uescape => {
226234
if let Some(string_token) = element.as_token() {
227-
escape_char = uescape_char(string_token).unwrap_or(escape_char);
235+
escape_char = match uescape_char(string_token) {
236+
Some(ch) => ch,
237+
None => {
238+
return Some(SyntaxError::new(
239+
"Invalid unicode escape character",
240+
string_token.text_range(),
241+
));
242+
}
243+
};
228244
}
229245
break;
230246
}
@@ -237,14 +253,25 @@ fn validate_unicode_esc_ident(token: &SyntaxToken) -> Option<SyntaxError> {
237253
Some(SyntaxError::new(err.to_string(), token.text_range()))
238254
}
239255

256+
/// Reports whether `byte` is acceptable as the escape character given in a
/// `UESCAPE '<char>'` clause.
///
/// Rejected bytes:
/// - ASCII hex digits (`0-9`, `a-f`, `A-F`)
/// - `+`
/// - single and double quotes
/// - whitespace: space, `\t`, `\n`, `\r`, vertical tab (0x0B), form feed (0x0C)
///
/// Every other byte is accepted.
///
/// Reference:
/// https://github.com/postgres/postgres/blob/228a1f9542792c6533ef74c2e7aefad0da1d9a7a/src/backend/parser/parser.c#L350
const fn is_valid_uescape_char(byte: u8) -> bool {
    match byte {
        // Hex digits
        b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F' => false,
        // Plus sign and both quote characters.
        b'+' | b'\'' | b'"' => false,
        // Whitespace; Rust has no `\v` / `\f` escapes, hence the raw bytes.
        b' ' | b'\t' | b'\n' | b'\r' | 0x0B | 0x0C => false,
        _ => true,
    }
}
267+
240268
fn uescape_char(string_token: &SyntaxToken) -> Option<char> {
241269
let text = string_token.text();
242270
let inner = text.strip_prefix('\'')?.strip_suffix('\'')?;
243-
let mut chars = inner.chars();
244-
match (chars.next(), chars.next()) {
245-
(Some(c), None) => Some(c),
246-
_ => None,
247-
}
271+
let &[byte] = inner.as_bytes() else {
272+
return None;
273+
};
274+
is_valid_uescape_char(byte).then(|| char::from(byte))
248275
}
249276

250277
enum UnicodeEscapeKind {

crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,9 @@ select U&"\+0061";
1010
select U&"wrong: \06" UESCAPE '\';
1111
select U&"wrong: !061" UESCAPE '!';
1212
select U&" \";
13+
select U&"error" UESCAPE '';
14+
select U&"error" UESCAPE ' ';
15+
select U&"error" UESCAPE '+';
16+
select U&"error" UESCAPE 'A';
17+
select U&"error" UESCAPE 'é';
18+
select U&"error" UESCAPE 'foo';

crates/squawk_syntax/test_data/validation/unicode_escape_string.sql

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,9 @@ select U&'\+0061';
1010
select U&'wrong: \06' UESCAPE '\';
1111
select U&'wrong: !061' UESCAPE '!';
1212
select U&' \';
13+
select U&'error' UESCAPE '';
14+
select U&'error' UESCAPE ' ';
15+
select U&'error' UESCAPE '+';
16+
select U&'error' UESCAPE 'A';
17+
select U&'error' UESCAPE 'é';
18+
select U&'error' UESCAPE 'foo';

crates/xtask/src/sync_pg.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ const IGNORED_LINES: &[&str] = &[
5151
"CREATE SUBSCRIPTION regress_testsub CONNECTION 'foo';",
5252
"CREATE SUBSCRIPTION regress_testsub PUBLICATION foo;",
5353
"SELECT U&'wrong: +0061' UESCAPE +;",
54+
"SELECT U&'wrong: +0061' UESCAPE '+';",
5455
"CREATE STATISTICS tst;",
5556
"CREATE STATISTICS tst ON a, b;",
5657
"CREATE STATISTICS tst ON a FROM (VALUES (x)) AS foo;",

postgres/regression_suite/strings.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ SELECT 'tricky' AS U&"\" UESCAPE '!';
2727
-- SELECT U&'wrong: \061';
2828
-- SELECT U&'wrong: \+0061';
2929
-- SELECT U&'wrong: +0061' UESCAPE +;
30-
SELECT U&'wrong: +0061' UESCAPE '+';
30+
-- SELECT U&'wrong: +0061' UESCAPE '+';
3131
3232
-- SELECT U&'wrong: \db99';
3333
-- SELECT U&'wrong: \db99xy';

0 commit comments

Comments
 (0)