Skip to content

Commit 5c16fa2

Browse files
authored
parser: validate escape sequences for unicode escape ident (#1123)
1 parent 7c294c6 commit 5c16fa2

3 files changed

Lines changed: 197 additions & 10 deletions

File tree

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
---
2+
source: crates/squawk_syntax/src/test.rs
3+
input_file: crates/squawk_syntax/test_data/validation/unicode_escape_ident.sql
4+
---
5+
SOURCE_FILE@0..243
6+
COMMENT@0..5 "-- ok"
7+
WHITESPACE@5..6 "\n"
8+
SELECT@6..32
9+
SELECT_CLAUSE@6..32
10+
SELECT_KW@6..12 "select"
11+
WHITESPACE@12..13 " "
12+
TARGET_LIST@13..32
13+
TARGET@13..32
14+
NAME_REF@13..32
15+
IDENT@13..32 "U&\"d\\0061t\\+000061\""
16+
SEMICOLON@32..33 ";"
17+
WHITESPACE@33..34 "\n"
18+
SELECT@34..47
19+
SELECT_CLAUSE@34..47
20+
SELECT_KW@34..40 "select"
21+
WHITESPACE@40..41 " "
22+
TARGET_LIST@41..47
23+
TARGET@41..47
24+
NAME_REF@41..47
25+
IDENT@41..47 "U&\"\\\\\""
26+
SEMICOLON@47..48 ";"
27+
WHITESPACE@48..49 "\n"
28+
SELECT@49..81
29+
SELECT_CLAUSE@49..81
30+
SELECT_KW@49..55 "select"
31+
WHITESPACE@55..56 " "
32+
TARGET_LIST@56..81
33+
TARGET@56..81
34+
NAME_REF@56..81
35+
IDENT@56..69 "U&\"ok: !0061\""
36+
WHITESPACE@69..70 " "
37+
UESCAPE_KW@70..77 "UESCAPE"
38+
WHITESPACE@77..78 " "
39+
STRING@78..81 "'!'"
40+
SEMICOLON@81..82 ";"
41+
WHITESPACE@82..83 "\n"
42+
SELECT@83..108
43+
SELECT_CLAUSE@83..108
44+
SELECT_KW@83..89 "select"
45+
WHITESPACE@89..90 " "
46+
TARGET_LIST@90..108
47+
TARGET@90..108
48+
NAME_REF@90..108
49+
IDENT@90..96 "U&\" \\\""
50+
WHITESPACE@96..97 " "
51+
UESCAPE_KW@97..104 "UESCAPE"
52+
WHITESPACE@104..105 " "
53+
STRING@105..108 "'!'"
54+
SEMICOLON@108..109 ";"
55+
WHITESPACE@109..111 "\n\n"
56+
COMMENT@111..120 "-- errors"
57+
WHITESPACE@120..121 "\n"
58+
SELECT@121..136
59+
SELECT_CLAUSE@121..136
60+
SELECT_KW@121..127 "select"
61+
WHITESPACE@127..128 " "
62+
TARGET_LIST@128..136
63+
TARGET@128..136
64+
NAME_REF@128..136
65+
IDENT@128..136 "U&\"\\006\""
66+
SEMICOLON@136..137 ";"
67+
WHITESPACE@137..138 "\n"
68+
SELECT@138..155
69+
SELECT_CLAUSE@138..155
70+
SELECT_KW@138..144 "select"
71+
WHITESPACE@144..145 " "
72+
TARGET_LIST@145..155
73+
TARGET@145..155
74+
NAME_REF@145..155
75+
IDENT@145..155 "U&\"\\+0061\""
76+
SEMICOLON@155..156 ";"
77+
WHITESPACE@156..157 "\n"
78+
SELECT@157..190
79+
SELECT_CLAUSE@157..190
80+
SELECT_KW@157..163 "select"
81+
WHITESPACE@163..164 " "
82+
TARGET_LIST@164..190
83+
TARGET@164..190
84+
NAME_REF@164..190
85+
IDENT@164..178 "U&\"wrong: \\06\""
86+
WHITESPACE@178..179 " "
87+
UESCAPE_KW@179..186 "UESCAPE"
88+
WHITESPACE@186..187 " "
89+
STRING@187..190 "'\\'"
90+
SEMICOLON@190..191 ";"
91+
WHITESPACE@191..192 "\n"
92+
SELECT@192..226
93+
SELECT_CLAUSE@192..226
94+
SELECT_KW@192..198 "select"
95+
WHITESPACE@198..199 " "
96+
TARGET_LIST@199..226
97+
TARGET@199..226
98+
NAME_REF@199..226
99+
IDENT@199..214 "U&\"wrong: !061\""
100+
WHITESPACE@214..215 " "
101+
UESCAPE_KW@215..222 "UESCAPE"
102+
WHITESPACE@222..223 " "
103+
STRING@223..226 "'!'"
104+
SEMICOLON@226..227 ";"
105+
WHITESPACE@227..228 "\n"
106+
SELECT@228..241
107+
SELECT_CLAUSE@228..241
108+
SELECT_KW@228..234 "select"
109+
WHITESPACE@234..235 " "
110+
TARGET_LIST@235..241
111+
TARGET@235..241
112+
NAME_REF@235..241
113+
IDENT@235..241 "U&\" \\\""
114+
SEMICOLON@241..242 ";"
115+
WHITESPACE@242..243 "\n"
116+
117+
error[syntax-error]: Unicode escape requires 4 hex digits: \XXXX
118+
╭▸
119+
8select U&"\006";
120+
╰╴ ━━━━━━━━
121+
error[syntax-error]: Unicode escape requires 6 hex digits: \+XXXXXX
122+
╭▸
123+
9select U&"\+0061";
124+
╰╴ ━━━━━━━━━━
125+
error[syntax-error]: Unicode escape requires 4 hex digits: \XXXX
126+
╭▸
127+
10select U&"wrong: \06" UESCAPE '\';
128+
╰╴ ━━━━━━━━━━━━━━
129+
error[syntax-error]: Unicode escape requires 4 hex digits: !XXXX
130+
╭▸
131+
11select U&"wrong: !061" UESCAPE '!';
132+
╰╴ ━━━━━━━━━━━━━━━
133+
error[syntax-error]: Invalid Unicode escape sequence
134+
╭▸
135+
12select U&" \";
136+
╰╴ ━━━━━━

crates/squawk_syntax/src/validation.rs

Lines changed: 49 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use std::fmt;
88
use std::ops::RangeInclusive;
99

1010
use crate::ast::AstNode;
11-
use crate::{SyntaxNode, ast, match_ast, syntax_error::SyntaxError};
11+
use crate::{SyntaxNode, SyntaxToken, ast, match_ast, syntax_error::SyntaxError};
1212
use rowan::{TextRange, TextSize};
1313
use squawk_parser::SyntaxKind::*;
1414
pub(crate) fn validate(root: &SyntaxNode, errors: &mut Vec<SyntaxError>) {
@@ -30,6 +30,14 @@ pub(crate) fn validate(root: &SyntaxNode, errors: &mut Vec<SyntaxError>) {
3030
}
3131
}
3232
}
33+
for element in root.descendants_with_tokens() {
34+
if let Some(token) = element.into_token()
35+
&& token.kind() == IDENT
36+
&& let Some(err) = validate_unicode_esc_ident(&token)
37+
{
38+
errors.push(err);
39+
}
40+
}
3341
}
3442

3543
fn validate_select(it: ast::Select, acc: &mut Vec<SyntaxError>) {
@@ -184,15 +192,7 @@ fn validate_unicode_esc_string(lit: &ast::Literal) -> Option<SyntaxError> {
184192
UNICODE_ESC_STRING => unicode_esc = Some(token),
185193
UESCAPE_KW => seen_uescape = true,
186194
STRING if seen_uescape => {
187-
let text = token.text();
188-
let inner = text
189-
.strip_prefix('\'')
190-
.and_then(|s| s.strip_suffix('\''))
191-
.unwrap_or("");
192-
let mut chars = inner.chars();
193-
if let (Some(c), None) = (chars.next(), chars.next()) {
194-
escape_char = c;
195-
}
195+
escape_char = uescape_char(&token).unwrap_or(escape_char);
196196
break;
197197
}
198198
_ => (),
@@ -208,6 +208,45 @@ fn validate_unicode_esc_string(lit: &ast::Literal) -> Option<SyntaxError> {
208208
Some(SyntaxError::new(err.to_string(), token.text_range()))
209209
}
210210

211+
fn validate_unicode_esc_ident(token: &SyntaxToken) -> Option<SyntaxError> {
212+
let text = token.text();
213+
let inside = text
214+
.strip_prefix("U&\"")
215+
.or_else(|| text.strip_prefix("u&\""))
216+
.and_then(|s| s.strip_suffix('"'))?;
217+
218+
let mut escape_char = '\\';
219+
let mut seen_uescape = false;
220+
let mut next = token.next_sibling_or_token();
221+
while let Some(element) = next {
222+
match element.kind() {
223+
WHITESPACE | COMMENT => (),
224+
UESCAPE_KW => seen_uescape = true,
225+
STRING if seen_uescape => {
226+
if let Some(string_token) = element.as_token() {
227+
escape_char = uescape_char(string_token).unwrap_or(escape_char);
228+
}
229+
break;
230+
}
231+
_ => break,
232+
}
233+
next = element.next_sibling_or_token();
234+
}
235+
236+
let err = check_unicode_esc_str(inside, escape_char)?;
237+
Some(SyntaxError::new(err.to_string(), token.text_range()))
238+
}
239+
240+
fn uescape_char(string_token: &SyntaxToken) -> Option<char> {
241+
let text = string_token.text();
242+
let inner = text.strip_prefix('\'')?.strip_suffix('\'')?;
243+
let mut chars = inner.chars();
244+
match (chars.next(), chars.next()) {
245+
(Some(c), None) => Some(c),
246+
_ => None,
247+
}
248+
}
249+
211250
enum UnicodeEscapeKind {
212251
Short,
213252
Extended,
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
-- ok
2+
select U&"d\0061t\+000061";
3+
select U&"\\";
4+
select U&"ok: !0061" UESCAPE '!';
5+
select U&" \" UESCAPE '!';
6+
7+
-- errors
8+
select U&"\006";
9+
select U&"\+0061";
10+
select U&"wrong: \06" UESCAPE '\';
11+
select U&"wrong: !061" UESCAPE '!';
12+
select U&" \";

0 commit comments

Comments
 (0)