Skip to content

Commit 02c90e8

Browse files
authored
parser: refactor unicode escape validation (#1129)
1 parent 4d8db71 commit 02c90e8

5 files changed

Lines changed: 242 additions & 145 deletions

File tree

crates/squawk_syntax/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ pub mod quote;
3333
pub mod syntax_error;
3434
mod syntax_node;
3535
mod token_text;
36+
mod unescape;
3637
mod validation;
3738

3839
#[cfg(test)]

crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_ident_validation.snap

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,10 @@ error[syntax-error]: Invalid Unicode surrogate pair
250250
╭▸
251251
13select U&"\D800\D801\DC00";
252252
╰╴ ━━━━━━━━━━
253+
error[syntax-error]: Invalid Unicode surrogate pair
254+
╭▸
255+
13select U&"\D800\D801\DC00";
256+
╰╴ ━━━━━
253257
error[syntax-error]: Invalid Unicode escape sequence
254258
╭▸
255259
14select U&" \";

crates/squawk_syntax/src/snapshots/squawk_syntax__test__unicode_escape_string_validation.snap

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,10 @@ error[syntax-error]: Invalid Unicode surrogate pair
264264
╭▸
265265
14select U&'\D800\D801\DC00';
266266
╰╴ ━━━━━━━━━━
267+
error[syntax-error]: Invalid Unicode surrogate pair
268+
╭▸
269+
14select U&'\D800\D801\DC00';
270+
╰╴ ━━━━━
267271
error[syntax-error]: Invalid Unicode escape sequence
268272
╭▸
269273
15select U&' \';
Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
use std::fmt;
2+
use std::ops::{Range, RangeInclusive};
3+
4+
/// The two spellings of a numeric Unicode escape.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum UnicodeEscapeKind {
    /// Escape character, then `+`, then six hex digits (`\+XXXXXX`).
    Extended,
    /// Escape character followed directly by four hex digits (`\XXXX`).
    Short,
}

impl UnicodeEscapeKind {
    /// Number of hex digits this escape form requires.
    fn count(&self) -> u32 {
        match self {
            Self::Extended => 6,
            Self::Short => 4,
        }
    }
}

/// Reasons a Unicode escape sequence can be rejected.
///
/// `Debug`, `PartialEq` and `Error` are implemented so that
/// `Result<char, UnicodeEscError>` can be unwrapped, compared in assertions,
/// and boxed as a `dyn Error`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum UnicodeEscError {
    /// The escape character was followed by something that cannot start an escape.
    InvalidEscape,
    /// A surrogate code point appeared without its matching partner.
    InvalidSurrogatePair,
    /// The escaped value is above U+10FFFF.
    OutOfRange,
    /// Fewer hex digits were supplied than the escape form requires.
    RequiresHexDigits {
        /// Which escape form was being read.
        kind: UnicodeEscapeKind,
        /// The escape character in effect, echoed back in the message.
        escape_char: char,
    },
}

impl fmt::Display for UnicodeEscError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::InvalidEscape => f.write_str("Invalid Unicode escape sequence"),
            Self::InvalidSurrogatePair => f.write_str("Invalid Unicode surrogate pair"),
            Self::OutOfRange => f.write_str("Unicode escape value out of range"),
            Self::RequiresHexDigits { kind, escape_char } => {
                let required = kind.count();
                // Only the extended form carries a `+` between the escape
                // character and the digits.
                let plus = match kind {
                    UnicodeEscapeKind::Extended => "+",
                    UnicodeEscapeKind::Short => "",
                };
                // One `X` placeholder per required digit, e.g. `\XXXX`.
                let xs = "X".repeat(required as usize);
                write!(
                    f,
                    "Unicode escape requires {required} hex digits: {escape_char}{plus}{xs}"
                )
            }
        }
    }
}

impl std::error::Error for UnicodeEscError {}
49+
50+
pub(crate) fn escape_unicode_esc_str<F>(text: &str, escape_char: char, mut callback: F)
51+
where
52+
F: FnMut(Range<usize>, Result<char, UnicodeEscError>),
53+
{
54+
const HIGH_SURROGATE: RangeInclusive<u32> = 0xD800..=0xDBFF;
55+
const LOW_SURROGATE: RangeInclusive<u32> = 0xDC00..=0xDFFF;
56+
const MAX_CODEPOINT: u32 = 0x10FFFF;
57+
58+
let mut chars = text.char_indices().peekable();
59+
let mut high_surrogate: Option<(Range<usize>, u32)> = None;
60+
61+
while let Some((escape_start, c)) = chars.next() {
62+
if c != escape_char {
63+
if let Some((hi_range, _)) = high_surrogate.take() {
64+
callback(hi_range, Err(UnicodeEscError::InvalidSurrogatePair));
65+
}
66+
callback(escape_start..escape_start + c.len_utf8(), Ok(c));
67+
continue;
68+
}
69+
let kind = match chars.peek() {
70+
Some(&(_, c)) if c == escape_char => {
71+
chars.next();
72+
if let Some((hi_range, _)) = high_surrogate.take() {
73+
callback(hi_range, Err(UnicodeEscError::InvalidSurrogatePair));
74+
}
75+
let end = escape_start + escape_char.len_utf8() * 2;
76+
callback(escape_start..end, Ok(escape_char));
77+
continue;
78+
}
79+
Some(&(_, '+')) => {
80+
chars.next();
81+
UnicodeEscapeKind::Extended
82+
}
83+
Some(&(_, c)) if c.is_ascii_hexdigit() => UnicodeEscapeKind::Short,
84+
_ => {
85+
let end = chars
86+
.next()
87+
.map(|(i, c)| i + c.len_utf8())
88+
.unwrap_or(text.len());
89+
if let Some((hi_range, _)) = high_surrogate.take() {
90+
callback(hi_range, Err(UnicodeEscError::InvalidSurrogatePair));
91+
}
92+
callback(escape_start..end, Err(UnicodeEscError::InvalidEscape));
93+
continue;
94+
}
95+
};
96+
let mut codepoint: u32 = 0;
97+
let mut got_all = true;
98+
let mut last_end = chars.peek().map(|&(i, _)| i).unwrap_or(text.len());
99+
for _ in 0..kind.count() {
100+
let radix = 16;
101+
let Some(&(i, ch)) = chars.peek() else {
102+
got_all = false;
103+
break;
104+
};
105+
let Some(d) = ch.to_digit(radix) else {
106+
got_all = false;
107+
break;
108+
};
109+
chars.next();
110+
codepoint = codepoint * radix + d;
111+
last_end = i + ch.len_utf8();
112+
}
113+
if !got_all {
114+
if let Some((hi_range, _)) = high_surrogate.take() {
115+
callback(hi_range, Err(UnicodeEscError::InvalidSurrogatePair));
116+
}
117+
callback(
118+
escape_start..last_end,
119+
Err(UnicodeEscError::RequiresHexDigits { kind, escape_char }),
120+
);
121+
continue;
122+
}
123+
if let Some((hi_range, hi_cp)) = high_surrogate.take() {
124+
if LOW_SURROGATE.contains(&codepoint) {
125+
let combined = 0x10000 + ((hi_cp - 0xD800) << 10) + (codepoint - 0xDC00);
126+
let ch = char::from_u32(combined).unwrap();
127+
callback(hi_range.start..last_end, Ok(ch));
128+
continue;
129+
}
130+
callback(
131+
hi_range.start..last_end,
132+
Err(UnicodeEscError::InvalidSurrogatePair),
133+
);
134+
continue;
135+
}
136+
if codepoint > MAX_CODEPOINT {
137+
callback(escape_start..last_end, Err(UnicodeEscError::OutOfRange));
138+
} else if HIGH_SURROGATE.contains(&codepoint) {
139+
high_surrogate = Some((escape_start..last_end, codepoint));
140+
} else if LOW_SURROGATE.contains(&codepoint) {
141+
callback(
142+
escape_start..last_end,
143+
Err(UnicodeEscError::InvalidSurrogatePair),
144+
);
145+
} else {
146+
let ch = char::from_u32(codepoint).unwrap();
147+
callback(escape_start..last_end, Ok(ch));
148+
}
149+
}
150+
if let Some((range, _)) = high_surrogate {
151+
callback(range, Err(UnicodeEscError::InvalidSurrogatePair));
152+
}
153+
}
154+
155+
#[cfg(test)]
mod tests {
    use insta::assert_snapshot;

    use super::*;

    /// Runs `escape_unicode_esc_str` and renders every callback invocation as
    /// one `start..end ok <char>` / `start..end err <message>` line.
    fn unicode_escape_events(text: &str, escape_char: char) -> String {
        let mut events = vec![];

        escape_unicode_esc_str(text, escape_char, |range, result| {
            let entry = match result {
                Ok(ch) => format!("{}..{} ok {ch:?}", range.start, range.end),
                Err(err) => format!("{}..{} err {err}", range.start, range.end),
            };
            events.push(entry);
        });

        events.join("\n")
    }

    #[test]
    fn incomplete_unicode_escape_breaks_surrogate_pairing() {
        assert_snapshot!(unicode_escape_events(r"\D800\006\DC00", '\\'), @r"
        0..5 err Invalid Unicode surrogate pair
        5..9 err Unicode escape requires 4 hex digits: \XXXX
        9..14 err Invalid Unicode surrogate pair
        ");
    }

    #[test]
    fn invalid_unicode_escape_breaks_surrogate_pairing() {
        assert_snapshot!(unicode_escape_events(r"\D800\Q\DC00", '\\'), @r"
        0..5 err Invalid Unicode surrogate pair
        5..7 err Invalid Unicode escape sequence
        7..12 err Invalid Unicode surrogate pair
        ");
    }

    #[test]
    fn invalid_unicode_escape_does_not_emit_literal_char() {
        assert_snapshot!(unicode_escape_events(r"\0061\Q\0062", '\\'), @r"
        0..5 ok 'a'
        5..7 err Invalid Unicode escape sequence
        7..12 ok 'b'
        ");
    }

    #[test]
    fn invalid_unicode_escape_works_with_custom_escape_char() {
        assert_snapshot!(unicode_escape_events("!0061!Q!0062", '!'), @r"
        0..5 ok 'a'
        5..7 err Invalid Unicode escape sequence
        7..12 ok 'b'
        ");
    }

    #[test]
    fn valid_unicode_escape_after_high_surrogate_only_emits_error() {
        assert_snapshot!(unicode_escape_events(r"\D800\0061", '\\'), @r"
        0..10 err Invalid Unicode surrogate pair
        ");
    }

    // The tests below cover the success paths and the out-of-range error,
    // none of which the error-path tests above exercise.

    #[test]
    fn valid_surrogate_pair_is_combined_into_one_char() {
        // 0x10000 + ((0xD83D - 0xD800) << 10) + (0xDE00 - 0xDC00) == 0x1F600
        assert_snapshot!(unicode_escape_events(r"\D83D\DE00", '\\'), @r"
        0..10 ok '😀'
        ");
    }

    #[test]
    fn extended_unicode_escape_decodes_six_hex_digits() {
        assert_snapshot!(unicode_escape_events(r"\+01F600", '\\'), @r"
        0..8 ok '😀'
        ");
    }

    #[test]
    fn doubled_custom_escape_char_is_a_literal() {
        assert_snapshot!(unicode_escape_events("a!!b", '!'), @r"
        0..1 ok 'a'
        1..3 ok '!'
        3..4 ok 'b'
        ");
    }

    #[test]
    fn extended_unicode_escape_above_max_codepoint_is_out_of_range() {
        assert_snapshot!(unicode_escape_events(r"\+110000", '\\'), @r"
        0..8 err Unicode escape value out of range
        ");
    }
}

0 commit comments

Comments
 (0)