|
| 1 | +use std::fmt; |
| 2 | +use std::ops::{Range, RangeInclusive}; |
| 3 | + |
/// The two spellings of a numeric Unicode escape, distinguished by how many
/// hexadecimal digits follow the escape character.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum UnicodeEscapeKind {
    /// Escape character, a `+` sign, then six hex digits.
    Extended,
    /// Escape character followed directly by four hex digits.
    Short,
}

impl UnicodeEscapeKind {
    /// Number of hexadecimal digits this escape form requires.
    fn count(&self) -> u32 {
        match self {
            UnicodeEscapeKind::Extended => 6,
            UnicodeEscapeKind::Short => 4,
        }
    }
}

/// Reasons a Unicode escape sequence can be rejected.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum UnicodeEscError {
    /// The escape character is not followed by a recognized escape form.
    InvalidEscape,
    /// A surrogate code unit appears without a matching partner.
    InvalidSurrogatePair,
    /// The escape encodes a value that is not a valid code point.
    OutOfRange,
    /// The escape has fewer hex digits than its form requires.
    RequiresHexDigits {
        /// Which escape form was started.
        kind: UnicodeEscapeKind,
        /// The escape character in use, echoed in the message.
        escape_char: char,
    },
}

impl fmt::Display for UnicodeEscError {
    /// Writes the user-facing message for this error.
    ///
    /// For [`UnicodeEscError::RequiresHexDigits`] the message ends with a
    /// template of the expected escape, e.g. `\XXXX` or `\+XXXXXX`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::InvalidEscape => f.write_str("Invalid Unicode escape sequence"),
            Self::InvalidSurrogatePair => f.write_str("Invalid Unicode surrogate pair"),
            Self::OutOfRange => f.write_str("Unicode escape value out of range"),
            Self::RequiresHexDigits { kind, escape_char } => {
                let required = kind.count();
                // Only the extended form has a `+` between the escape
                // character and its digits.
                let plus = match kind {
                    UnicodeEscapeKind::Extended => "+",
                    UnicodeEscapeKind::Short => "",
                };
                // One `X` placeholder per required digit.
                let xs = "X".repeat(required as usize);
                write!(
                    f,
                    "Unicode escape requires {required} hex digits: {escape_char}{plus}{xs}"
                )
            }
        }
    }
}

// `Debug` + `Display` are in place, so the type can take part in standard
// error handling (`Box<dyn Error>`, `?` conversions, error reporters).
impl std::error::Error for UnicodeEscError {}
| 49 | + |
| 50 | +pub(crate) fn escape_unicode_esc_str<F>(text: &str, escape_char: char, mut callback: F) |
| 51 | +where |
| 52 | + F: FnMut(Range<usize>, Result<char, UnicodeEscError>), |
| 53 | +{ |
| 54 | + const HIGH_SURROGATE: RangeInclusive<u32> = 0xD800..=0xDBFF; |
| 55 | + const LOW_SURROGATE: RangeInclusive<u32> = 0xDC00..=0xDFFF; |
| 56 | + const MAX_CODEPOINT: u32 = 0x10FFFF; |
| 57 | + |
| 58 | + let mut chars = text.char_indices().peekable(); |
| 59 | + let mut high_surrogate: Option<(Range<usize>, u32)> = None; |
| 60 | + |
| 61 | + while let Some((escape_start, c)) = chars.next() { |
| 62 | + if c != escape_char { |
| 63 | + if let Some((hi_range, _)) = high_surrogate.take() { |
| 64 | + callback(hi_range, Err(UnicodeEscError::InvalidSurrogatePair)); |
| 65 | + } |
| 66 | + callback(escape_start..escape_start + c.len_utf8(), Ok(c)); |
| 67 | + continue; |
| 68 | + } |
| 69 | + let kind = match chars.peek() { |
| 70 | + Some(&(_, c)) if c == escape_char => { |
| 71 | + chars.next(); |
| 72 | + if let Some((hi_range, _)) = high_surrogate.take() { |
| 73 | + callback(hi_range, Err(UnicodeEscError::InvalidSurrogatePair)); |
| 74 | + } |
| 75 | + let end = escape_start + escape_char.len_utf8() * 2; |
| 76 | + callback(escape_start..end, Ok(escape_char)); |
| 77 | + continue; |
| 78 | + } |
| 79 | + Some(&(_, '+')) => { |
| 80 | + chars.next(); |
| 81 | + UnicodeEscapeKind::Extended |
| 82 | + } |
| 83 | + Some(&(_, c)) if c.is_ascii_hexdigit() => UnicodeEscapeKind::Short, |
| 84 | + _ => { |
| 85 | + let end = chars |
| 86 | + .next() |
| 87 | + .map(|(i, c)| i + c.len_utf8()) |
| 88 | + .unwrap_or(text.len()); |
| 89 | + if let Some((hi_range, _)) = high_surrogate.take() { |
| 90 | + callback(hi_range, Err(UnicodeEscError::InvalidSurrogatePair)); |
| 91 | + } |
| 92 | + callback(escape_start..end, Err(UnicodeEscError::InvalidEscape)); |
| 93 | + continue; |
| 94 | + } |
| 95 | + }; |
| 96 | + let mut codepoint: u32 = 0; |
| 97 | + let mut got_all = true; |
| 98 | + let mut last_end = chars.peek().map(|&(i, _)| i).unwrap_or(text.len()); |
| 99 | + for _ in 0..kind.count() { |
| 100 | + let radix = 16; |
| 101 | + let Some(&(i, ch)) = chars.peek() else { |
| 102 | + got_all = false; |
| 103 | + break; |
| 104 | + }; |
| 105 | + let Some(d) = ch.to_digit(radix) else { |
| 106 | + got_all = false; |
| 107 | + break; |
| 108 | + }; |
| 109 | + chars.next(); |
| 110 | + codepoint = codepoint * radix + d; |
| 111 | + last_end = i + ch.len_utf8(); |
| 112 | + } |
| 113 | + if !got_all { |
| 114 | + if let Some((hi_range, _)) = high_surrogate.take() { |
| 115 | + callback(hi_range, Err(UnicodeEscError::InvalidSurrogatePair)); |
| 116 | + } |
| 117 | + callback( |
| 118 | + escape_start..last_end, |
| 119 | + Err(UnicodeEscError::RequiresHexDigits { kind, escape_char }), |
| 120 | + ); |
| 121 | + continue; |
| 122 | + } |
| 123 | + if let Some((hi_range, hi_cp)) = high_surrogate.take() { |
| 124 | + if LOW_SURROGATE.contains(&codepoint) { |
| 125 | + let combined = 0x10000 + ((hi_cp - 0xD800) << 10) + (codepoint - 0xDC00); |
| 126 | + let ch = char::from_u32(combined).unwrap(); |
| 127 | + callback(hi_range.start..last_end, Ok(ch)); |
| 128 | + continue; |
| 129 | + } |
| 130 | + callback( |
| 131 | + hi_range.start..last_end, |
| 132 | + Err(UnicodeEscError::InvalidSurrogatePair), |
| 133 | + ); |
| 134 | + continue; |
| 135 | + } |
| 136 | + if codepoint > MAX_CODEPOINT { |
| 137 | + callback(escape_start..last_end, Err(UnicodeEscError::OutOfRange)); |
| 138 | + } else if HIGH_SURROGATE.contains(&codepoint) { |
| 139 | + high_surrogate = Some((escape_start..last_end, codepoint)); |
| 140 | + } else if LOW_SURROGATE.contains(&codepoint) { |
| 141 | + callback( |
| 142 | + escape_start..last_end, |
| 143 | + Err(UnicodeEscError::InvalidSurrogatePair), |
| 144 | + ); |
| 145 | + } else { |
| 146 | + let ch = char::from_u32(codepoint).unwrap(); |
| 147 | + callback(escape_start..last_end, Ok(ch)); |
| 148 | + } |
| 149 | + } |
| 150 | + if let Some((range, _)) = high_surrogate { |
| 151 | + callback(range, Err(UnicodeEscError::InvalidSurrogatePair)); |
| 152 | + } |
| 153 | +} |
| 154 | + |
#[cfg(test)]
mod tests {
    use insta::assert_snapshot;

    use super::*;

    /// Runs `escape_unicode_esc_str` over `text` and renders each callback
    /// invocation as one `start..end ok <char>` or `start..end err <message>`
    /// line, joined with newlines, so it can be compared as a snapshot.
    fn unicode_escape_events(text: &str, escape_char: char) -> String {
        let mut lines: Vec<String> = Vec::new();

        escape_unicode_esc_str(text, escape_char, |span, outcome| {
            lines.push(match outcome {
                Ok(ch) => format!("{}..{} ok {ch:?}", span.start, span.end),
                Err(err) => format!("{}..{} err {err}", span.start, span.end),
            });
        });

        lines.join("\n")
    }

    // An escape with too few digits between the two surrogates keeps them
    // from pairing: each surrogate is reported on its own.
    #[test]
    fn incomplete_unicode_escape_breaks_surrogate_pairing() {
        assert_snapshot!(unicode_escape_events(r"\D800\006\DC00", '\\'), @r"
        0..5 err Invalid Unicode surrogate pair
        5..9 err Unicode escape requires 4 hex digits: \XXXX
        9..14 err Invalid Unicode surrogate pair
        ");
    }

    // Same as above, but the intervening escape is unrecognized rather than
    // too short.
    #[test]
    fn invalid_unicode_escape_breaks_surrogate_pairing() {
        assert_snapshot!(unicode_escape_events(r"\D800\Q\DC00", '\\'), @r"
        0..5 err Invalid Unicode surrogate pair
        5..7 err Invalid Unicode escape sequence
        7..12 err Invalid Unicode surrogate pair
        ");
    }

    // The character after a bad escape is reported as part of the error, not
    // as a separate literal.
    #[test]
    fn invalid_unicode_escape_does_not_emit_literal_char() {
        assert_snapshot!(unicode_escape_events(r"\0061\Q\0062", '\\'), @r"
        0..5 ok 'a'
        5..7 err Invalid Unicode escape sequence
        7..12 ok 'b'
        ");
    }

    // The escape character is configurable; `!` behaves like `\`.
    #[test]
    fn invalid_unicode_escape_works_with_custom_escape_char() {
        assert_snapshot!(unicode_escape_events("!0061!Q!0062", '!'), @r"
        0..5 ok 'a'
        5..7 err Invalid Unicode escape sequence
        7..12 ok 'b'
        ");
    }

    // A complete non-surrogate escape after a high surrogate yields a single
    // error spanning both escapes; the second escape's character is dropped.
    #[test]
    fn valid_unicode_escape_after_high_surrogate_only_emits_error() {
        assert_snapshot!(unicode_escape_events(r"\D800\0061", '\\'), @r"
        0..10 err Invalid Unicode surrogate pair
        ");
    }
}
0 commit comments