Skip to content

Commit 9d38e9c

Browse files
Brooooooklyn and claude authored
fix: escape non-ASCII characters as \uNNNN in string literals (#11)
Match TypeScript's emitter behavior by escaping all non-ASCII characters (code point > 0x7F) as \uNNNN sequences; ASCII control characters, including DEL (0x7F), continue to be escaped as well. Characters above the BMP use UTF-16 surrogate pairs. Uses push_str with hex table lookup instead of fmt::Write. Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 5627f7b commit 9d38e9c

File tree

2 files changed

+94
-39
lines changed

2 files changed

+94
-39
lines changed

crates/oxc_angular_compiler/src/output/emitter.rs

Lines changed: 80 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1227,9 +1227,11 @@ fn is_nullish_coalesce(expr: &OutputExpression<'_>) -> bool {
12271227
/// Escape a string for JavaScript output.
12281228
///
12291229
/// Uses double quotes to match Angular's output style.
1230-
/// Only escapes control characters (`"`, `\`, `\n`, `\r`, and `$` when requested).
1231-
/// Non-ASCII printable characters (e.g. `×`, `é`, `α`) are emitted as literal UTF-8,
1232-
/// matching Angular's `escapeIdentifier` behavior.
1230+
/// Escapes `"`, `\`, `\n`, `\r`, `$` (when requested), ASCII control characters,
1231+
/// and all non-ASCII characters (code point > 0x7F) as `\uNNNN` sequences.
1232+
/// Characters above the BMP (U+10000+) are encoded as UTF-16 surrogate pairs
1233+
/// (`\uXXXX\uXXXX`). This matches TypeScript's emitter behavior, which escapes
1234+
/// non-ASCII characters in string literals.
12331235
fn escape_string(input: &str, escape_dollar: bool) -> String {
12341236
let mut result = String::with_capacity(input.len() + 2);
12351237
result.push('"');
@@ -1240,19 +1242,37 @@ fn escape_string(input: &str, escape_dollar: bool) -> String {
12401242
'\n' => result.push_str("\\n"),
12411243
'\r' => result.push_str("\\r"),
12421244
'$' if escape_dollar => result.push_str("\\$"),
1243-
// Escape ASCII control characters (0x00-0x1F, 0x7F) other than \n and \r
1244-
c if c.is_ascii_control() => {
1245+
// ASCII printable characters (0x20-0x7E) are emitted literally
1246+
c if (' '..='\x7E').contains(&c) => result.push(c),
1247+
// Everything else (ASCII control chars, non-ASCII) is escaped as \uNNNN.
1248+
// Characters above the BMP are encoded as UTF-16 surrogate pairs.
1249+
c => {
12451250
let code = c as u32;
1246-
result.push_str(&format!("\\u{code:04X}"));
1251+
if code <= 0xFFFF {
1252+
push_unicode_escape(&mut result, code);
1253+
} else {
1254+
let hi = 0xD800 + ((code - 0x10000) >> 10);
1255+
let lo = 0xDC00 + ((code - 0x10000) & 0x3FF);
1256+
push_unicode_escape(&mut result, hi);
1257+
push_unicode_escape(&mut result, lo);
1258+
}
12471259
}
1248-
// All other characters (including non-ASCII printable) are emitted literally
1249-
_ => result.push(c),
12501260
}
12511261
}
12521262
result.push('"');
12531263
result
12541264
}
12551265

1266+
/// Push a `\uXXXX` escape sequence for a 16-bit code unit.
1267+
fn push_unicode_escape(buf: &mut String, code: u32) {
1268+
const HEX: &[u8; 16] = b"0123456789ABCDEF";
1269+
buf.push_str("\\u");
1270+
buf.push(HEX[((code >> 12) & 0xF) as usize] as char);
1271+
buf.push(HEX[((code >> 8) & 0xF) as usize] as char);
1272+
buf.push(HEX[((code >> 4) & 0xF) as usize] as char);
1273+
buf.push(HEX[(code & 0xF) as usize] as char);
1274+
}
1275+
12561276
/// Escape an identifier for use as a property key.
12571277
fn escape_identifier(input: &Atom<'_>, escape_dollar: bool, always_quote: bool) -> String {
12581278
// Check if the identifier is a valid JavaScript identifier
@@ -1487,35 +1507,35 @@ mod tests {
14871507

14881508
#[test]
14891509
fn test_escape_string_unicode_literals() {
1490-
// Non-ASCII printable characters should be emitted as literal UTF-8,
1491-
// matching Angular's escapeIdentifier behavior.
1510+
// Non-ASCII characters should be escaped as \uNNNN to match
1511+
// TypeScript's emitter behavior.
14921512

1493-
// &times; (multiplication sign U+00D7) -> literal ×
1494-
assert_eq!(escape_string("\u{00D7}", false), "\"\u{00D7}\"");
1513+
// &times; (multiplication sign U+00D7) -> \u00D7
1514+
assert_eq!(escape_string("\u{00D7}", false), "\"\\u00D7\"");
14951515

1496-
// &nbsp; (non-breaking space U+00A0) -> literal
1497-
assert_eq!(escape_string("\u{00A0}", false), "\"\u{00A0}\"");
1516+
// &nbsp; (non-breaking space U+00A0) -> \u00A0
1517+
assert_eq!(escape_string("\u{00A0}", false), "\"\\u00A0\"");
14981518

14991519
// Mixed ASCII and non-ASCII
1500-
assert_eq!(escape_string("a\u{00D7}b", false), "\"a\u{00D7}b\"");
1520+
assert_eq!(escape_string("a\u{00D7}b", false), "\"a\\u00D7b\"");
15011521

15021522
// Multiple non-ASCII characters
1503-
assert_eq!(escape_string("\u{00D7}\u{00A0}", false), "\"\u{00D7}\u{00A0}\"");
1523+
assert_eq!(escape_string("\u{00D7}\u{00A0}", false), "\"\\u00D7\\u00A0\"");
15041524

1505-
// Characters outside BMP (emoji) -> emitted literally
1506-
assert_eq!(escape_string("\u{1F600}", false), "\"\u{1F600}\"");
1525+
// Characters outside BMP (emoji) -> surrogate pair
1526+
assert_eq!(escape_string("\u{1F600}", false), "\"\\uD83D\\uDE00\"");
15071527

1508-
// Common HTML entities -> all emitted literally
1509-
assert_eq!(escape_string("\u{00A9}", false), "\"\u{00A9}\""); // &copy; ©
1510-
assert_eq!(escape_string("\u{00AE}", false), "\"\u{00AE}\""); // &reg; ®
1511-
assert_eq!(escape_string("\u{2014}", false), "\"\u{2014}\""); // &mdash; —
1512-
assert_eq!(escape_string("\u{2013}", false), "\"\u{2013}\""); // &ndash; –
1528+
// Common HTML entities -> all escaped as \uNNNN
1529+
assert_eq!(escape_string("\u{00A9}", false), "\"\\u00A9\""); // &copy; ©
1530+
assert_eq!(escape_string("\u{00AE}", false), "\"\\u00AE\""); // &reg; ®
1531+
assert_eq!(escape_string("\u{2014}", false), "\"\\u2014\""); // &mdash; —
1532+
assert_eq!(escape_string("\u{2013}", false), "\"\\u2013\""); // &ndash; –
15131533

15141534
// Greek letter alpha
1515-
assert_eq!(escape_string("\u{03B1}", false), "\"\u{03B1}\""); // α
1535+
assert_eq!(escape_string("\u{03B1}", false), "\"\\u03B1\""); // α
15161536

15171537
// Accented Latin letter
1518-
assert_eq!(escape_string("\u{00E9}", false), "\"\u{00E9}\""); // é
1538+
assert_eq!(escape_string("\u{00E9}", false), "\"\\u00E9\""); // é
15191539
}
15201540

15211541
#[test]
@@ -1533,6 +1553,41 @@ mod tests {
15331553
assert_eq!(escape_string("\r", false), "\"\\r\"");
15341554
}
15351555

1556+
#[test]
1557+
fn test_escape_string_non_ascii_as_unicode_escapes() {
1558+
// Non-ASCII characters should be escaped as \uNNNN to match
1559+
// TypeScript's emitter behavior (which escapes non-ASCII in string literals).
1560+
1561+
// Non-breaking space U+00A0
1562+
assert_eq!(escape_string("\u{00A0}", false), "\"\\u00A0\"");
1563+
1564+
// En dash U+2013
1565+
assert_eq!(escape_string("\u{2013}", false), "\"\\u2013\"");
1566+
1567+
// Trademark U+2122
1568+
assert_eq!(escape_string("\u{2122}", false), "\"\\u2122\"");
1569+
1570+
// Infinity U+221E
1571+
assert_eq!(escape_string("\u{221E}", false), "\"\\u221E\"");
1572+
1573+
// Mixed ASCII and non-ASCII
1574+
assert_eq!(escape_string("a\u{00D7}b", false), "\"a\\u00D7b\"");
1575+
1576+
// Multiple non-ASCII characters
1577+
assert_eq!(escape_string("\u{00D7}\u{00A0}", false), "\"\\u00D7\\u00A0\"");
1578+
1579+
// Characters above BMP should use surrogate pairs
1580+
// U+1F600 (grinning face) = surrogate pair D83D DE00
1581+
assert_eq!(escape_string("\u{1F600}", false), "\"\\uD83D\\uDE00\"");
1582+
1583+
// U+10000 (first supplementary char) = surrogate pair D800 DC00
1584+
assert_eq!(escape_string("\u{10000}", false), "\"\\uD800\\uDC00\"");
1585+
1586+
// ASCII printable chars (0x20-0x7E) should remain literal
1587+
assert_eq!(escape_string(" ~", false), "\" ~\"");
1588+
assert_eq!(escape_string("abc123!@#", false), "\"abc123!@#\"");
1589+
}
1590+
15361591
// ========================================================================
15371592
// Source Map Tests
15381593
// ========================================================================

crates/oxc_angular_compiler/tests/integration_test.rs

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -145,26 +145,26 @@ fn test_multiple_interpolations() {
145145

146146
#[test]
147147
fn test_html_entity_between_interpolations() {
148-
// HTML entity &times; between two interpolations should produce literal UTF-8 in the output
148+
// HTML entity &times; between two interpolations should produce \u00D7 in the output
149149
let js = compile_template_to_js("<div>{{ a }}&times;{{ b }}</div>", "TestComponent");
150-
// Should produce: textInterpolate2("", ctx.a, "\u{00D7}", ctx.b)
151-
// Note: × (multiplication sign) = U+00D7, emitted as literal UTF-8
150+
// Should produce: textInterpolate2("", ctx.a, "\u00D7", ctx.b)
151+
// Note: × (multiplication sign) = U+00D7, escaped as \u00D7
152152
assert!(
153-
js.contains("textInterpolate2(\"\",ctx.a,\"\u{00D7}\",ctx.b)"),
154-
"Expected textInterpolate2 with literal times character. Got:\n{js}"
153+
js.contains(r#"textInterpolate2("",ctx.a,"\u00D7",ctx.b)"#),
154+
"Expected textInterpolate2 with escaped times character. Got:\n{js}"
155155
);
156156
}
157157

158158
#[test]
159159
fn test_html_entity_at_start_of_interpolation() {
160160
// Entity at start: &times;{{ a }}
161161
let js = compile_template_to_js("<div>&times;{{ a }}</div>", "TestComponent");
162-
// Should produce: textInterpolate1("\u{00D7}", ctx.a)
163-
// Note: × (multiplication sign) = U+00D7, emitted as literal UTF-8
162+
// Should produce: textInterpolate1("\u00D7", ctx.a)
163+
// Note: × (multiplication sign) = U+00D7, escaped as \u00D7
164164
assert!(
165-
js.contains("textInterpolate1(\"\u{00D7}\",ctx.a)")
166-
|| js.contains("textInterpolate(\"\u{00D7}\",ctx.a)"),
167-
"Expected textInterpolate with literal times character at start. Got:\n{js}"
165+
js.contains(r#"textInterpolate1("\u00D7",ctx.a)"#)
166+
|| js.contains(r#"textInterpolate("\u00D7",ctx.a)"#),
167+
"Expected textInterpolate with escaped times character at start. Got:\n{js}"
168168
);
169169
}
170170

@@ -173,11 +173,11 @@ fn test_multiple_html_entities_between_interpolations() {
173173
// Multiple entities: {{ a }}&nbsp;&times;&nbsp;{{ b }}
174174
let js =
175175
compile_template_to_js("<div>{{ a }}&nbsp;&times;&nbsp;{{ b }}</div>", "TestComponent");
176-
// Should produce: textInterpolate2("", ctx.a, "\u{00A0}\u{00D7}\u{00A0}", ctx.b)
177-
// Note: &nbsp; = U+00A0, &times; = U+00D7, both emitted as literal UTF-8
176+
// Should produce: textInterpolate2("", ctx.a, "\u00A0\u00D7\u00A0", ctx.b)
177+
// Note: &nbsp; = U+00A0, &times; = U+00D7, both escaped as \uNNNN
178178
assert!(
179-
js.contains("textInterpolate2(\"\",ctx.a,\"\u{00A0}\u{00D7}\u{00A0}\",ctx.b)"),
180-
"Expected textInterpolate2 with literal Unicode entities. Got:\n{js}"
179+
js.contains(r#"textInterpolate2("",ctx.a,"\u00A0\u00D7\u00A0",ctx.b)"#),
180+
"Expected textInterpolate2 with escaped Unicode entities. Got:\n{js}"
181181
);
182182
}
183183

0 commit comments

Comments
 (0)