Skip to content

Commit 94a6d52

Browse files
delexw and claude
authored and committed
fix(parser): sanitize lone UTF-16 surrogates before JSONL parsing
JSONL files written by Claude Code before v2.1.132 may contain lone UTF-16 surrogate code units (e.g. `\uD83D` without a matching low surrogate) when the tool-error truncation logic split a multi-byte emoji at an offset boundary. serde_json rejects lone surrogates per RFC 8259, causing parse_entry to silently discard those lines. Add sanitize_lone_surrogates(), which scans the raw JSONL string for `\uXXXX` escape sequences in the surrogate range (U+D800-U+DFFF) and replaces each lone surrogate with the escape `\uFFFD` (the Unicode replacement character) before the JSON deserializer sees it. Valid surrogate pairs (a high surrogate `\uD800`-`\uDBFF` followed immediately by a low surrogate `\uDC00`-`\uDFFF`) are preserved unchanged. Allocation is deferred: strings with no lone surrogates return Cow::Borrowed with zero copies. Update parse_entry to convert the input bytes to str (failing fast on non-UTF-8) and apply the sanitizer before serde_json::from_str. Closes #85
1 parent 38b27b4 commit 94a6d52

1 file changed

Lines changed: 205 additions & 1 deletion

File tree

src-tauri/src/parser/entry.rs

Lines changed: 205 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use serde::Deserialize;
22
use serde_json::Value;
3+
use std::borrow::Cow;
34
use std::collections::HashMap;
45

56
/// Deserializes a JSON string field, treating `null` as the type's default
@@ -155,12 +156,85 @@ impl Entry {
155156
}
156157
}
157158

159+
/// Parses the first 4 bytes of `bytes` as ASCII hex digits into a `u16`.
///
/// Returns `None` if fewer than 4 bytes are supplied or if any of the first
/// four bytes is not an ASCII hex digit (`0-9`, `a-f`, `A-F`). Each digit is
/// validated explicitly: `u16::from_str_radix` would also accept a leading
/// `+` sign, which is not valid inside a JSON `\uXXXX` escape.
fn hex4_to_u16(bytes: &[u8]) -> Option<u16> {
    let digits = bytes.get(..4)?;
    digits.iter().try_fold(0u16, |acc, &b| {
        let nibble = char::from(b).to_digit(16)?;
        // `nibble` is always < 16, so the narrowing cast cannot truncate.
        Some((acc << 4) | nibble as u16)
    })
}

/// Replaces lone UTF-16 surrogates (U+D800–U+DFFF) in JSON `\uXXXX` escape
/// sequences with the escape `\uFFFD` (the Unicode replacement character).
///
/// JSONL files written by Claude Code before v2.1.132 may contain lone
/// surrogates when the tool-error truncation logic split a multi-byte emoji
/// at an offset boundary. serde_json rejects lone surrogates; this pass makes
/// such lines parseable before they reach the deserializer.
///
/// Rules applied while scanning the raw text:
/// - A high surrogate (`\uD800`–`\uDBFF`) immediately followed by a low
///   surrogate escape (`\uDC00`–`\uDFFF`) is a valid pair and is kept as-is.
/// - Any other surrogate escape is lone and is replaced by `\uFFFD`.
/// - Every other backslash escape (`\\`, `\"`, `\n`, ...) is consumed as a
///   two-byte unit, so the second byte of `\\` is never mistaken for the
///   start of a `\u` escape. Without this, the JSON text `\\uD83D` (an
///   escaped backslash followed by the literal characters `uD83D`) would be
///   rewritten even though it contains no surrogate escape.
///
/// Returns `Cow::Borrowed(s)` when nothing had to be replaced; the output
/// buffer is only allocated once the first lone surrogate is found.
fn sanitize_lone_surrogates(s: &str) -> Cow<'_, str> {
    const REPLACEMENT: &[u8] = b"\\uFFFD";

    let bytes = s.as_bytes();
    let len = bytes.len();
    let mut i = 0;
    // Delay allocation until the first lone surrogate is found.
    let mut result: Option<Vec<u8>> = None;

    while i < len {
        if bytes[i] != b'\\' {
            if let Some(buf) = result.as_mut() {
                buf.push(bytes[i]);
            }
            i += 1;
            continue;
        }

        // `bytes[i]` starts an escape. Work out how many bytes it spans and
        // whether it must be replaced.
        let (consumed, replace) = match bytes.get(i + 1) {
            Some(b'u') => match bytes.get(i + 2..i + 6).and_then(hex4_to_u16) {
                Some(0xD800..=0xDBFF) => {
                    // High surrogate — valid only when immediately followed
                    // by a `\uDC00`–`\uDFFF` escape.
                    let is_valid_pair = bytes.get(i + 6..i + 8) == Some(&b"\\u"[..])
                        && bytes
                            .get(i + 8..i + 12)
                            .and_then(hex4_to_u16)
                            .is_some_and(|lo| (0xDC00..=0xDFFF).contains(&lo));
                    if is_valid_pair {
                        (12, false)
                    } else {
                        (6, true)
                    }
                }
                // Low surrogate that was not consumed as part of a pair.
                Some(0xDC00..=0xDFFF) => (6, true),
                // Ordinary BMP escape: copy through untouched.
                Some(_) => (6, false),
                // Malformed `\u` escape: leave it for the JSON parser to report.
                None => (1, false),
            },
            // Any other escape (`\\`, `\"`, `\n`, ...) is a two-byte unit.
            Some(_) => (2, false),
            // Trailing backslash at end of input.
            None => (1, false),
        };

        if replace {
            result
                .get_or_insert_with(|| {
                    let mut buf = Vec::with_capacity(len);
                    buf.extend_from_slice(&bytes[..i]);
                    buf
                })
                .extend_from_slice(REPLACEMENT);
        } else if let Some(buf) = result.as_mut() {
            buf.extend_from_slice(&bytes[i..i + consumed]);
        }
        i += consumed;
    }

    match result {
        None => Cow::Borrowed(s),
        // Only ASCII escapes are swapped for other ASCII bytes, so the buffer
        // is expected to stay valid UTF-8; the lossy fallback avoids a panic
        // if that expectation is ever broken.
        Some(buf) => Cow::Owned(
            String::from_utf8(buf)
                .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned()),
        ),
    }
}
229+
158230
/// Parse a single JSONL line into an Entry.
159231
/// Returns None if the JSON is invalid, the entry has no UUID, or the entry
160232
/// has no type (guards against empty entries written by async PostToolUse
161233
/// hooks in Claude Code pre-v2.1.119).
162234
pub fn parse_entry(line: &[u8]) -> Option<Entry> {
163-
let e: Entry = serde_json::from_slice(line).ok()?;
235+
let s = std::str::from_utf8(line).ok()?;
236+
let sanitized = sanitize_lone_surrogates(s);
237+
let e: Entry = serde_json::from_str(&sanitized).ok()?;
164238
if (e.uuid.is_empty() && e.leaf_uuid.is_empty()) || e.entry_type.is_empty() {
165239
return None;
166240
}
@@ -547,6 +621,136 @@ mod tests {
547621
);
548622
}
549623

624+
// --- Issue #85: lone UTF-16 surrogate sanitization ---
625+
626+
#[test]
fn sanitize_lone_surrogates_no_surrogates_returns_borrowed() {
    // Plain JSON with no `\u` escapes. Inside a raw string a quote needs no
    // backslash, so the fixture is written as the actual JSON text.
    let s = r#"{"key":"hello world"}"#;
    let result = sanitize_lone_surrogates(s);
    assert!(
        matches!(result, Cow::Borrowed(_)),
        "no surrogates -- must return borrowed (no allocation)"
    );
    assert_eq!(result.as_ref(), s);
}
636+
637+
#[test]
fn sanitize_lone_high_surrogate_replaced_with_fffd() {
    // \uD83D is a lone high surrogate (no following \uDCxx).
    // The function outputs the literal JSON escape \uFFFD (6 ASCII chars).
    // The fixture is real JSON text: quotes need no backslash in a raw string.
    let s = r#"{"key":"emoji \uD83D truncated"}"#;
    let result = sanitize_lone_surrogates(s);
    assert!(
        result.as_ref().contains(r"\uFFFD"),
        "replacement escape must be present"
    );
    assert!(
        !result.as_ref().contains(r"\uD83D"),
        "lone surrogate must be removed"
    );
    // Everything around the escape must be left untouched.
    assert_eq!(result.as_ref(), r#"{"key":"emoji \uFFFD truncated"}"#);
}
652+
653+
#[test]
fn sanitize_lone_low_surrogate_replaced_with_fffd() {
    // \uDC36 is a lone low surrogate (not preceded by a high surrogate).
    // The fixture is real JSON text: quotes need no backslash in a raw string.
    let s = r#"{"key":"broken \uDC36 emoji"}"#;
    let result = sanitize_lone_surrogates(s);
    assert!(
        result.as_ref().contains(r"\uFFFD"),
        "replacement escape must be present"
    );
    assert!(
        !result.as_ref().contains(r"\uDC36"),
        "lone surrogate must be removed"
    );
    // Everything around the escape must be left untouched.
    assert_eq!(result.as_ref(), r#"{"key":"broken \uFFFD emoji"}"#);
}
667+
668+
#[test]
fn sanitize_valid_surrogate_pair_unchanged() {
    // \uD83D\uDC36 is a valid surrogate pair (dog face emoji).
    // The fixture is real JSON text: quotes need no backslash in a raw string.
    let s = r#"{"key":"dog \uD83D\uDC36 emoji"}"#;
    let result = sanitize_lone_surrogates(s);
    assert!(
        matches!(result, Cow::Borrowed(_)),
        "valid pair must return borrowed (no modification)"
    );
    assert_eq!(result.as_ref(), s);
}
679+
680+
#[test]
fn sanitize_multiple_lone_surrogates_all_replaced() {
    // One lone high surrogate and one lone low surrogate in separate values.
    // The fixture is real JSON text: quotes need no backslash in a raw string.
    let s = r#"{"a":"\uD83D","b":"\uDC00"}"#;
    let result = sanitize_lone_surrogates(s);
    assert!(
        !result.as_ref().contains(r"\uD83D"),
        "first lone surrogate must be removed"
    );
    assert!(
        !result.as_ref().contains(r"\uDC00"),
        "second lone surrogate must be removed"
    );
    let fffd_count = result.as_ref().match_indices(r"\uFFFD").count();
    assert_eq!(
        fffd_count, 2,
        "both lone surrogates must be replaced with \\uFFFD"
    );
    // The surrounding JSON structure must be left untouched.
    assert_eq!(result.as_ref(), r#"{"a":"\uFFFD","b":"\uFFFD"}"#);
}
698+
699+
#[test]
fn sanitize_high_surrogate_at_end_of_string_replaced() {
    // \uD83D at end of value -- no room for a low surrogate, must be replaced.
    // The fixture is a real JSON string: quotes need no backslash in a raw string.
    let s = r#""\uD83D""#;
    let result = sanitize_lone_surrogates(s);
    assert!(
        result.as_ref().contains(r"\uFFFD"),
        "replacement escape must be present"
    );
    assert!(
        !result.as_ref().contains(r"\uD83D"),
        "lone surrogate must be removed"
    );
    assert_eq!(result.as_ref(), r#""\uFFFD""#);

    // Boundary case: the escape is the very last bytes of the input, so the
    // look-ahead for a low surrogate has nothing to read.
    assert_eq!(sanitize_lone_surrogates(r"\uD83D").as_ref(), r"\uFFFD");
}
713+
714+
#[test]
fn parse_entry_with_lone_high_surrogate_succeeds() {
    // Mimics a JSONL line from Claude Code < v2.1.132 in which tool-error
    // truncation left a lone \uD83D (high surrogate with no low surrogate after it).
    // Without sanitization serde_json rejects the line; parse_entry must accept it.
    let raw = r#"{"type":"user","uuid":"emoji-lone-high","timestamp":"2026-05-01T10:00:00Z","message":{"role":"user","content":"truncated emoji: \uD83D end"}}"#;
    let Some(parsed) = parse_entry(raw.as_bytes()) else {
        panic!("parse_entry must succeed despite lone high surrogate");
    };
    assert_eq!(parsed.uuid, "emoji-lone-high");
    assert_eq!(parsed.entry_type, "user");
}
729+
730+
#[test]
fn parse_entry_with_lone_low_surrogate_succeeds() {
    // A low surrogate \uDC36 that has no high surrogate in front of it.
    let raw = r#"{"type":"user","uuid":"emoji-lone-low","timestamp":"2026-05-01T10:00:00Z","message":{"role":"user","content":"lone low: \uDC36"}}"#;
    match parse_entry(raw.as_bytes()) {
        Some(parsed) => assert_eq!(parsed.uuid, "emoji-lone-low"),
        None => panic!("parse_entry must succeed despite lone low surrogate"),
    }
}
741+
742+
#[test]
fn parse_entry_with_valid_surrogate_pair_succeeds() {
    // A well-formed pair \uD83D\uDC36 (dog face) has to parse without trouble.
    let raw = r#"{"type":"user","uuid":"emoji-valid-pair","timestamp":"2026-05-01T10:00:00Z","message":{"role":"user","content":"dog: \uD83D\uDC36"}}"#;
    let Some(parsed) = parse_entry(raw.as_bytes()) else {
        panic!("parse_entry must succeed with a valid surrogate pair");
    };
    assert_eq!(parsed.uuid, "emoji-valid-pair");
}
753+
550754
#[test]
551755
fn parse_entry_unknown_fields_are_silently_ignored() {
552756
// Future Claude Code versions may add more fields. The parser must never crash on

0 commit comments

Comments
 (0)