Skip to content

Commit bd30fdb

Browse files
committed
fix(parser): sanitize lone UTF-16 surrogates before JSONL parsing
JSONL files written by Claude Code before v2.1.132 may contain lone UTF-16 surrogate code units (e.g. `\uD83D` without a matching low surrogate) when the tool-error truncation logic split a multi-byte emoji at an offset boundary. serde_json rejects lone surrogates per RFC 8259, causing parse_entry to silently discard those lines. Add sanitize_lone_surrogates() which scans the raw JSONL string for `\uXXXX` escape sequences in the surrogate range (U+D800-U+DFFF) and replaces lone surrogates with the `\uFFFD` escape (U+FFFD REPLACEMENT CHARACTER) before the JSON deserializer sees them. Valid surrogate pairs (a high surrogate \uD800-\uDBFF followed immediately by a low surrogate \uDC00-\uDFFF) are preserved unchanged. Allocation is deferred: strings with no lone surrogates return Cow::Borrowed with zero copies. Update parse_entry to convert the input bytes to str (failing fast on non-UTF-8) and apply the sanitizer before serde_json::from_str. Closes #85
1 parent 0af3f9a commit bd30fdb

1 file changed

Lines changed: 205 additions & 1 deletion

File tree

src-tauri/src/parser/entry.rs

Lines changed: 205 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use serde::Deserialize;
22
use serde_json::Value;
3+
use std::borrow::Cow;
34
use std::collections::HashMap;
45

56
/// Deserializes a JSON string field, treating `null` as the type's default
@@ -147,12 +148,85 @@ impl Entry {
147148
}
148149
}
149150

151+
/// Length in bytes of a JSON `\uXXXX` escape: backslash, `u`, four hex digits.
const UNICODE_ESCAPE_LEN: usize = 6;

/// Parses the first four bytes of `bytes` as ASCII hex digits into a `u16`.
///
/// Returns `None` if `bytes` holds fewer than 4 bytes or if any of the first
/// four is not an ASCII hex digit. Digits are validated one by one because
/// `u16::from_str_radix` also accepts a leading `+` sign, which is not a
/// valid part of a JSON `\uXXXX` escape.
fn hex4_to_u16(bytes: &[u8]) -> Option<u16> {
    bytes.get(..4)?.iter().try_fold(0u16, |acc, &b| {
        let digit = char::from(b).to_digit(16)?;
        // `digit` is < 16, so the conversion cannot fail and the shift of a
        // value built from at most three prior digits cannot overflow.
        Some((acc << 4) | u16::try_from(digit).ok()?)
    })
}

/// Returns the UTF-16 code unit encoded by a `\uXXXX` escape starting at byte
/// offset `at`, or `None` if no well-formed escape starts there.
fn escape_code_unit(bytes: &[u8], at: usize) -> Option<u16> {
    match bytes.get(at..at.checked_add(UNICODE_ESCAPE_LEN)?)? {
        [b'\\', b'u', hex @ ..] => hex4_to_u16(hex),
        _ => None,
    }
}

/// Replaces lone UTF-16 surrogates (U+D800–U+DFFF) written as JSON `\uXXXX`
/// escapes with the six-character escape `\uFFFD` (the Unicode replacement
/// character). JSONL files written by Claude Code before v2.1.132 may contain
/// lone surrogates when the tool-error truncation logic split a multi-byte
/// emoji at an offset boundary. serde_json rejects lone surrogates; this pass
/// makes such lines parseable before they reach the deserializer.
///
/// A high surrogate (`\uD800`–`\uDBFF`) immediately followed by a low
/// surrogate (`\uDC00`–`\uDFFF`) is a valid pair and is left untouched.
/// Two-character escapes such as `\\` and `\"` are skipped as a unit, so an
/// escaped backslash followed by the literal text `uD83D` is not mistaken for
/// a surrogate escape.
///
/// Returns `Cow::Borrowed(s)` when nothing had to be replaced; an owned
/// string is allocated only once the first lone surrogate is found.
fn sanitize_lone_surrogates(s: &str) -> Cow<'_, str> {
    const REPLACEMENT: &str = "\\uFFFD";

    let bytes = s.as_bytes();
    // Allocated lazily on the first replacement.
    let mut out: Option<String> = None;
    // Start of the input region that has not been copied into `out` yet.
    let mut copied = 0;
    let mut i = 0;

    while i < bytes.len() {
        if bytes[i] != b'\\' {
            i += 1;
            continue;
        }
        match escape_code_unit(bytes, i) {
            // Two-character escape (`\\`, `\"`, `\n`, ...) or a malformed
            // `\u`: step over the backslash and the byte it escapes so the
            // escaped byte is never re-read as the start of a new escape.
            None => i += 2,
            // High surrogate directly followed by a low surrogate: valid pair.
            Some(0xD800..=0xDBFF)
                if matches!(
                    escape_code_unit(bytes, i + UNICODE_ESCAPE_LEN),
                    Some(0xDC00..=0xDFFF)
                ) =>
            {
                i += 2 * UNICODE_ESCAPE_LEN;
            }
            // Any other surrogate is lone: unpaired high, or a low surrogate
            // that was not consumed as the second half of a pair.
            Some(0xD800..=0xDFFF) => {
                let buf = out.get_or_insert_with(|| String::with_capacity(s.len()));
                // `copied` and `i` both sit on ASCII bytes, so these are
                // valid char boundaries for slicing `s`.
                buf.push_str(&s[copied..i]);
                buf.push_str(REPLACEMENT);
                i += UNICODE_ESCAPE_LEN;
                copied = i;
            }
            // Ordinary non-surrogate escape.
            Some(_) => i += UNICODE_ESCAPE_LEN,
        }
    }

    match out {
        None => Cow::Borrowed(s),
        Some(mut buf) => {
            buf.push_str(&s[copied..]);
            Cow::Owned(buf)
        }
    }
}
221+
150222
/// Parse a single JSONL line into an Entry.
151223
/// Returns None if the JSON is invalid, the entry has no UUID, or the entry
152224
/// has no type (guards against empty entries written by async PostToolUse
153225
/// hooks in Claude Code pre-v2.1.119).
154226
pub fn parse_entry(line: &[u8]) -> Option<Entry> {
155-
let e: Entry = serde_json::from_slice(line).ok()?;
227+
let s = std::str::from_utf8(line).ok()?;
228+
let sanitized = sanitize_lone_surrogates(s);
229+
let e: Entry = serde_json::from_str(&sanitized).ok()?;
156230
if (e.uuid.is_empty() && e.leaf_uuid.is_empty()) || e.entry_type.is_empty() {
157231
return None;
158232
}
@@ -459,4 +533,134 @@ mod tests {
459533
);
460534
assert!(!entry.is_compact_summary);
461535
}
536+
537+
// --- Issue #85: lone UTF-16 surrogate sanitization ---
538+
539+
#[test]
fn sanitize_lone_surrogates_no_surrogates_returns_borrowed() {
    // Raw strings process no escapes, so quotes are written bare here; a
    // `\"` inside `r#"..."#` would put literal backslashes into the input
    // and the fixture would no longer be the JSON it is meant to be.
    let s = r#"{"key":"hello world"}"#;
    let result = sanitize_lone_surrogates(s);
    assert!(
        matches!(result, Cow::Borrowed(_)),
        "no surrogates -- must return borrowed (no allocation)"
    );
    assert_eq!(result.as_ref(), s);
}
549+
550+
#[test]
fn sanitize_lone_high_surrogate_replaced_with_fffd() {
    // \uD83D is a lone high surrogate (no low-surrogate escape follows it).
    // The function outputs the literal JSON escape \uFFFD (6 ASCII chars).
    // Quotes are written bare because raw strings process no escapes.
    let s = r#"{"key":"emoji \uD83D truncated"}"#;
    let result = sanitize_lone_surrogates(s);
    // Exact comparison: the escape is swapped and nothing else changes.
    assert_eq!(
        result.as_ref(),
        r#"{"key":"emoji \uFFFD truncated"}"#,
        "lone high surrogate must be replaced with the \\uFFFD escape"
    );
}
565+
566+
#[test]
fn sanitize_lone_low_surrogate_replaced_with_fffd() {
    // \uDC36 is a lone low surrogate (not preceded by a high surrogate).
    // Quotes are written bare because raw strings process no escapes.
    let s = r#"{"key":"broken \uDC36 emoji"}"#;
    let result = sanitize_lone_surrogates(s);
    // Exact comparison: the escape is swapped and nothing else changes.
    assert_eq!(
        result.as_ref(),
        r#"{"key":"broken \uFFFD emoji"}"#,
        "lone low surrogate must be replaced with the \\uFFFD escape"
    );
}
580+
581+
#[test]
fn sanitize_valid_surrogate_pair_unchanged() {
    // \uD83D\uDC36 is a valid surrogate pair (dog face emoji).
    // Quotes are written bare because raw strings process no escapes.
    let s = r#"{"key":"dog \uD83D\uDC36 emoji"}"#;
    let result = sanitize_lone_surrogates(s);
    assert!(
        matches!(result, Cow::Borrowed(_)),
        "valid pair must return borrowed (no modification)"
    );
    assert_eq!(result.as_ref(), s);
}
592+
593+
#[test]
fn sanitize_multiple_lone_surrogates_all_replaced() {
    // One lone high surrogate and one lone low surrogate in separate values.
    // Quotes are written bare because raw strings process no escapes.
    let s = r#"{"a":"\uD83D","b":"\uDC00"}"#;
    let result = sanitize_lone_surrogates(s);
    // Exact comparison covers both replacements and the untouched remainder.
    assert_eq!(
        result.as_ref(),
        r#"{"a":"\uFFFD","b":"\uFFFD"}"#,
        "both lone surrogates must be replaced with \\uFFFD"
    );
}
611+
612+
#[test]
fn sanitize_high_surrogate_at_end_of_string_replaced() {
    // \uD83D at end of a JSON string value -- only the closing quote follows,
    // so there is no room for a low surrogate and it must be replaced.
    // Quotes are written bare because raw strings process no escapes.
    let s = r#""\uD83D""#;
    let result = sanitize_lone_surrogates(s);
    assert_eq!(
        result.as_ref(),
        r#""\uFFFD""#,
        "lone surrogate before the closing quote must be replaced"
    );

    // The escape is the final six bytes of the input: the pair look-ahead
    // must not read past the end.
    let bare = sanitize_lone_surrogates(r"\uD83D");
    assert_eq!(
        bare.as_ref(),
        r"\uFFFD",
        "lone surrogate ending the input must be replaced"
    );
}
626+
627+
#[test]
fn parse_entry_with_lone_high_surrogate_succeeds() {
    // A JSONL line as Claude Code < v2.1.132 could write it: tool-error
    // truncation left \uD83D (a high surrogate) with no low surrogate after
    // it. serde_json alone rejects the line; parse_entry has to accept it.
    let raw = br#"{"type":"user","uuid":"emoji-lone-high","timestamp":"2026-05-01T10:00:00Z","message":{"role":"user","content":"truncated emoji: \uD83D end"}}"#;
    let Some(parsed) = parse_entry(raw) else {
        panic!("parse_entry must succeed despite lone high surrogate");
    };
    assert_eq!(parsed.uuid, "emoji-lone-high");
    assert_eq!(parsed.entry_type, "user");
}
642+
643+
#[test]
fn parse_entry_with_lone_low_surrogate_succeeds() {
    // \uDC36 is a low surrogate with no high surrogate in front of it.
    let raw = br#"{"type":"user","uuid":"emoji-lone-low","timestamp":"2026-05-01T10:00:00Z","message":{"role":"user","content":"lone low: \uDC36"}}"#;
    let Some(parsed) = parse_entry(raw) else {
        panic!("parse_entry must succeed despite lone low surrogate");
    };
    assert_eq!(parsed.uuid, "emoji-lone-low");
}
654+
655+
#[test]
fn parse_entry_with_valid_surrogate_pair_succeeds() {
    // \uD83D\uDC36 (dog face) is a well-formed pair and has to keep parsing.
    let raw = br#"{"type":"user","uuid":"emoji-valid-pair","timestamp":"2026-05-01T10:00:00Z","message":{"role":"user","content":"dog: \uD83D\uDC36"}}"#;
    let Some(parsed) = parse_entry(raw) else {
        panic!("parse_entry must succeed with a valid surrogate pair");
    };
    assert_eq!(parsed.uuid, "emoji-valid-pair");
}
462666
}

0 commit comments

Comments
 (0)