|
1 | 1 | use serde::Deserialize; |
2 | 2 | use serde_json::Value; |
| 3 | +use std::borrow::Cow; |
3 | 4 | use std::collections::HashMap; |
4 | 5 |
|
5 | 6 | /// Deserializes a JSON string field, treating `null` as the type's default |
@@ -147,12 +148,85 @@ impl Entry { |
147 | 148 | } |
148 | 149 | } |
149 | 150 |
|
/// Parses the first four bytes of `bytes` as ASCII hex digits into a `u16`.
///
/// Returns `None` if fewer than four bytes are supplied or if any of the first
/// four bytes is not an ASCII hex digit (`0-9`, `a-f`, `A-F`). Bytes past the
/// fourth are ignored.
fn hex4_to_u16(bytes: &[u8]) -> Option<u16> {
    // Digits are decoded by hand rather than via `u16::from_str_radix`, which
    // also accepts a leading `+` sign -- not valid inside a `\uXXXX` escape.
    let value = bytes
        .get(..4)?
        .iter()
        .try_fold(0u32, |acc, &b| char::from(b).to_digit(16).map(|d| (acc << 4) | d))?;
    u16::try_from(value).ok()
}

/// Replaces lone UTF-16 surrogates (U+D800–U+DFFF) in JSON `\uXXXX` escape
/// sequences with the escape for the Unicode replacement character, `\uFFFD`.
///
/// JSONL files written by Claude Code before v2.1.132 may contain lone
/// surrogates when the tool-error truncation logic split a multi-byte emoji at
/// an offset boundary. serde_json rejects lone surrogates per RFC 8259; this
/// pass makes such lines parseable before they reach the deserializer.
///
/// A high surrogate escape is kept only when it is immediately followed by a
/// low surrogate escape; every other surrogate escape is replaced. Any other
/// backslash escape (including `\\`) is consumed as a unit, so the literal
/// text `\\uD83D` -- an escaped backslash followed by plain characters -- is
/// left untouched.
///
/// Returns `Cow::Borrowed(s)` when nothing had to be replaced, so the common
/// case performs no allocation.
fn sanitize_lone_surrogates(s: &str) -> Cow<'_, str> {
    /// Byte length of a `\uXXXX` escape.
    const ESCAPE_LEN: usize = 6;
    /// Same length as the escape it replaces, so output length equals input length.
    const REPLACEMENT: &str = "\\uFFFD";

    /// Decodes the `\uXXXX` escape starting at `pos`, if there is one.
    fn escape_unit_at(bytes: &[u8], pos: usize) -> Option<u16> {
        match bytes.get(pos..)? {
            [b'\\', b'u', hex @ ..] => hex4_to_u16(hex),
            _ => None,
        }
    }

    let bytes = s.as_bytes();
    let len = bytes.len();
    // Allocation is delayed until the first lone surrogate is found.
    let mut out: Option<String> = None;
    // Start of the input range that has not been copied into `out` yet.
    let mut copied = 0;
    let mut i = 0;

    while i < len {
        if bytes[i] != b'\\' {
            i += 1;
            continue;
        }

        let lone = match escape_unit_at(bytes, i) {
            Some(0xD800..=0xDBFF) => {
                // High surrogate: valid only when a low surrogate escape follows.
                if matches!(
                    escape_unit_at(bytes, i + ESCAPE_LEN),
                    Some(0xDC00..=0xDFFF)
                ) {
                    i += 2 * ESCAPE_LEN;
                    continue;
                }
                true
            }
            // Low surrogate that was not consumed as part of a pair.
            Some(0xDC00..=0xDFFF) => true,
            _ => false,
        };

        if lone {
            // `i` sits on an ASCII backslash and `copied` just past an ASCII
            // escape, so both are char boundaries and the slice cannot panic.
            let buf = out.get_or_insert_with(|| String::with_capacity(len));
            buf.push_str(&s[copied..i]);
            buf.push_str(REPLACEMENT);
            i += ESCAPE_LEN;
            copied = i;
        } else {
            // Some other escape: skip the backslash together with the byte it
            // escapes so that the second half of `\\` never starts a new escape.
            i += 2;
        }
    }

    match out {
        None => Cow::Borrowed(s),
        Some(mut buf) => {
            buf.push_str(&s[copied..]);
            Cow::Owned(buf)
        }
    }
}
| 221 | + |
150 | 222 | /// Parse a single JSONL line into an Entry. |
151 | 223 | /// Returns None if the JSON is invalid, the entry has no UUID, or the entry |
152 | 224 | /// has no type (guards against empty entries written by async PostToolUse |
153 | 225 | /// hooks in Claude Code pre-v2.1.119). |
154 | 226 | pub fn parse_entry(line: &[u8]) -> Option<Entry> { |
155 | | - let e: Entry = serde_json::from_slice(line).ok()?; |
| 227 | + let s = std::str::from_utf8(line).ok()?; |
| 228 | + let sanitized = sanitize_lone_surrogates(s); |
| 229 | + let e: Entry = serde_json::from_str(&sanitized).ok()?; |
156 | 230 | if (e.uuid.is_empty() && e.leaf_uuid.is_empty()) || e.entry_type.is_empty() { |
157 | 231 | return None; |
158 | 232 | } |
@@ -459,4 +533,134 @@ mod tests { |
459 | 533 | ); |
460 | 534 | assert!(!entry.is_compact_summary); |
461 | 535 | } |
| 536 | + |
| 537 | + // --- Issue #85: lone UTF-16 surrogate sanitization --- |
| 538 | + |
#[test]
fn sanitize_lone_surrogates_no_surrogates_returns_borrowed() {
    // Input without any `\uXXXX` surrogate escape must pass through untouched.
    let input = r#"{\"key\":\"hello world\"}"#;
    let sanitized = sanitize_lone_surrogates(input);
    let is_borrowed = match &sanitized {
        Cow::Borrowed(_) => true,
        Cow::Owned(_) => false,
    };
    assert!(
        is_borrowed,
        "no surrogates -- must return borrowed (no allocation)"
    );
    assert_eq!(&*sanitized, input);
}
| 549 | + |
#[test]
fn sanitize_lone_high_surrogate_replaced_with_fffd() {
    // `\uD83D` is a high surrogate with no `\uDCxx` escape after it, so the
    // sanitizer must swap it for the literal six-character escape `\uFFFD`.
    let input = r#"{\"key\":\"emoji \uD83D truncated\"}"#;
    let sanitized = sanitize_lone_surrogates(input);
    let text: &str = &sanitized;

    let has_replacement = text.contains(r"\uFFFD");
    assert!(has_replacement, "replacement escape must be present");

    let still_has_surrogate = text.contains(r"\uD83D");
    assert!(!still_has_surrogate, "lone surrogate must be removed");
}
| 565 | + |
#[test]
fn sanitize_lone_low_surrogate_replaced_with_fffd() {
    // `\uDC36` is a low surrogate with no high surrogate escape before it.
    let input = r#"{\"key\":\"broken \uDC36 emoji\"}"#;
    let sanitized = sanitize_lone_surrogates(input);
    let text: &str = &sanitized;

    let has_replacement = text.contains(r"\uFFFD");
    assert!(has_replacement, "replacement escape must be present");

    let still_has_surrogate = text.contains(r"\uDC36");
    assert!(!still_has_surrogate, "lone surrogate must be removed");
}
| 580 | + |
#[test]
fn sanitize_valid_surrogate_pair_unchanged() {
    // `\uD83D\uDC36` is a well-formed high/low pair (dog face emoji) and must
    // come back exactly as it went in.
    let input = r#"{\"key\":\"dog \uD83D\uDC36 emoji\"}"#;
    let sanitized = sanitize_lone_surrogates(input);
    let is_borrowed = match &sanitized {
        Cow::Borrowed(_) => true,
        Cow::Owned(_) => false,
    };
    assert!(
        is_borrowed,
        "valid pair must return borrowed (no modification)"
    );
    assert_eq!(&*sanitized, input);
}
| 592 | + |
#[test]
fn sanitize_multiple_lone_surrogates_all_replaced() {
    // One lone high surrogate and one lone low surrogate in separate values.
    let input = r#"{\"a\":\"\uD83D\",\"b\":\"\uDC00\"}"#;
    let sanitized = sanitize_lone_surrogates(input);
    let text: &str = &sanitized;

    let high_remains = text.contains(r"\uD83D");
    assert!(!high_remains, "first lone surrogate must be removed");

    let low_remains = text.contains(r"\uDC00");
    assert!(!low_remains, "second lone surrogate must be removed");

    let replacements = text.matches(r"\uFFFD").count();
    assert_eq!(
        replacements, 2,
        "both lone surrogates must be replaced with \\uFFFD"
    );
}
| 611 | + |
#[test]
fn sanitize_high_surrogate_at_end_of_string_replaced() {
    // The high surrogate is the last escape in the value: too few bytes remain
    // for a low surrogate escape, so it has to be treated as lone.
    let input = r#"\"\uD83D\""#;
    let sanitized = sanitize_lone_surrogates(input);
    let text: &str = &sanitized;

    let has_replacement = text.contains(r"\uFFFD");
    assert!(has_replacement, "replacement escape must be present");

    let still_has_surrogate = text.contains(r"\uD83D");
    assert!(!still_has_surrogate, "lone surrogate must be removed");
}
| 626 | + |
#[test]
fn parse_entry_with_lone_high_surrogate_succeeds() {
    // Mirrors a JSONL line from Claude Code < v2.1.132 in which tool-error
    // truncation left a high surrogate `\uD83D` with no low surrogate after it.
    // serde_json would reject the raw line; parse_entry must still accept it.
    let raw = r#"{"type":"user","uuid":"emoji-lone-high","timestamp":"2026-05-01T10:00:00Z","message":{"role":"user","content":"truncated emoji: \uD83D end"}}"#;
    let Some(parsed) = parse_entry(raw.as_bytes()) else {
        panic!("parse_entry must succeed despite lone high surrogate");
    };
    assert_eq!(parsed.uuid, "emoji-lone-high");
    assert_eq!(parsed.entry_type, "user");
}
| 642 | + |
#[test]
fn parse_entry_with_lone_low_surrogate_succeeds() {
    // A low surrogate `\uDC36` that has no high surrogate in front of it.
    let raw = r#"{"type":"user","uuid":"emoji-lone-low","timestamp":"2026-05-01T10:00:00Z","message":{"role":"user","content":"lone low: \uDC36"}}"#;
    let Some(parsed) = parse_entry(raw.as_bytes()) else {
        panic!("parse_entry must succeed despite lone low surrogate");
    };
    assert_eq!(parsed.uuid, "emoji-lone-low");
}
| 654 | + |
#[test]
fn parse_entry_with_valid_surrogate_pair_succeeds() {
    // A well-formed pair `\uD83D\uDC36` (dog face) must keep parsing.
    let raw = r#"{"type":"user","uuid":"emoji-valid-pair","timestamp":"2026-05-01T10:00:00Z","message":{"role":"user","content":"dog: \uD83D\uDC36"}}"#;
    let Some(parsed) = parse_entry(raw.as_bytes()) else {
        panic!("parse_entry must succeed with a valid surrogate pair");
    };
    assert_eq!(parsed.uuid, "emoji-valid-pair");
}
462 | 666 | } |
0 commit comments