|
1 | 1 | use serde::Deserialize; |
2 | 2 | use serde_json::Value; |
| 3 | +use std::borrow::Cow; |
3 | 4 | use std::collections::HashMap; |
4 | 5 |
|
5 | 6 | /// Deserializes a JSON string field, treating `null` as the type's default |
@@ -155,12 +156,85 @@ impl Entry { |
155 | 156 | } |
156 | 157 | } |
157 | 158 |
|
/// Parses exactly four ASCII hex digits from the start of `bytes` into a `u16`.
///
/// Returns `None` when fewer than four bytes are supplied or when any of the
/// first four bytes is not an ASCII hex digit (`0-9`, `a-f`, `A-F`). Bytes
/// after the fourth are ignored.
///
/// The digits are validated one by one instead of going through
/// `u16::from_str_radix`, because that parser also accepts a leading `+` sign
/// and would treat input such as `+123` as a valid value.
fn hex4_to_u16(bytes: &[u8]) -> Option<u16> {
    bytes.get(..4)?.iter().try_fold(0u16, |acc, &b| {
        // `to_digit(16)` yields 0..=15 for hex digits and `None` otherwise
        // (including every non-ASCII byte).
        let nibble = u16::try_from(char::from(b).to_digit(16)?).ok()?;
        // Four nibbles fill a u16 exactly, so the shift never drops set bits.
        Some((acc << 4) | nibble)
    })
}
| 168 | + |
| 169 | +/// Replaces lone UTF-16 surrogates (U+D800–U+DFFF) in JSON `\uXXXX` escape |
| 170 | +/// sequences with the Unicode replacement character U+FFFD. JSONL files |
| 171 | +/// written by Claude Code before v2.1.132 may contain lone surrogates when |
| 172 | +/// the tool-error truncation logic split a multi-byte emoji at an offset |
| 173 | +/// boundary. serde_json rejects lone surrogates per RFC 8259; this pass makes |
| 174 | +/// such lines parseable before they reach the deserializer. |
| 175 | +fn sanitize_lone_surrogates(s: &str) -> Cow<str> { |
| 176 | + let bytes = s.as_bytes(); |
| 177 | + let len = bytes.len(); |
| 178 | + let mut i = 0; |
| 179 | + // Delay allocation until the first lone surrogate is found. |
| 180 | + let mut result: Option<Vec<u8>> = None; |
| 181 | + |
| 182 | + while i < len { |
| 183 | + // Match \uXXXX (6 bytes: backslash, u, 4 hex digits). |
| 184 | + if bytes[i] == b'\\' && i + 5 < len && bytes[i + 1] == b'u' { |
| 185 | + if let Some(cp) = hex4_to_u16(&bytes[i + 2..i + 6]) { |
| 186 | + if (0xD800..=0xDBFF).contains(&cp) { |
| 187 | + // High surrogate — valid only when immediately followed by \uDCxx–\uDFxx. |
| 188 | + let is_valid_pair = i + 11 < len |
| 189 | + && bytes[i + 6] == b'\\' |
| 190 | + && bytes[i + 7] == b'u' |
| 191 | + && hex4_to_u16(&bytes[i + 8..i + 12]) |
| 192 | + .is_some_and(|c| (0xDC00..=0xDFFF).contains(&c)); |
| 193 | + if is_valid_pair { |
| 194 | + if let Some(ref mut buf) = result { |
| 195 | + buf.extend_from_slice(&bytes[i..i + 12]); |
| 196 | + } |
| 197 | + i += 12; |
| 198 | + } else { |
| 199 | + result |
| 200 | + .get_or_insert_with(|| bytes[..i].to_vec()) |
| 201 | + .extend_from_slice(b"\\uFFFD"); |
| 202 | + i += 6; |
| 203 | + } |
| 204 | + continue; |
| 205 | + } else if (0xDC00..=0xDFFF).contains(&cp) { |
| 206 | + // Lone low surrogate. |
| 207 | + result |
| 208 | + .get_or_insert_with(|| bytes[..i].to_vec()) |
| 209 | + .extend_from_slice(b"\\uFFFD"); |
| 210 | + i += 6; |
| 211 | + continue; |
| 212 | + } |
| 213 | + } |
| 214 | + } |
| 215 | + if let Some(ref mut buf) = result { |
| 216 | + buf.push(bytes[i]); |
| 217 | + } |
| 218 | + i += 1; |
| 219 | + } |
| 220 | + |
| 221 | + match result { |
| 222 | + None => Cow::Borrowed(s), |
| 223 | + Some(buf) => Cow::Owned( |
| 224 | + String::from_utf8(buf) |
| 225 | + .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned()), |
| 226 | + ), |
| 227 | + } |
| 228 | +} |
| 229 | + |
158 | 230 | /// Parse a single JSONL line into an Entry. |
159 | 231 | /// Returns None if the JSON is invalid, the entry has no UUID, or the entry |
160 | 232 | /// has no type (guards against empty entries written by async PostToolUse |
161 | 233 | /// hooks in Claude Code pre-v2.1.119). |
162 | 234 | pub fn parse_entry(line: &[u8]) -> Option<Entry> { |
163 | | - let e: Entry = serde_json::from_slice(line).ok()?; |
| 235 | + let s = std::str::from_utf8(line).ok()?; |
| 236 | + let sanitized = sanitize_lone_surrogates(s); |
| 237 | + let e: Entry = serde_json::from_str(&sanitized).ok()?; |
164 | 238 | if (e.uuid.is_empty() && e.leaf_uuid.is_empty()) || e.entry_type.is_empty() { |
165 | 239 | return None; |
166 | 240 | } |
@@ -547,6 +621,136 @@ mod tests { |
547 | 621 | ); |
548 | 622 | } |
549 | 623 |
|
| 624 | + // --- Issue #85: lone UTF-16 surrogate sanitization --- |
| 625 | + |
| 626 | + #[test] |
| 627 | + fn sanitize_lone_surrogates_no_surrogates_returns_borrowed() { |
| 628 | + let s = r#"{\"key\":\"hello world\"}"#; |
| 629 | + let result = sanitize_lone_surrogates(s); |
| 630 | + assert!( |
| 631 | + matches!(result, Cow::Borrowed(_)), |
| 632 | + "no surrogates -- must return borrowed (no allocation)" |
| 633 | + ); |
| 634 | + assert_eq!(result.as_ref(), s); |
| 635 | + } |
| 636 | + |
| 637 | + #[test] |
| 638 | + fn sanitize_lone_high_surrogate_replaced_with_fffd() { |
| 639 | + // \uD83D is a lone high surrogate (no following \uDCxx). |
| 640 | + // The function outputs the literal JSON escape \uFFFD (6 ASCII chars). |
| 641 | + let s = r#"{\"key\":\"emoji \uD83D truncated\"}"#; |
| 642 | + let result = sanitize_lone_surrogates(s); |
| 643 | + assert!( |
| 644 | + result.as_ref().contains(r"\uFFFD"), |
| 645 | + "replacement escape must be present" |
| 646 | + ); |
| 647 | + assert!( |
| 648 | + !result.as_ref().contains(r"\uD83D"), |
| 649 | + "lone surrogate must be removed" |
| 650 | + ); |
| 651 | + } |
| 652 | + |
| 653 | + #[test] |
| 654 | + fn sanitize_lone_low_surrogate_replaced_with_fffd() { |
| 655 | + // \uDC36 is a lone low surrogate (not preceded by a high surrogate). |
| 656 | + let s = r#"{\"key\":\"broken \uDC36 emoji\"}"#; |
| 657 | + let result = sanitize_lone_surrogates(s); |
| 658 | + assert!( |
| 659 | + result.as_ref().contains(r"\uFFFD"), |
| 660 | + "replacement escape must be present" |
| 661 | + ); |
| 662 | + assert!( |
| 663 | + !result.as_ref().contains(r"\uDC36"), |
| 664 | + "lone surrogate must be removed" |
| 665 | + ); |
| 666 | + } |
| 667 | + |
| 668 | + #[test] |
| 669 | + fn sanitize_valid_surrogate_pair_unchanged() { |
| 670 | + // \uD83D\uDC36 is a valid surrogate pair (dog face emoji). |
| 671 | + let s = r#"{\"key\":\"dog \uD83D\uDC36 emoji\"}"#; |
| 672 | + let result = sanitize_lone_surrogates(s); |
| 673 | + assert!( |
| 674 | + matches!(result, Cow::Borrowed(_)), |
| 675 | + "valid pair must return borrowed (no modification)" |
| 676 | + ); |
| 677 | + assert_eq!(result.as_ref(), s); |
| 678 | + } |
| 679 | + |
| 680 | + #[test] |
| 681 | + fn sanitize_multiple_lone_surrogates_all_replaced() { |
| 682 | + let s = r#"{\"a\":\"\uD83D\",\"b\":\"\uDC00\"}"#; |
| 683 | + let result = sanitize_lone_surrogates(s); |
| 684 | + assert!( |
| 685 | + !result.as_ref().contains(r"\uD83D"), |
| 686 | + "first lone surrogate must be removed" |
| 687 | + ); |
| 688 | + assert!( |
| 689 | + !result.as_ref().contains(r"\uDC00"), |
| 690 | + "second lone surrogate must be removed" |
| 691 | + ); |
| 692 | + let fffd_count = result.as_ref().match_indices(r"\uFFFD").count(); |
| 693 | + assert_eq!( |
| 694 | + fffd_count, 2, |
| 695 | + "both lone surrogates must be replaced with \\uFFFD" |
| 696 | + ); |
| 697 | + } |
| 698 | + |
| 699 | + #[test] |
| 700 | + fn sanitize_high_surrogate_at_end_of_string_replaced() { |
| 701 | + // \uD83D at end of value -- no room for a low surrogate, must be replaced. |
| 702 | + let s = r#"\"\uD83D\""#; |
| 703 | + let result = sanitize_lone_surrogates(s); |
| 704 | + assert!( |
| 705 | + result.as_ref().contains(r"\uFFFD"), |
| 706 | + "replacement escape must be present" |
| 707 | + ); |
| 708 | + assert!( |
| 709 | + !result.as_ref().contains(r"\uD83D"), |
| 710 | + "lone surrogate must be removed" |
| 711 | + ); |
| 712 | + } |
| 713 | + |
| 714 | + #[test] |
| 715 | + fn parse_entry_with_lone_high_surrogate_succeeds() { |
| 716 | + // Simulates a JSONL line from Claude Code < v2.1.132 where tool error |
| 717 | + // truncation left a lone \uD83D (high surrogate, no low surrogate follows). |
| 718 | + // serde_json rejects this without sanitization; parse_entry must succeed. |
| 719 | + let line = r#"{"type":"user","uuid":"emoji-lone-high","timestamp":"2026-05-01T10:00:00Z","message":{"role":"user","content":"truncated emoji: \uD83D end"}}"#.as_bytes(); |
| 720 | + let entry = parse_entry(line); |
| 721 | + assert!( |
| 722 | + entry.is_some(), |
| 723 | + "parse_entry must succeed despite lone high surrogate" |
| 724 | + ); |
| 725 | + let e = entry.unwrap(); |
| 726 | + assert_eq!(e.uuid, "emoji-lone-high"); |
| 727 | + assert_eq!(e.entry_type, "user"); |
| 728 | + } |
| 729 | + |
| 730 | + #[test] |
| 731 | + fn parse_entry_with_lone_low_surrogate_succeeds() { |
| 732 | + // Lone low surrogate \uDC36 without a preceding high surrogate. |
| 733 | + let line = r#"{"type":"user","uuid":"emoji-lone-low","timestamp":"2026-05-01T10:00:00Z","message":{"role":"user","content":"lone low: \uDC36"}}"#.as_bytes(); |
| 734 | + let entry = parse_entry(line); |
| 735 | + assert!( |
| 736 | + entry.is_some(), |
| 737 | + "parse_entry must succeed despite lone low surrogate" |
| 738 | + ); |
| 739 | + assert_eq!(entry.unwrap().uuid, "emoji-lone-low"); |
| 740 | + } |
| 741 | + |
| 742 | + #[test] |
| 743 | + fn parse_entry_with_valid_surrogate_pair_succeeds() { |
| 744 | + // Valid surrogate pair \uD83D\uDC36 (dog face) must parse successfully. |
| 745 | + let line = r#"{"type":"user","uuid":"emoji-valid-pair","timestamp":"2026-05-01T10:00:00Z","message":{"role":"user","content":"dog: \uD83D\uDC36"}}"#.as_bytes(); |
| 746 | + let entry = parse_entry(line); |
| 747 | + assert!( |
| 748 | + entry.is_some(), |
| 749 | + "parse_entry must succeed with a valid surrogate pair" |
| 750 | + ); |
| 751 | + assert_eq!(entry.unwrap().uuid, "emoji-valid-pair"); |
| 752 | + } |
| 753 | + |
550 | 754 | #[test] |
551 | 755 | fn parse_entry_unknown_fields_are_silently_ignored() { |
552 | 756 | // Future Claude Code versions may add more fields. The parser must never crash on |
|
0 commit comments