|
| 1 | +//! Stateful streaming UTF-8 decoder. |
| 2 | +//! |
| 3 | +//! PTY reads can land in the middle of a multi-byte UTF-8 codepoint, so a |
| 4 | +//! naïve `String::from_utf8_lossy` on each chunk replaces partial sequences |
| 5 | +//! with `U+FFFD` even though the next chunk would complete the codepoint. |
| 6 | +//! `Utf8StreamDecoder` keeps any trailing incomplete byte sequence buffered |
| 7 | +//! across `decode` calls and only substitutes `U+FFFD` for byte sequences |
| 8 | +//! that are definitively invalid. |
| 9 | +
|
/// Streaming UTF-8 decoder that preserves codepoints split across byte
/// chunks.
///
/// Bytes that form an incomplete trailing sequence are held back by
/// [`Utf8StreamDecoder::decode`] and re-examined together with the next
/// chunk; [`Utf8StreamDecoder::flush`] gives up on whatever is still held.
#[derive(Debug, Default)]
pub(crate) struct Utf8StreamDecoder {
    /// Bytes received but not yet emitted: the incomplete tail left over
    /// from the previous `decode` call, or empty.
    pending: Vec<u8>,
}

impl Utf8StreamDecoder {
    /// Create a decoder with nothing buffered.
    pub(crate) fn new() -> Self {
        Self::default()
    }

    /// Decode an incoming byte chunk, returning all complete UTF-8 text
    /// available. Any trailing bytes that form an incomplete codepoint are
    /// retained for the next call.
    ///
    /// Byte sequences that are definitively invalid are replaced with
    /// `U+FFFD`, one replacement per invalid sequence reported by
    /// `std::str::from_utf8`.
    pub(crate) fn decode(&mut self, bytes: &[u8]) -> String {
        if bytes.is_empty() && self.pending.is_empty() {
            return String::new();
        }

        let mut output = String::with_capacity(self.pending.len() + bytes.len());
        if self.pending.is_empty() {
            // Nothing carried over: decode straight from the caller's slice
            // and buffer only the incomplete tail, avoiding a copy of the
            // whole chunk into `pending`.
            let consumed = Self::decode_prefix(bytes, &mut output);
            self.pending.extend_from_slice(&bytes[consumed..]);
        } else {
            // The carried-over bytes must be decoded contiguously with the
            // new chunk, so join them first.
            self.pending.extend_from_slice(bytes);
            let consumed = Self::decode_prefix(&self.pending, &mut output);
            self.pending.drain(..consumed);
        }
        output
    }

    /// Drain any remaining buffered bytes, emitting `U+FFFD` for each
    /// incomplete sequence. Call once no more bytes will arrive.
    ///
    /// The decoder is left empty, so a second `flush` returns an empty
    /// string.
    pub(crate) fn flush(&mut self) -> String {
        if self.pending.is_empty() {
            return String::new();
        }
        let bytes = std::mem::take(&mut self.pending);
        let mut output = String::with_capacity(bytes.len());
        let consumed = Self::decode_prefix(&bytes, &mut output);
        if consumed < bytes.len() {
            // The stream ended mid-codepoint: the whole unfinished tail
            // becomes a single replacement character.
            output.push('\u{FFFD}');
        }
        output
    }

    /// Append the decoding of `bytes` to `output` and return how many bytes
    /// were consumed.
    ///
    /// Valid text is copied through and every definitively invalid sequence
    /// becomes one `U+FFFD`. Scanning stops early only at an incomplete
    /// sequence at the end of `bytes`; in that case the return value is the
    /// offset where that sequence starts and nothing is emitted for it.
    /// Otherwise the return value is `bytes.len()`.
    fn decode_prefix(bytes: &[u8], output: &mut String) -> usize {
        let mut cursor = 0;
        while cursor < bytes.len() {
            let rest = &bytes[cursor..];
            let err = match std::str::from_utf8(rest) {
                Ok(text) => {
                    output.push_str(text);
                    return bytes.len();
                }
                Err(err) => err,
            };

            // Everything before the error offset was already validated by
            // the failed call above, so this cannot fail.
            let valid_len = err.valid_up_to();
            let valid = std::str::from_utf8(&rest[..valid_len])
                .expect("valid_up_to slice must be valid UTF-8");
            output.push_str(valid);
            cursor += valid_len;

            match err.error_len() {
                Some(invalid_len) => {
                    output.push('\u{FFFD}');
                    cursor += invalid_len;
                }
                // Incomplete sequence at the end of the input: leave it for
                // the caller to buffer or replace.
                None => break,
            }
        }
        cursor
    }
}
| 107 | + |
#[cfg(test)]
mod tests {
    use super::*;

    /// Feed `original` to a fresh decoder in two chunks, once for every
    /// interior byte boundary, and require the concatenated output to equal
    /// `original` with nothing left buffered afterwards.
    fn assert_survives_every_split(original: &str) {
        let bytes = original.as_bytes();
        for split in 1..bytes.len() {
            let mut dec = Utf8StreamDecoder::new();
            let mut combined = dec.decode(&bytes[..split]);
            combined.push_str(&dec.decode(&bytes[split..]));
            assert_eq!(
                combined, original,
                "split at {split} should preserve the original codepoint"
            );
            assert_eq!(dec.flush(), "", "split at {split} left bytes buffered");
        }
    }

    /// Decode `input` one byte per `decode` call, then flush.
    fn decode_byte_by_byte(input: &[u8]) -> String {
        let mut dec = Utf8StreamDecoder::new();
        let mut out = String::new();
        for b in input {
            out.push_str(&dec.decode(&[*b]));
        }
        out.push_str(&dec.flush());
        out
    }

    #[test]
    fn decodes_ascii_passthrough() {
        let mut dec = Utf8StreamDecoder::new();
        assert_eq!(dec.decode(b"hello"), "hello");
        assert_eq!(dec.decode(b" world"), " world");
    }

    #[test]
    fn box_drawing_split_across_two_chunks() {
        // U+2500 BOX DRAWINGS LIGHT HORIZONTAL = E2 94 80
        let mut dec = Utf8StreamDecoder::new();
        let first = dec.decode(&[0xE2]);
        let second = dec.decode(&[0x94, 0x80]);
        assert_eq!(first, "");
        assert_eq!(second, "\u{2500}");
    }

    #[test]
    fn box_drawing_split_at_every_byte_boundary() {
        // U+2588 FULL BLOCK = E2 96 88
        assert_survives_every_split("\u{2588}");
    }

    #[test]
    fn cjk_glyph_split_across_chunks() {
        // U+4E2D 中 = E4 B8 AD
        assert_survives_every_split("\u{4E2D}");
    }

    #[test]
    fn four_byte_emoji_split_at_every_boundary() {
        // U+1F600 😀 = F0 9F 98 80
        assert_survives_every_split("\u{1F600}");
    }

    #[test]
    fn byte_by_byte_streaming() {
        let original = "héllo 世界 😀 ─";
        assert_eq!(decode_byte_by_byte(original.as_bytes()), original);
    }

    #[test]
    fn invalid_byte_in_middle_is_replaced() {
        let mut dec = Utf8StreamDecoder::new();
        // 'A' (0x41) + invalid lone 0xFF + 'B' (0x42)
        let out = dec.decode(&[0x41, 0xFF, 0x42]);
        assert_eq!(out, "A\u{FFFD}B");
    }

    #[test]
    fn invalid_continuation_after_valid_lead_is_replaced() {
        let mut dec = Utf8StreamDecoder::new();
        // E2 is a valid 3-byte lead, but 0x41 ('A') is not a continuation
        // byte, so the sequence is definitively broken: E2 alone becomes one
        // replacement character and 'A' decodes normally.
        let out = dec.decode(&[0xE2, 0x41]);
        assert_eq!(out, "\u{FFFD}A");
        // Nothing may be held back once the sequence is known to be invalid.
        assert_eq!(dec.flush(), "");
    }

    #[test]
    fn buffered_lead_invalidated_by_next_chunk() {
        let mut dec = Utf8StreamDecoder::new();
        // E2 alone could still become a valid codepoint, so it is held.
        assert_eq!(dec.decode(&[0xE2]), "");
        // The next chunk proves it cannot: replace it and carry on.
        assert_eq!(dec.decode(&[0x41]), "\u{FFFD}A");
        assert_eq!(dec.flush(), "");
    }

    #[test]
    fn flush_emits_replacement_for_truncated_tail() {
        let mut dec = Utf8StreamDecoder::new();
        // E2 alone is incomplete — held in buffer, no output yet.
        assert_eq!(dec.decode(&[0xE2]), "");
        // Flush should emit one replacement character since stream ended mid-codepoint.
        assert_eq!(dec.flush(), "\u{FFFD}");
        // Subsequent flush is empty.
        assert_eq!(dec.flush(), "");
    }

    #[test]
    fn empty_input_is_handled() {
        let mut dec = Utf8StreamDecoder::new();
        assert_eq!(dec.decode(&[]), "");
        assert_eq!(dec.flush(), "");
    }

    #[test]
    fn empty_chunk_keeps_buffered_bytes() {
        let mut dec = Utf8StreamDecoder::new();
        assert_eq!(dec.decode(&[0xE2, 0x94]), "");
        // An empty read must neither emit nor discard the held prefix.
        assert_eq!(dec.decode(&[]), "");
        assert_eq!(dec.decode(&[0x80]), "\u{2500}");
    }

    #[test]
    fn multiple_incomplete_chunks_combine() {
        // Send E2 94 80 (U+2500) byte-by-byte.
        let mut dec = Utf8StreamDecoder::new();
        assert_eq!(dec.decode(&[0xE2]), "");
        assert_eq!(dec.decode(&[0x94]), "");
        assert_eq!(dec.decode(&[0x80]), "\u{2500}");
    }

    #[test]
    fn matches_from_utf8_lossy_for_complete_input() {
        let inputs: &[&[u8]] = &[
            b"plain ascii",
            "héllo".as_bytes(),
            "中文 box ─ end".as_bytes(),
            "emoji 😀 done".as_bytes(),
        ];
        for input in inputs {
            let mut dec = Utf8StreamDecoder::new();
            let mut out = dec.decode(input);
            out.push_str(&dec.flush());
            assert_eq!(out, String::from_utf8_lossy(input));
        }
    }

    #[test]
    fn matches_from_utf8_lossy_for_invalid_input() {
        let inputs: &[&[u8]] = &[
            // Lone invalid byte between ASCII.
            &[0x41, 0xFF, 0x42],
            // Valid lead followed by a non-continuation byte.
            &[0xE2, 0x41],
            // Three bytes of a 4-byte sequence, then ASCII.
            &[0xF0, 0x9F, 0x98, 0x41],
            // Stream that ends mid-codepoint.
            &[b'a', 0xF0, 0x9F],
        ];
        for input in inputs {
            let expected = String::from_utf8_lossy(input);

            // Whole input in one chunk.
            let mut dec = Utf8StreamDecoder::new();
            let mut out = dec.decode(input);
            out.push_str(&dec.flush());
            assert_eq!(out, expected, "single chunk: {input:?}");

            // Same input delivered one byte at a time.
            assert_eq!(decode_byte_by_byte(input), expected, "byte by byte: {input:?}");
        }
    }
}
0 commit comments