fix(shuffle): tolerate non-UTF-8 bytes in get_string (lossy decode) (#4524)

schenksj · claude · andygrove · web-flow · commit 6ea6b1dcf6f9 · 2026-06-19T16:21:06.000-06:00
* fix(shuffle): get_string tolerates non-UTF-8 bytes (lossy decode) Spark's UnsafeRow.getUTF8String performs no UTF-8 validation, and cast(BinaryType -> StringType) is a zero-copy reinterpret, so a StringType column can legitimately hold arbitrary non-UTF-8 bytes. get_string decoded with from_utf8(..).unwrap(), which panics on such rows even though Spark treats them as opaque. Use from_utf8_lossy (returning Cow<str>): a zero-cost borrow for valid UTF-8 and a String with U+FFFD replacements otherwise -- defined behavior, no UB. Avoids from_utf8_unchecked, which would construct a &str from arbitrary bytes (UB) and propagate into downstream Arrow ops. Adds a standalone unit test that panics without the fix and passes with it. Closes #4521 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> * test: add end-to-end shuffle test for non-UTF-8 StringType bytes (#4521) Address review feedback: add a Spark-level regression test demonstrating the bug. cast(binary -> string) is a zero-copy reinterpret in Spark, so a StringType column can hold arbitrary non-UTF-8 bytes. The test disables Comet's Cast so those raw bytes reach Comet's columnar (JVM) shuffle inside a JVM UnsafeRow, exercising the native row->Arrow get_string path that used to panic via from_utf8(..).unwrap() and now decodes lossily. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * fix(shuffle): match JVM U+FFFD granularity in get_string decode Replace `String::from_utf8_lossy` in `get_string` with `decode_utf8_spark_lossy`, which mirrors `sun.nio.cs.UTF_8.Decoder` (action REPLACE) byte-for-byte so a Comet columnar shuffle of arbitrary bytes renders identically to a Spark JVM shuffle. `from_utf8_lossy` follows the Unicode "maximal subpart" rule and can emit more than one U+FFFD per ill-formed multi-byte unit; the JDK collapses certain units (notably surrogate-range three-byte sequences `ED A0..BF ..`, e.g. CESU-8 / modified-UTF-8 supplementary chars) into a single U+FFFD. 
Valid UTF-8 still returns a zero-cost borrow via the fast path. Tests use JDK-17 `new String(bytes, UTF_8)` output as the oracle: a 7-case replacement-granularity table (incl. the `ED A0 80` -> single U+FFFD parity case), zero-copy borrow for valid UTF-8, and valid multibyte chars preserved around an invalid byte. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * docs(shuffle): frame JDK UTF-8 decoder parity as behavioral, note provenance Address ASF-provenance review feedback on decode_utf8_spark_lossy: reword the doc comment so it describes the *observable* replacement behavior of the JDK UTF-8 decoder rather than saying the per-class malformed lengths "mirror sun.nio.cs.UTF_8.Decoder" (which implies derivation from that class). State that they were determined from observed `new String(bytes, UTF_8)` output, not by reviewing the OpenJDK source. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * style: cargo fmt unsafe_object.rs utf8_lossy test tuple Resolves the rustfmt diff at unsafe_object.rs:370 that failed the Lint check. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com> Co-authored-by: Andy Grove <agrove@apache.org>
diff --git a/native/shuffle/src/spark_unsafe/row.rs b/native/shuffle/src/spark_unsafe/row.rs
@@ -1509,4 +1509,31 @@ mod test {
         assert_eq!(struct_array.len(), 1);
         assert!(struct_array.is_null(0));
     }
+
+    // Spark's `UnsafeRow.getUTF8String` performs no UTF-8 validation, and
+    // `cast(BinaryType -> StringType)` is a zero-copy reinterpret -- so a StringType field can
+    // hold arbitrary non-UTF-8 bytes. `get_string` must not panic on such a field; it should
+    // decode it lossily, just as Spark tolerates it by treating the bytes as opaque.
+    #[test]
+    fn get_string_tolerates_non_utf8_bytes() {
+        // One string field. Row layout: 8-byte null bitset + an 8-byte (offset<<32 | len) slot,
+        // then the variable-length region. The buffer is 8-byte aligned like a Spark UnsafeRow.
+        #[repr(align(8))]
+        struct Aligned([u8; 24]);
+        let mut data = Aligned([0u8; 24]);
+        // Variable-length region at offset 16: invalid bytes 0xFF, 0xFE, then ASCII 'A'.
+        data.0[16] = 0xFF;
+        data.0[17] = 0xFE;
+        data.0[18] = b'A';
+        // Field 0 slot (bytes 8..16): offset = 16, len = 3, stored in native byte order.
+        let offset_and_len: i64 = (16i64 << 32) | 3;
+        data.0[8..16].copy_from_slice(&offset_and_len.to_ne_bytes());
+
+        let mut row = SparkUnsafeRow::new_with_num_fields(1);
+        row.point_to_slice(&data.0);
+
+        // Strict `from_utf8(..).unwrap()` would panic here; the lossy decode turns each invalid
+        // byte into U+FFFD. `&*` works whether get_string returns `&str` or `Cow<str>`.
+        assert_eq!(&*row.get_string(0), "\u{FFFD}\u{FFFD}A");
+    }
 }
diff --git a/native/shuffle/src/spark_unsafe/unsafe_object.rs b/native/shuffle/src/spark_unsafe/unsafe_object.rs
@@ -19,10 +19,127 @@ use super::list::SparkUnsafeArray;
 use super::map::SparkUnsafeMap;
 use super::row::SparkUnsafeRow;
 use datafusion_comet_common::bytes_to_i128;
-use std::str::from_utf8;
+use std::borrow::Cow;
 
 const MAX_LONG_DIGITS: u8 = 18;
 
+/// Lossily decodes `bytes` as UTF-8 the way Spark renders `StringType` -- i.e. like
+/// `new String(bytes, UTF_8)` on the JVM -- emitting a single `U+FFFD` per ill-formed sequence and
+/// skipping the same number of bytes the JDK's UTF-8 `CharsetDecoder` (action REPLACE) would.
+/// Well-formed input is returned as a zero-copy `Cow::Borrowed`; anything else is `Cow::Owned`.
+///
+/// This intentionally differs from `String::from_utf8_lossy` for surrogate-range three-byte
+/// sequences (`ED A0..BF ..`, e.g. CESU-8 / Java modified-UTF-8 supplementary chars) and for some
+/// other ill-formed multi-byte units: `from_utf8_lossy` follows the Unicode "maximal subpart" rule
+/// and can emit one `U+FFFD` per byte, whereas the JDK collapses certain ill-formed units into a
+/// single `U+FFFD`. Matching the JDK byte-for-byte means a Comet columnar shuffle of arbitrary
+/// bytes renders identically to a Spark JVM shuffle. The per-class malformed lengths below
+/// (E0/ED overlong & surrogate handling, F0/F4 range checks) match the observable replacement
+/// behavior of the JDK UTF-8 decoder; they were determined from observed
+/// `new String(bytes, UTF_8)` output, not by reviewing the OpenJDK source.
+pub(crate) fn decode_utf8_spark_lossy(bytes: &[u8]) -> Cow<'_, str> {
+    // Fast path: well-formed input is handed back as a borrow, with no copy or allocation.
+    if let Ok(s) = std::str::from_utf8(bytes) {
+        return Cow::Borrowed(s);
+    }
+
+    const RC: char = '\u{FFFD}';
+    let n = bytes.len();
+    let mut out = String::with_capacity(n);
+    let mut i = 0;
+    while i < n {
+        let b1 = bytes[i];
+        if b1 < 0x80 {
+            out.push(b1 as char);
+            i += 1;
+        } else if (0xC2..=0xDF).contains(&b1) {
+            // 2-byte lead (C2..DF). Missing or non-continuation b2 -> one FFFD, skip 1.
+            if i + 1 < n && (bytes[i + 1] & 0xC0) == 0x80 {
+                let cp = (((b1 as u32) & 0x1F) << 6) | ((bytes[i + 1] as u32) & 0x3F);
+                out.push(char::from_u32(cp).unwrap());
+                i += 2;
+            } else {
+                out.push(RC);
+                i += 1;
+            }
+        } else if (0xE0..=0xEF).contains(&b1) {
+            // 3-byte lead (E0..EF).
+            if i + 1 >= n {
+                out.push(RC); // lead is the final byte of the input
+                i = n;
+            } else {
+                let b2 = bytes[i + 1];
+                if (b1 == 0xE0 && (b2 & 0xE0) == 0x80) || (b2 & 0xC0) != 0x80 {
+                    // overlong (E0 80..9F) or b2 not a continuation -> skip only the lead
+                    out.push(RC);
+                    i += 1;
+                } else if i + 2 >= n {
+                    out.push(RC); // input ends right after a valid b2
+                    i = n;
+                } else {
+                    let b3 = bytes[i + 2];
+                    if (b3 & 0xC0) != 0x80 {
+                        out.push(RC); // b3 not a continuation -> skip b1 and b2
+                        i += 2;
+                    } else {
+                        let cp = (((b1 as u32) & 0x0F) << 12)
+                            | (((b2 as u32) & 0x3F) << 6)
+                            | ((b3 as u32) & 0x3F);
+                        if (0xD800..=0xDFFF).contains(&cp) {
+                            // surrogate code point (e.g. ED A0 80) -> skip all 3, single FFFD
+                            out.push(RC);
+                            i += 3;
+                        } else {
+                            out.push(char::from_u32(cp).unwrap());
+                            i += 3;
+                        }
+                    }
+                }
+            }
+        } else if (0xF0..=0xF4).contains(&b1) {
+            // 4-byte lead (F0..F4).
+            if i + 1 >= n {
+                out.push(RC);
+                i = n;
+            } else {
+                let b2 = bytes[i + 1];
+                if (b1 == 0xF0 && !(0x90..=0xBF).contains(&b2))
+                    || (b1 == 0xF4 && (b2 & 0xF0) != 0x80)
+                    || (b2 & 0xC0) != 0x80
+                {
+                    out.push(RC); // b2 out of range or not a continuation -> skip 1
+                    i += 1;
+                } else if i + 2 >= n {
+                    out.push(RC);
+                    i = n;
+                } else if (bytes[i + 2] & 0xC0) != 0x80 {
+                    out.push(RC); // b3 not a continuation -> skip b1 and b2
+                    i += 2;
+                } else if i + 3 >= n {
+                    out.push(RC);
+                    i = n;
+                } else if (bytes[i + 3] & 0xC0) != 0x80 {
+                    out.push(RC); // b4 not a continuation -> skip b1..b3
+                    i += 3;
+                } else {
+                    let cp = (((b1 as u32) & 0x07) << 18)
+                        | (((b2 as u32) & 0x3F) << 12)
+                        | (((bytes[i + 2] as u32) & 0x3F) << 6)
+                        | ((bytes[i + 3] as u32) & 0x3F);
+                    out.push(char::from_u32(cp).unwrap());
+                    i += 4;
+                }
+            }
+        } else {
+            // Lone continuation (0x80..=0xBF), overlong 2-byte leads (0xC0/0xC1), or bytes that
+            // can never start a sequence (0xF5..=0xFF): each is one ill-formed byte -> skip 1.
+            out.push(RC);
+            i += 1;
+        }
+    }
+    Cow::Owned(out)
+}
+
 /// A common trait for Spark Unsafe classes that can be used to access the underlying data,
 /// e.g., `UnsafeRow` and `UnsafeArray`. This defines a set of methods that can be used to
 /// access the underlying data with index.
@@ -75,19 +192,31 @@ pub trait SparkUnsafeObject {
     }
 
     /// Returns string value at the given index of the object.
-    fn get_string(&self, index: usize) -> &str {
+    ///
+    /// Spark's `UnsafeRow.getUTF8String` wraps the bytes via `UTF8String.fromAddress` with no
+    /// UTF-8 validation, and Spark's `cast(BinaryType -> StringType)` is a zero-copy reinterpret
+    /// that can leave arbitrary bytes in a `StringType` column. Strict `from_utf8(..).unwrap()`
+    /// would panic on those rows even though Spark itself treats them as opaque, while
+    /// `from_utf8_unchecked` would build a `&str` from arbitrary bytes -- UB per the Rust
+    /// reference, propagating into Arrow ops that call `str::from_utf8_unchecked` on the buffer.
+    ///
+    /// Instead the bytes are decoded with [`decode_utf8_spark_lossy`]: valid UTF-8 comes back as
+    /// a zero-cost `Cow::Borrowed`, and ill-formed input as an owned `String` with `U+FFFD`
+    /// replacements. It is used rather than `String::from_utf8_lossy` so the replacement
+    /// granularity matches Spark's `new String(bytes, UTF_8)` exactly, including surrogate-range
+    /// three-byte sequences (`ED A0..BF ..`) where the two standard libraries disagree.
+    fn get_string(&self, index: usize) -> Cow<'_, str> {
         let (offset, len) = self.get_offset_and_len(index);
         let addr = self.get_row_addr() + offset as i64;
-        // SAFETY: addr points to valid UTF-8 string data within the variable-length region.
-        // Offset and length are read from the fixed-length portion of the row/array.
         debug_assert!(addr != 0, "get_string: null address at index {index}");
         debug_assert!(
             len >= 0,
             "get_string: negative length {len} at index {index}"
         );
+        // SAFETY: the row/array buffer must hold `len` readable bytes at `addr`; offset and length
+        // are read from its fixed-length portion. The bytes need not be valid UTF-8 for a `&[u8]`.
         let slice: &[u8] = unsafe { std::slice::from_raw_parts(addr as *const u8, len as usize) };
-
-        from_utf8(slice).unwrap()
+        decode_utf8_spark_lossy(slice)
     }
 
     /// Returns binary value at the given index of the object.
@@ -222,3 +351,56 @@ macro_rules! impl_primitive_accessors {
     };
 }
 pub(crate) use impl_primitive_accessors;
+
+#[cfg(test)]
+mod utf8_lossy_tests {
+    use super::decode_utf8_spark_lossy;
+    use std::borrow::Cow;
+
+    /// Oracle = JDK 17 `new String(bytes, StandardCharsets.UTF_8)` (the renderer Spark uses for
+    /// StringType); each expected output was verified against the JVM. The decoder must match it
+    /// byte-for-byte, including the surrogate-range case where `String::from_utf8_lossy` differs.
+    #[test]
+    fn matches_jvm_replacement_granularity() {
+        let cases: &[(&[u8], &str)] = &[
+            (&[0xFF, 0xFE, 0x41], "\u{FFFD}\u{FFFD}A"),
+            (&[0x80, 0x42], "\u{FFFD}B"),
+            (&[0xE0, 0x80], "\u{FFFD}\u{FFFD}"),
+            (&[0xF0, 0x80, 0x80, 0x41], "\u{FFFD}\u{FFFD}\u{FFFD}A"),
+            (&[0xC0, 0xAF], "\u{FFFD}\u{FFFD}"),
+            // The parity case: `String::from_utf8_lossy` would give three U+FFFD here.
+            (&[0xED, 0xA0, 0x80], "\u{FFFD}"),
+            (
+                &[0xF4, 0x90, 0x80, 0x80],
+                "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}",
+            ),
+        ];
+        for (bytes, expected) in cases {
+            assert_eq!(
+                decode_utf8_spark_lossy(bytes),
+                *expected,
+                "bytes {bytes:02x?} should render like the JVM"
+            );
+        }
+    }
+
+    #[test]
+    fn valid_utf8_is_borrowed_zero_copy() {
+        let s = "café — 日本語 🦀";
+        match decode_utf8_spark_lossy(s.as_bytes()) {
+            Cow::Borrowed(b) => assert_eq!(b, s),
+            Cow::Owned(_) => panic!("valid UTF-8 must borrow, not allocate"),
+        }
+    }
+
+    #[test]
+    fn valid_multibyte_around_invalid_bytes_decodes() {
+        // 'a' | é (C3 A9) | stray 0xFF | 'b' | 🦀 (F0 9F A6 80): valid chars kept, 0xFF -> FFFD.
+        let mut bytes = vec![b'a'];
+        bytes.extend_from_slice("é".as_bytes());
+        bytes.push(0xFF);
+        bytes.push(b'b');
+        bytes.extend_from_slice("🦀".as_bytes());
+        assert_eq!(decode_utf8_spark_lossy(&bytes), "aé\u{FFFD}b🦀");
+    }
+}
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometColumnarShuffleSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometColumnarShuffleSuite.scala
@@ -761,6 +761,36 @@ abstract class CometColumnarShuffleSuite extends CometTestBase with AdaptiveSpar
     }
   }
 
+  // Regression test for https://github.com/apache/datafusion-comet/issues/4521.
+  //
+  // Spark's `cast(BinaryType -> StringType)` is a zero-copy reinterpret (and `UnsafeRow`'s
+  // string accessor performs no UTF-8 validation), so a `StringType` column can legitimately
+  // hold arbitrary non-UTF-8 bytes that Spark treats as opaque. Comet's columnar (JVM) shuffle
+  // converts those `UnsafeRow`s to Arrow natively (`process_sorted_row_partition` -> `get_string`),
+  // which used to decode with `from_utf8(..).unwrap()` and panic on such rows. It now decodes
+  // lossily (U+FFFD replacements), matching how Spark renders the same bytes.
+  test("columnar shuffle tolerates non-UTF-8 bytes in a StringType column") {
+    withParquetTable(
+      Seq(
+        // 0xFF and 0xFE can never appear in well-formed UTF-8; each decodes to a single U+FFFD
+        // in both Spark and Comet (so the lossy results match exactly).
+        (1, Array[Byte](0xff.toByte, 0xfe.toByte, 'A'.toByte)),
+        // 0x80 is a stray continuation byte -> one U+FFFD, followed by valid ASCII.
+        (2, Array[Byte](0x80.toByte, 'B'.toByte)),
+        // A fully valid UTF-8 row exercises the zero-cost borrow path.
+        (3, "valid".getBytes("UTF-8"))),
+      "tbl") {
+      // Disable Comet's own Cast so the `cast(binary -> string)` runs in Spark and the raw bytes
+      // reach the shuffle inside a JVM UnsafeRow. (If Comet performed the cast it would produce a
+      // pre-sanitized Arrow string array and never exercise get_string.)
+      withSQLConf(CometConf.getExprEnabledConfigKey("Cast") -> "false") {
+        val df = sql("SELECT _1, CAST(_2 AS STRING) AS s FROM tbl")
+        val shuffled = df.repartition(2, $"_1")
+        checkShuffleAnswer(shuffled, 1)
+      }
+    }
+  }
+
   /**
    * Checks that `df` produces the same answer as Spark does, and has the `expectedNum` Comet
    * exchange operators.