Skip to content

Commit 2cf4232

Browse files
committed
fix(fsst): widen FSST output offsets to i64 to avoid i32 overflow
`fsst_compress_iter` previously hardcoded `VarBinBuilder::<i32>` for the FSST output, panicking once cumulative compressed bytes crossed `i32::MAX`. Switch to `VarBinBuilder::<i64>` so large inputs compress without overflow. The `FSSTMetadata.codes_offsets_ptype` field already records the offset PType, so existing serialized arrays continue to deserialize unchanged. Widening exposed a latent bug in `VarBin::compare`: with i64 offsets, the LHS is converted to Arrow `LargeBinary`/`LargeUtf8` (per `preferred_arrow_type`), but the RHS scalar was hardcoded to `Binary`/`Utf8`, and Arrow refuses to compare `LargeBinary` with `Binary`. The RHS now picks the matching Arrow type from the LHS Datum. The previously-ignored regression test `fsst_compress_offsets_overflow_i32` now passes when run with `--ignored`; it still allocates ~5 GiB and therefore remains `#[ignore]`d. Signed-off-by: Claude <noreply@anthropic.com>
1 parent d9bcd20 commit 2cf4232

3 files changed

Lines changed: 31 additions & 16 deletions

File tree

encodings/fsst/src/compress.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,10 @@ where
6969
I: Iterator<Item = Option<&'a [u8]>>,
7070
{
7171
let mut buffer = Vec::with_capacity(DEFAULT_BUFFER_LEN);
72-
let mut builder = VarBinBuilder::<i32>::with_capacity(len);
72+
// Offsets are widened to i64 because the cumulative compressed bytes can
73+
// exceed i32::MAX for large inputs (see issue #7833). Per-string sizes
74+
// still fit in i32.
75+
let mut builder = VarBinBuilder::<i64>::with_capacity(len);
7376
let mut uncompressed_lengths: BufferMut<i32> = BufferMut::with_capacity(len);
7477
for string in iter {
7578
match string {

encodings/fsst/src/tests.rs

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -112,13 +112,14 @@ fn test_fsst_array_ops() {
112112
}
113113

114114
/// Regression for #7833: `fsst_compress` must accept inputs whose cumulative
115-
/// compressed bytes exceed `i32::MAX`. Today this panics in
116-
/// `vortex-array/src/arrays/varbin/builder.rs:62` because `fsst_compress_iter`
117-
/// (`encodings/fsst/src/compress.rs:72`) hardcodes `VarBinBuilder::<i32>` for
118-
/// the FSST output buffer regardless of input size.
115+
/// compressed bytes exceed `i32::MAX`. Before the fix, `fsst_compress_iter`
116+
/// (`encodings/fsst/src/compress.rs`) used a `VarBinBuilder::<i32>` for the
117+
/// FSST output regardless of input size, which panicked in
118+
/// `VarBinBuilder::<i32>::append_value` once cumulative compressed bytes
119+
/// crossed `i32::MAX`. The output builder is now `VarBinBuilder::<i64>`.
119120
///
120-
/// The input is built with `VarBinBuilder::<i64>` to confirm that widening the
121-
/// input alone does not help — the overflow is on the FSST output side.
121+
/// The input is built with `VarBinBuilder::<i64>` so the test exercises the
122+
/// large-output path without hitting an unrelated overflow on the input side.
122123
///
123124
/// Marked `#[ignore]` because the test allocates ~2.5 GiB for the input and
124125
/// ~2.5 GiB for the FSST output (~5 GiB total), which is too much to run by
@@ -127,10 +128,6 @@ fn test_fsst_array_ops() {
127128
/// ```text
128129
/// cargo test --release -p vortex-fsst -- --ignored fsst_compress_offsets
129130
/// ```
130-
///
131-
/// Until the underlying overflow is fixed, the test panics in
132-
/// `VarBinBuilder::<i32>::append_value` once cumulative compressed bytes pass
133-
/// `i32::MAX`. After the fix it must succeed with the row count preserved.
134131
#[test]
135132
#[ignore = "allocates ~5 GiB; run with --ignored"]
136133
fn fsst_compress_offsets_overflow_i32() {

vortex-array/src/arrays/varbin/compute/compare.rs

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

44
use arrow_array::BinaryArray;
5+
use arrow_array::LargeBinaryArray;
6+
use arrow_array::LargeStringArray;
57
use arrow_array::StringArray;
68
use arrow_ord::cmp;
9+
use arrow_schema::DataType;
710
use vortex_buffer::BitBuffer;
811
use vortex_error::VortexExpect as _;
912
use vortex_error::VortexResult;
@@ -82,15 +85,27 @@ impl CompareKernel for VarBin {
8285

8386
let lhs = Datum::try_new(lhs.array(), ctx)?;
8487

85-
// Use StringViewArray/BinaryViewArray to match the Utf8View/BinaryView types
86-
// produced by Datum::try_new (which uses execute_arrow(None, ctx))
87-
let arrow_rhs: &dyn arrow_array::Datum = match rhs_const.dtype() {
88-
DType::Utf8(_) => &rhs_const
88+
// The RHS scalar must match the LHS Arrow data type. VarBin with i64
89+
// offsets is converted to LargeBinary/LargeUtf8 (see
90+
// `preferred_arrow_type`), and Arrow refuses to compare LargeBinary
91+
// with Binary (or LargeUtf8 with Utf8).
92+
let arrow_rhs: &dyn arrow_array::Datum = match (rhs_const.dtype(), lhs.data_type()) {
93+
(DType::Utf8(_), DataType::LargeUtf8) => &rhs_const
94+
.as_utf8()
95+
.value()
96+
.map(LargeStringArray::new_scalar)
97+
.unwrap_or_else(|| arrow_array::Scalar::new(LargeStringArray::new_null(1))),
98+
(DType::Utf8(_), _) => &rhs_const
8999
.as_utf8()
90100
.value()
91101
.map(StringArray::new_scalar)
92102
.unwrap_or_else(|| arrow_array::Scalar::new(StringArray::new_null(1))),
93-
DType::Binary(_) => &rhs_const
103+
(DType::Binary(_), DataType::LargeBinary) => &rhs_const
104+
.as_binary()
105+
.value()
106+
.map(LargeBinaryArray::new_scalar)
107+
.unwrap_or_else(|| arrow_array::Scalar::new(LargeBinaryArray::new_null(1))),
108+
(DType::Binary(_), _) => &rhs_const
94109
.as_binary()
95110
.value()
96111
.map(BinaryArray::new_scalar)

0 commit comments

Comments
 (0)