Skip to content

Commit 2cf4232

Browse files
committed
fix(fsst): widen FSST output offsets to i64 to avoid i32 overflow
`fsst_compress_iter` previously hardcoded `VarBinBuilder::<i32>` for the FSST output, panicking once cumulative compressed bytes crossed `i32::MAX`. Switch to `VarBinBuilder::<i64>` so large inputs compress without overflow. The `FSSTMetadata.codes_offsets_ptype` field already records the offset PType, so existing serialized arrays continue to deserialize unchanged. Widening exposed a latent bug in `VarBin::compare`: with i64 offsets, the LHS is converted to Arrow `LargeBinary`/`LargeUtf8` (per `preferred_arrow_type`), but the RHS scalar was hardcoded to `Binary`/`Utf8`, and Arrow refuses to compare `LargeBinary` with `Binary`. The RHS now picks the matching Arrow type from the LHS Datum. The previously-ignored regression test `fsst_compress_offsets_overflow_i32` now passes when run with `--ignored`; it still allocates ~5 GiB and therefore remains `#[ignore]`d. Signed-off-by: Claude <noreply@anthropic.com>
1 parent d9bcd20 commit 2cf4232

3 files changed

Lines changed: 31 additions & 16 deletions

File tree

encodings/fsst/src/compress.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,10 @@ where
6969
I: Iterator<Item = Option<&'a [u8]>>,
7070
{
7171
let mut buffer = Vec::with_capacity(DEFAULT_BUFFER_LEN);
72-
let mut builder = VarBinBuilder::<i32>::with_capacity(len);
72+
// Offsets are widened to i64 because the cumulative compressed bytes can
73+
// exceed i32::MAX for large inputs (see issue #7833). Per-string sizes
74+
// still fit in i32.
75+
let mut builder = VarBinBuilder::<i64>::with_capacity(len);
7376
let mut uncompressed_lengths: BufferMut<i32> = BufferMut::with_capacity(len);
7477
for string in iter {
7578
match string {

encodings/fsst/src/tests.rs

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -112,13 +112,14 @@ fn test_fsst_array_ops() {
112112
}
113113

114114
/// Regression for #7833: `fsst_compress` must accept inputs whose cumulative
115-
/// compressed bytes exceed `i32::MAX`. Today this panics in
116-
/// `vortex-array/src/arrays/varbin/builder.rs:62` because `fsst_compress_iter`
117-
/// (`encodings/fsst/src/compress.rs:72`) hardcodes `VarBinBuilder::<i32>` for
118-
/// the FSST output buffer regardless of input size.
115+
/// compressed bytes exceed `i32::MAX`. Before the fix, `fsst_compress_iter`
116+
/// (`encodings/fsst/src/compress.rs`) used a `VarBinBuilder::<i32>` for the
117+
/// FSST output regardless of input size, which panicked in
118+
/// `VarBinBuilder::<i32>::append_value` once cumulative compressed bytes
119+
/// crossed `i32::MAX`. The output builder is now `VarBinBuilder::<i64>`.
119120
///
120-
/// The input is built with `VarBinBuilder::<i64>` to confirm that widening the
121-
/// input alone does not help — the overflow is on the FSST output side.
121+
/// The input is built with `VarBinBuilder::<i64>` so the test exercises the
122+
/// large-output path without hitting an unrelated overflow on the input side.
122123
///
123124
/// Marked `#[ignore]` because the test allocates ~2.5 GiB for the input and
124125
/// ~2.5 GiB for the FSST output (~5 GiB total), which is too much to run by
@@ -127,10 +128,6 @@ fn test_fsst_array_ops() {
127128
/// ```text
128129
/// cargo test --release -p vortex-fsst -- --ignored fsst_compress_offsets
129130
/// ```
130-
///
131-
/// Until the underlying overflow is fixed, the test panics in
132-
/// `VarBinBuilder::<i32>::append_value` once cumulative compressed bytes pass
133-
/// `i32::MAX`. After the fix it must succeed with the row count preserved.
134131
#[test]
135132
#[ignore = "allocates ~5 GiB; run with --ignored"]
136133
fn fsst_compress_offsets_overflow_i32() {

vortex-array/src/arrays/varbin/compute/compare.rs

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

44
use arrow_array::BinaryArray;
5+
use arrow_array::LargeBinaryArray;
6+
use arrow_array::LargeStringArray;
57
use arrow_array::StringArray;
68
use arrow_ord::cmp;
9+
use arrow_schema::DataType;
710
use vortex_buffer::BitBuffer;
811
use vortex_error::VortexExpect as _;
912
use vortex_error::VortexResult;
@@ -82,15 +85,27 @@ impl CompareKernel for VarBin {
8285

8386
let lhs = Datum::try_new(lhs.array(), ctx)?;
8487

85-
// Use StringViewArray/BinaryViewArray to match the Utf8View/BinaryView types
86-
// produced by Datum::try_new (which uses execute_arrow(None, ctx))
87-
let arrow_rhs: &dyn arrow_array::Datum = match rhs_const.dtype() {
88-
DType::Utf8(_) => &rhs_const
88+
// The RHS scalar must match the LHS Arrow data type. VarBin with i64
89+
// offsets is converted to LargeBinary/LargeUtf8 (see
90+
// `preferred_arrow_type`), and Arrow refuses to compare LargeBinary
91+
// with Binary (or LargeUtf8 with Utf8).
92+
let arrow_rhs: &dyn arrow_array::Datum = match (rhs_const.dtype(), lhs.data_type()) {
93+
(DType::Utf8(_), DataType::LargeUtf8) => &rhs_const
94+
.as_utf8()
95+
.value()
96+
.map(LargeStringArray::new_scalar)
97+
.unwrap_or_else(|| arrow_array::Scalar::new(LargeStringArray::new_null(1))),
98+
(DType::Utf8(_), _) => &rhs_const
8999
.as_utf8()
90100
.value()
91101
.map(StringArray::new_scalar)
92102
.unwrap_or_else(|| arrow_array::Scalar::new(StringArray::new_null(1))),
93-
DType::Binary(_) => &rhs_const
103+
(DType::Binary(_), DataType::LargeBinary) => &rhs_const
104+
.as_binary()
105+
.value()
106+
.map(LargeBinaryArray::new_scalar)
107+
.unwrap_or_else(|| arrow_array::Scalar::new(LargeBinaryArray::new_null(1))),
108+
(DType::Binary(_), _) => &rhs_const
94109
.as_binary()
95110
.value()
96111
.map(BinaryArray::new_scalar)

0 commit comments

Comments
 (0)