Skip to content

Commit d9bcd20

Browse files
committed
test(fsst): regression test for i32 offset overflow in fsst_compress
Adds an `#[ignore]`d regression test for #7833 to the existing `encodings/fsst/src/tests.rs`. The test allocates ~5 GiB in total, so it is opt-in via `--ignored`: `cargo test --release -p vortex-fsst -- --ignored fsst_compress_offsets`. This is an alternative to #7832 that keeps the test alongside the other FSST tests instead of introducing a new module, and avoids the `test-with` dev-dependency. Signed-off-by: Claude <noreply@anthropic.com>
1 parent eda04e1 commit d9bcd20

1 file changed

Lines changed: 60 additions & 0 deletions

File tree

encodings/fsst/src/tests.rs

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4+
use rand::SeedableRng;
5+
use rand::rngs::StdRng;
6+
use rand::seq::IndexedRandom;
47
use vortex_array::ArrayRef;
58
use vortex_array::IntoArray;
69
use vortex_array::LEGACY_SESSION;
@@ -107,3 +110,60 @@ fn test_fsst_array_ops() {
107110

108111
assert_arrays_eq!(fsst_array, canonical_array);
109112
}
113+
114+
/// Regression test for #7833: `fsst_compress` must handle inputs whose total
/// compressed size is larger than `i32::MAX` bytes.
///
/// At the time of writing this panics in
/// `vortex-array/src/arrays/varbin/builder.rs:62`, because `fsst_compress_iter`
/// (`encodings/fsst/src/compress.rs:72`) always builds the FSST output buffer
/// with `VarBinBuilder::<i32>`, no matter how large the input is.
///
/// The input array deliberately uses `VarBinBuilder::<i64>` offsets, which
/// shows that widening the input side alone is not enough: the overflow
/// happens on the FSST output side.
///
/// The test is `#[ignore]`d because it needs ~2.5 GiB for the input plus
/// ~2.5 GiB for the FSST output (~5 GiB total), which is too much for a
/// default run, CI included. Run it on demand with:
///
/// ```text
/// cargo test --release -p vortex-fsst -- --ignored fsst_compress_offsets
/// ```
///
/// Expected outcome: while the overflow is unfixed, the test panics in
/// `VarBinBuilder::<i32>::append_value` as soon as the cumulative compressed
/// bytes pass `i32::MAX`. Once it is fixed, the test must succeed with the
/// row count preserved.
#[test]
#[ignore = "allocates ~5 GiB; run with --ignored"]
fn fsst_compress_offsets_overflow_i32() {
    // Printable ASCII only, so every generated string is valid UTF-8.
    const ALPHABET: &[u8; 95] =
        b" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";

    // Sizing: N strings of STRING_LEN bytes each add up to TOTAL_BYTES, which
    // is chosen to sit comfortably above the i32::MAX boundary.
    const POOL_LEN: usize = 64 * 1024 * 1024;
    const STRING_LEN: usize = 64 * 1024;
    const TOTAL_BYTES: usize = (1usize << 31) + (512 << 20); // ~2.5 GiB
    const N: usize = TOTAL_BYTES / STRING_LEN;
    // Window start offsets are reduced modulo this value, so every
    // `start + STRING_LEN` stays strictly inside the pool.
    const START_MODULUS: usize = POOL_LEN - STRING_LEN;

    // The rows are windows into one pool of pseudo-random printable ASCII.
    // FSST is a symbol-table compressor, so high-entropy data like this is
    // expected to resist compression; the compressed output then stays close
    // to the input size and crosses the i32 boundary. The fixed seed keeps the
    // pool contents reproducible from run to run.
    let mut rng = StdRng::seed_from_u64(0xC0DE_C011_B711);
    let mut pool: Vec<u8> = Vec::with_capacity(POOL_LEN);
    for _ in 0..POOL_LEN {
        pool.push(*ALPHABET.choose(&mut rng).unwrap());
    }

    // Row `i` starts at `i * 31337` reduced into the valid start range, which
    // spreads the windows across the pool.
    let window_starts = (0..N).map(|row| row.wrapping_mul(31337) % START_MODULUS);

    let mut input = VarBinBuilder::<i64>::with_capacity(N);
    for start in window_starts {
        let end = start + STRING_LEN;
        input.append_value(&pool[start..end]);
    }
    let array = input.finish(DType::Utf8(Nullability::NonNullable));

    let compressor = fsst_train_compressor(&array);
    // Capture length and dtype up front: `array` is moved into `fsst_compress`.
    let row_count = array.len();
    let dtype = array.dtype().clone();
    let mut ctx = LEGACY_SESSION.create_execution_ctx();

    let compressed = fsst_compress(array, row_count, &dtype, &compressor, &mut ctx);

    // Compression must neither drop nor add rows.
    assert_eq!(compressed.len(), row_count);
}

0 commit comments

Comments
 (0)