Skip to content

Commit 9aa32d9

Browse files
committed
fsst: add #[ignore]'d regression test for i32 offset overflow
Move the regression test from PR #7832's tests_large.rs into the existing tests.rs module. Use #[ignore] instead of test_with env gates since the test allocates ~5 GiB and shouldn't run by default even in CI. Tracks #7833. Signed-off-by: Claude <noreply@anthropic.com>
1 parent eda04e1 commit 9aa32d9

1 file changed

Lines changed: 49 additions & 0 deletions

File tree

encodings/fsst/src/tests.rs

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4+
use rand::SeedableRng;
5+
use rand::rngs::StdRng;
6+
use rand::seq::IndexedRandom;
47
use vortex_array::ArrayRef;
58
use vortex_array::IntoArray;
69
use vortex_array::LEGACY_SESSION;
@@ -107,3 +110,49 @@ fn test_fsst_array_ops() {
107110

108111
assert_arrays_eq!(fsst_array, canonical_array);
109112
}
113+
114+
/// Regression test for #7833: `fsst_compress` has to cope with inputs whose
/// compressed output totals more than `i32::MAX` bytes.
///
/// Before the fix, `fsst_compress_iter` always built the FSST output with a
/// `VarBinBuilder::<i32>`, whatever the input size, and
/// `VarBinBuilder::<i32>::append_value` panicked as soon as the running total
/// of compressed bytes went past `i32::MAX`.
///
/// The input takes ~2.5 GiB and the FSST output roughly another ~2.5 GiB, so
/// the test is `#[ignore]`-d and never runs by default. To run it explicitly:
/// `cargo test --release -p vortex-fsst -- --ignored fsst_compress_offsets`.
#[test]
#[ignore = "allocates ~5 GiB; run with --ignored"]
fn fsst_compress_offsets_overflow_i32() {
    // Every printable ASCII byte (0x20..=0x7E), so each row is valid UTF-8.
    const ALPHABET: &[u8; 95] =
        b" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";

    // Size of the shared random pool that rows are sliced from.
    const POOL_LEN: usize = 64 * 1024 * 1024;
    // Length of every row, in bytes.
    const STRING_LEN: usize = 64 * 1024;
    // 2 GiB + 512 MiB (~2.5 GiB) of row data in total, i.e. above `i32::MAX`.
    const TOTAL_BYTES: usize = (1usize << 31) + (512 << 20);
    // Number of rows needed to reach `TOTAL_BYTES`.
    const N: usize = TOTAL_BYTES / STRING_LEN;
    // Exclusive bound on slice starts, so `start + STRING_LEN` stays inside the pool.
    const START_BOUND: usize = POOL_LEN - STRING_LEN;

    // Rows are high-entropy ASCII. FSST compresses via a symbol table, so
    // pseudo-random bytes without recurring sequences are expected to resist
    // compression: the output should stay close to the input size and
    // therefore cross the i32 boundary.
    //
    // The fixed seed makes the pool, and hence the whole input, reproducible.
    let mut rng = StdRng::seed_from_u64(0xC0DE_C011_B711);
    let noise: Vec<u8> = std::iter::repeat_with(|| *ALPHABET.choose(&mut rng).unwrap())
        .take(POOL_LEN)
        .collect();

    // Slice each row out of the pool at a start offset derived from the row index.
    // i64 offsets are required here: the input alone already exceeds `i32::MAX` bytes.
    let mut rows = VarBinBuilder::<i64>::with_capacity(N);
    for start in (0..N).map(|row| row.wrapping_mul(31337) % START_BOUND) {
        rows.append_value(&noise[start..start + STRING_LEN]);
    }
    let input = rows.finish(DType::Utf8(Nullability::NonNullable));

    let trained = fsst_train_compressor(&input);
    // Capture dtype and row count up front; `input` is moved into `fsst_compress`.
    let dtype = input.dtype().clone();
    let row_count = input.len();
    let mut ctx = LEGACY_SESSION.create_execution_ctx();

    // Getting past this call without a panic is the actual regression check;
    // the length assertion additionally confirms that no rows were lost.
    let compressed = fsst_compress(input, row_count, &dtype, &trained, &mut ctx);
    assert_eq!(compressed.len(), row_count);
}

0 commit comments

Comments
 (0)