|
1 | 1 | // SPDX-License-Identifier: Apache-2.0 |
2 | 2 | // SPDX-FileCopyrightText: Copyright the Vortex contributors |
3 | 3 |
|
| 4 | +use rand::SeedableRng; |
| 5 | +use rand::rngs::StdRng; |
| 6 | +use rand::seq::IndexedRandom; |
4 | 7 | use vortex_array::ArrayRef; |
5 | 8 | use vortex_array::IntoArray; |
6 | 9 | use vortex_array::LEGACY_SESSION; |
@@ -107,3 +110,49 @@ fn test_fsst_array_ops() { |
107 | 110 |
|
108 | 111 | assert_arrays_eq!(fsst_array, canonical_array); |
109 | 112 | } |
| 113 | + |
| 114 | +/// Regression for #7833: `fsst_compress` must accept inputs whose cumulative |
| 115 | +/// compressed bytes exceed `i32::MAX`. Pre-fix, `fsst_compress_iter` hardcoded |
| 116 | +/// `VarBinBuilder::<i32>` for the FSST output buffer regardless of input size, |
| 117 | +/// which panicked in `VarBinBuilder::<i32>::append_value` once that total |
| 118 | +/// passed `i32::MAX`. Passes when compression completes and keeps the length. |
| 119 | +/// |
| 120 | +/// Allocates ~2.5 GiB for the input plus ~2.5 GiB for the FSST output, so the |
| 121 | +/// test is `#[ignore]`d by default. Run explicitly with: |
| 122 | +/// `cargo test --release -p vortex-fsst -- --ignored fsst_compress_offsets`. |
| 123 | +#[test] |
| 124 | +#[ignore = "allocates ~5 GiB; run with --ignored"] |
| 125 | +fn fsst_compress_offsets_overflow_i32() { |
| 126 | + // Input: N strings of STRING_LEN bytes, each a window into one shared |
| 127 | + // pseudo-random ASCII pool. FSST is a symbol-table compressor; random |
| 128 | + // data resists compression, so the compressed output is expected to stay |
| 129 | + // close to input size and cross the i32 boundary. |
| 130 | + const STRING_LEN: usize = 64 * 1024; // 64 KiB per string |
| 131 | + const TOTAL_BYTES: usize = (1usize << 31) + (512 << 20); // ~2.5 GiB |
| 132 | + const N: usize = TOTAL_BYTES / STRING_LEN; // number of strings appended |
| 133 | + const POOL_LEN: usize = 64 * 1024 * 1024; // 64 MiB pool, reused by every string |
| 134 | + |
| 135 | + // Printable ASCII (space through `~`) so every string is valid UTF-8. |
| 136 | + const ALPHABET: &[u8; 95] = |
| 137 | + b" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; |
| 138 | + |
| 139 | + let mut rng = StdRng::seed_from_u64(0xC0DE_C011_B711); // fixed seed: same pool every run |
| 140 | + let pool: Vec<u8> = (0..POOL_LEN) |
| 141 | + .map(|_| *ALPHABET.choose(&mut rng).unwrap()) |
| 142 | + .collect(); |
| 143 | + |
| 144 | + let mut builder = VarBinBuilder::<i64>::with_capacity(N); |
| 145 | + for i in 0..N { |
| 146 | + let off = (i.wrapping_mul(31337)) % (POOL_LEN - STRING_LEN); // keeps off + STRING_LEN inside the pool |
| 147 | + builder.append_value(&pool[off..off + STRING_LEN]); |
| 148 | + } |
| 149 | + let array = builder.finish(DType::Utf8(Nullability::NonNullable)); |
| 150 | + |
| 151 | + let compressor = fsst_train_compressor(&array); |
| 152 | + let len = array.len(); |
| 153 | + let dtype = array.dtype().clone(); |
| 154 | + let mut ctx = LEGACY_SESSION.create_execution_ctx(); |
| 155 | + |
| 156 | + let compressed = fsst_compress(array, len, &dtype, &compressor, &mut ctx); |
| 157 | + assert_eq!(compressed.len(), len); |
| 158 | +} |
0 commit comments