Skip to content

Commit df3e829

Browse files
authored
Correctly calculate FSST compressed output size (#8551)
FSST worst case scenario for compression is all escape codes which would result in 2x size increase. There's no additional overhead as previous pr would imply
1 parent 226477d commit df3e829

1 file changed

Lines changed: 12 additions & 17 deletions

File tree

encodings/fsst/src/compress.rs

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,8 @@ use vortex_mask::Mask;
3636
use crate::FSST;
3737
use crate::FSSTArray;
3838

39-
/// FSST worst case: every input byte expands to an escape + literal (2x) plus a small
40-
/// per-string header.
39+
/// FSST worst case: every input byte expands to an escape + literal (2x).
4140
const FSST_PER_BYTE_OVERHEAD: usize = 2;
42-
const FSST_PER_ROW_OVERHEAD: usize = 7;
4341

4442
/// Starting capacity for the per-row `compress_into` scratch buffer; grown monotonically.
4543
const DEFAULT_BUFFER_LEN: usize = 1024 * 1024;
@@ -87,7 +85,6 @@ fn compress_varbinview(
8785
) -> VortexResult<FSSTArray> {
8886
let mask = strings.validity()?.execute_mask(strings.len(), ctx)?;
8987
let views = strings.views();
90-
let non_null = mask.true_count();
9188

9289
let total_input_bytes = match mask.bit_buffer() {
9390
AllOr::All => views.iter().map(|v| v.len() as usize).sum(),
@@ -100,7 +97,7 @@ fn compress_varbinview(
10097
.sum(),
10198
};
10299

103-
if fsst_output_fits_in_i32_offsets(total_input_bytes, non_null) {
100+
if fsst_output_fits_in_i32_offsets(total_input_bytes) {
104101
compress_views::<i32>(strings, &mask, compressor, ctx)
105102
} else {
106103
compress_views::<i64>(strings, &mask, compressor, ctx)
@@ -120,9 +117,8 @@ fn compress_varbin_array(
120117
let last: usize = off[off.len() - 1].as_();
121118
last - first
122119
});
123-
let non_null = mask.true_count();
124120

125-
if fsst_output_fits_in_i32_offsets(total_input_bytes, non_null) {
121+
if fsst_output_fits_in_i32_offsets(total_input_bytes) {
126122
compress_varbin::<i32>(strings, &offsets, &mask, compressor, ctx)
127123
} else {
128124
compress_varbin::<i64>(strings, &offsets, &mask, compressor, ctx)
@@ -179,10 +175,8 @@ fn train_varbin_array(
179175
}
180176

181177
#[inline]
182-
fn fsst_output_fits_in_i32_offsets(total_input_bytes: usize, non_null: usize) -> bool {
183-
let worst = total_input_bytes
184-
.saturating_mul(FSST_PER_BYTE_OVERHEAD)
185-
.saturating_add(non_null.saturating_mul(FSST_PER_ROW_OVERHEAD));
178+
fn fsst_output_fits_in_i32_offsets(total_input_bytes: usize) -> bool {
179+
let worst = total_input_bytes.saturating_mul(FSST_PER_BYTE_OVERHEAD);
186180
worst <= i32::MAX as usize
187181
}
188182

@@ -305,7 +299,7 @@ impl<'c, O: IntegerPType + 'static> FsstSink<'c, O> {
305299
i32::try_from(s.len()).vortex_expect("per-row uncompressed length must fit in i32"),
306300
);
307301

308-
let target = FSST_PER_BYTE_OVERHEAD * s.len() + FSST_PER_ROW_OVERHEAD;
302+
let target = FSST_PER_BYTE_OVERHEAD * s.len();
309303
if target > self.buffer.len() {
310304
self.buffer.reserve(target - self.buffer.len());
311305
}
@@ -332,6 +326,7 @@ impl<'c, O: IntegerPType + 'static> FsstSink<'c, O> {
332326
#[cfg(test)]
333327
mod tests {
334328
use vortex_array::VortexSessionExecute;
329+
use vortex_array::array_session;
335330
use vortex_array::arrays::VarBinViewArray;
336331
use vortex_array::arrays::varbin::VarBinArrayExt;
337332
use vortex_array::dtype::PType;
@@ -347,18 +342,18 @@ mod tests {
347342
#[test]
348343
fn offset_width_boundary() {
349344
let m = i32::MAX as usize;
350-
assert!(fsst_output_fits_in_i32_offsets(m / 2 - 7, 1));
351-
assert!(!fsst_output_fits_in_i32_offsets(m / 2, 1));
352-
assert!(fsst_output_fits_in_i32_offsets(0, 0));
353-
assert!(!fsst_output_fits_in_i32_offsets(usize::MAX, 1));
345+
assert!(fsst_output_fits_in_i32_offsets(m / 2 - 7));
346+
assert!(fsst_output_fits_in_i32_offsets(m / 2));
347+
assert!(fsst_output_fits_in_i32_offsets(0));
348+
assert!(!fsst_output_fits_in_i32_offsets(usize::MAX));
354349
}
355350

356351
/// Small inputs fit the i32 bound, so `fsst_compress` must pick i32 offsets.
357352
/// The i64 branch is covered by `tests::fsst_compress_offsets_overflow_i32`.
358353
#[test]
359354
fn codes_offsets_dtype_small_input_is_i32() -> VortexResult<()> {
360355
let array = VarBinViewArray::from_iter_str(["hello", "world", "fsst encoded"]);
361-
let mut ctx = vortex_array::array_session().create_execution_ctx();
356+
let mut ctx = array_session().create_execution_ctx();
362357
let compressor = fsst_train_compressor(array.as_array(), &mut ctx)?;
363358
let fsst = fsst_compress(array.as_array(), &compressor, &mut ctx)?;
364359
assert_eq!(fsst.codes().offsets().dtype().as_ptype(), PType::I32);

0 commit comments

Comments
 (0)