|
| 1 | +// SPDX-License-Identifier: Apache-2.0 |
| 2 | +// SPDX-FileCopyrightText: Copyright the Vortex contributors |
| 3 | + |
| 4 | +#![expect(clippy::unwrap_used)] |
| 5 | + |
| 6 | +//! Row-encoding an FSST-compressed string column: the only realizable strategy is |
| 7 | +//! "unpack then convert" (decompress FSST to a canonical `VarBinView`, then row-encode it), |
| 8 | +//! because FSST is **not order-preserving** — its 1-byte codes are assigned by compression |
| 9 | +//! gain, not by value, so the compressed bytes cannot be compared lexicographically. A |
| 10 | +//! hypothetical "direct" kernel could only *fuse* decompression with row-key emission; it |
| 11 | +//! still has to expand every symbol. |
| 12 | +//! |
| 13 | +//! These benchmarks measure the full path and its two phases so the fusion opportunity is |
| 14 | +//! quantifiable: |
| 15 | +//! * `fsst_unpack_then_convert` — decompress + row-encode (the status quo). |
| 16 | +//! * `fsst_decompress_only` — decompress alone (the irreducible floor: a direct kernel |
| 17 | +//! must still produce these bytes). |
| 18 | +//! * `plain_row_encode_only` — row-encode an already-decompressed `VarBinView` (the part |
| 19 | +//! a fused kernel would overlap with decompression; its writes into the intermediate |
| 20 | +//! buffer + views are what fusion removes). |
| 21 | +
|
| 22 | +use divan::counter::BytesCount; |
| 23 | +use mimalloc::MiMalloc; |
| 24 | +use rand::RngExt; |
| 25 | +use rand::SeedableRng; |
| 26 | +use rand::rngs::StdRng; |
| 27 | +use vortex_array::ArrayRef; |
| 28 | +use vortex_array::Canonical; |
| 29 | +use vortex_array::IntoArray; |
| 30 | +use vortex_array::LEGACY_SESSION; |
| 31 | +use vortex_array::VortexSessionExecute; |
| 32 | +use vortex_array::arrays::VarBinArray; |
| 33 | +use vortex_array::dtype::DType; |
| 34 | +use vortex_array::dtype::Nullability; |
| 35 | +use vortex_fsst::fsst_compress; |
| 36 | +use vortex_fsst::fsst_train_compressor; |
| 37 | +use vortex_row::RowEncoder; |
| 38 | + |
| 39 | +#[global_allocator] |
| 40 | +static GLOBAL: MiMalloc = MiMalloc; |
| 41 | + |
/// Number of strings generated for the benchmark input.
const N: usize = 100_000;
/// Nominal string length in bytes; each generated string is 50%–150% of this value.
const AVG_LEN: usize = 64;
/// Size of the alphabet: strings use the first `UNIQUE_CHARS` letters starting at `b'a'`.
const UNIQUE_CHARS: u8 = 8;
| 45 | + |
| 46 | +/// Generate compressible, multi-block (>32 byte) strings over a small alphabet so FSST finds |
| 47 | +/// a strong symbol table — the regime where a direct kernel would matter most. |
| 48 | +fn generate_strings() -> (VarBinArray, u64) { |
| 49 | + let mut rng = StdRng::seed_from_u64(0); |
| 50 | + let mut strings = Vec::with_capacity(N); |
| 51 | + let mut total_bytes: u64 = 0; |
| 52 | + for _ in 0..N { |
| 53 | + let len = AVG_LEN * rng.random_range(50..=150) / 100; |
| 54 | + total_bytes += len as u64; |
| 55 | + let s = (0..len) |
| 56 | + .map(|_| rng.random_range(b'a'..(b'a' + UNIQUE_CHARS)) as char) |
| 57 | + .collect::<String>() |
| 58 | + .into_bytes(); |
| 59 | + strings.push(Some(s.into_boxed_slice())); |
| 60 | + } |
| 61 | + let arr = VarBinArray::from_iter(strings, DType::Binary(Nullability::NonNullable)); |
| 62 | + (arr, total_bytes) |
| 63 | +} |
| 64 | + |
| 65 | +fn build_fsst() -> (ArrayRef, u64) { |
| 66 | + let (arr, total_bytes) = generate_strings(); |
| 67 | + let compressor = fsst_train_compressor(&arr); |
| 68 | + let len = arr.len(); |
| 69 | + let dtype = arr.dtype().clone(); |
| 70 | + let mut ctx = LEGACY_SESSION.create_execution_ctx(); |
| 71 | + let fsst = fsst_compress(arr, len, &dtype, &compressor, &mut ctx).into_array(); |
| 72 | + (fsst, total_bytes) |
| 73 | +} |
| 74 | + |
| 75 | +fn decompress(fsst: &ArrayRef) -> ArrayRef { |
| 76 | + let mut ctx = LEGACY_SESSION.create_execution_ctx(); |
| 77 | + fsst.clone() |
| 78 | + .execute::<Canonical>(&mut ctx) |
| 79 | + .unwrap() |
| 80 | + .into_array() |
| 81 | +} |
| 82 | + |
| 83 | +fn main() { |
| 84 | + divan::main(); |
| 85 | +} |
| 86 | + |
| 87 | +/// Status quo: decompress FSST to a canonical `VarBinView`, then row-encode it. |
| 88 | +#[divan::bench] |
| 89 | +fn fsst_unpack_then_convert(bencher: divan::Bencher) { |
| 90 | + let (fsst, total_bytes) = build_fsst(); |
| 91 | + let encoder = RowEncoder::default(); |
| 92 | + bencher.counter(BytesCount::new(total_bytes)).bench_local(|| { |
| 93 | + let mut ctx = LEGACY_SESSION.create_execution_ctx(); |
| 94 | + let decoded = fsst.clone().execute::<Canonical>(&mut ctx).unwrap().into_array(); |
| 95 | + encoder.encode(&[decoded], &mut ctx).unwrap() |
| 96 | + }); |
| 97 | +} |
| 98 | + |
| 99 | +/// Irreducible floor: FSST decompression alone (a direct kernel must still produce these |
| 100 | +/// bytes, since the sort key *is* the decompressed bytes). |
| 101 | +#[divan::bench] |
| 102 | +fn fsst_decompress_only(bencher: divan::Bencher) { |
| 103 | + let (fsst, total_bytes) = build_fsst(); |
| 104 | + bencher |
| 105 | + .counter(BytesCount::new(total_bytes)) |
| 106 | + .bench_local(|| decompress(&fsst)); |
| 107 | +} |
| 108 | + |
| 109 | +/// Row-encode an already-decompressed `VarBinView`. The writes into the decompressed buffer + |
| 110 | +/// views that precede this step are what a fused direct kernel would eliminate. |
| 111 | +#[divan::bench] |
| 112 | +fn plain_row_encode_only(bencher: divan::Bencher) { |
| 113 | + let (fsst, total_bytes) = build_fsst(); |
| 114 | + let decoded = decompress(&fsst); |
| 115 | + let encoder = RowEncoder::default(); |
| 116 | + bencher.counter(BytesCount::new(total_bytes)).bench_local(|| { |
| 117 | + let mut ctx = LEGACY_SESSION.create_execution_ctx(); |
| 118 | + encoder.encode(std::slice::from_ref(&decoded), &mut ctx).unwrap() |
| 119 | + }); |
| 120 | +} |
0 commit comments