From c5f909b11ad8b2863689eae590a5d964c0fa3f6d Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 13 May 2026 21:12:42 +0000 Subject: [PATCH 1/5] fastlanes: allow signed integers in Delta encoding Lifts the `is_unsigned_int` gate on `DeltaArray` so `i8` / `i16` / `i32` / `i64` columns can be delta-encoded. The upstream FastLanes kernels (`Delta::delta`, `Transpose::transpose`) are bounded on `T: FastLanes: Unsigned`, so signed inputs are processed by reinterpret-casting the underlying buffer to the same-width unsigned counterpart, running the existing kernel, and reinterpret-casting back. `wrapping_sub`/`wrapping_add` are bit-identical for signed and unsigned operands under two's-complement, so the round-trip is exact. Note that the encoded delta bytes for inputs that cross zero have the high bits set (e.g. delta `-1i8` = `0xFF`); naively bit-packing those would force the bit width to `T`. A follow-up should compose `Delta` with `FoR` so the deltas are stored as `value - min(delta)` before bit-packing. See encodings/fastlanes/src/delta/FUSED_DECODE.md for a design note on a fused triple-kernel (unpack + add-reference + undelta) that addresses the decode bandwidth. Also guards `Delta::cast` against signed sources: value-preserving casts of signed deltas (e.g. `-1i8` -> `4294967295u32`) break the wrapping-add invariant during decompression, so signed sources fall back to the decompress-and-reencode path. 
Signed-off-by: Claude --- encodings/fastlanes/src/delta/FUSED_DECODE.md | 156 ++++++++++++++++++ .../src/delta/array/delta_compress.rs | 54 +++++- .../src/delta/array/delta_decompress.rs | 25 ++- encodings/fastlanes/src/delta/array/mod.rs | 19 ++- encodings/fastlanes/src/delta/compute/cast.rs | 6 + encodings/fastlanes/src/delta/vtable/mod.rs | 8 +- 6 files changed, 253 insertions(+), 15 deletions(-) create mode 100644 encodings/fastlanes/src/delta/FUSED_DECODE.md diff --git a/encodings/fastlanes/src/delta/FUSED_DECODE.md b/encodings/fastlanes/src/delta/FUSED_DECODE.md new file mode 100644 index 00000000000..9c461a1ec82 --- /dev/null +++ b/encodings/fastlanes/src/delta/FUSED_DECODE.md @@ -0,0 +1,156 @@ +# Fused Delta + FFoR + BitUnpack decoding — design note + +Status: design only. No production kernel in this branch. + +## Motivation + +With signed-integer support added to `DeltaArray`, the natural compressed shape for +signed columns is: + +``` +DeltaArray { + bases: PrimitiveArray<T> // T ∈ {i8..i64, u8..u64} + deltas: BitPackedArray { encoded: FoRArray { encoded: …, ref: min_d } } +} +``` + +That is: delta-encode → frame-of-reference (subtract `min_d`) → bit-pack. This is the +"DELTA + FFoR + BP" stack recommended by the FastLanes paper and the ADMS '24 +follow-up, and it is the only stack that keeps the bit-packing width small when the +deltas can be negative (a single negative delta in two's complement otherwise +sets every high bit, forcing `W = T`). + +Today, decoding such an array makes **three separate passes** over the packed +buffer (and intermediate buffers): + +1. `BitPackedArray::execute` — unpack `W` bits per element into a full-width + primitive array. +2. `FoRArray::execute` — element-wise `wrapping_add(reference)`. +3. `DeltaArray::execute` → `delta_decompress` → `Delta::undelta` — element-wise + cumulative-sum (`wrapping_add(prev)`) within each lane. + +Each pass reads and writes 1024 × `size_of::<T>()` bytes per chunk. 
For the common +case where `T = i32` and `W = 8`, that is 3 × 4 KiB of bandwidth per chunk to do +work whose minimum information-theoretic cost is one read of 1 KiB (the packed +buffer) and one write of 4 KiB (the output). + +## Upstream building blocks + +`fastlanes` 0.5.0 already ships partial fusions: + +| Kernel | Fuses | Type bound | +|---|---|---| +| `BitPacking::unpack` | unpack only | `Self: FastLanes` (unsigned) | +| `FoR::unfor_pack` | unpack + `wrapping_add(ref)` | `Self: BitPacking` | +| `Delta::undelta_pack` | unpack + lane-cumsum undelta | `Self: BitPacking` | +| `Delta::undelta` | undelta on already-unpacked values | `Self: BitPacking` | + +What is **missing upstream** is a triple-fused kernel: unpack + `wrapping_add(ref)` ++ undelta in a single pass. The two existing fused kernels each pair *one* of the +two reductions with `unpack`; neither pairs both. + +## Proposed kernel + +```rust +/// Triple-fused decode: unpack W-bit values, add a FoR reference, and undo a +/// per-lane delta in one pass. 
+/// +/// `input` — packed buffer of `B = 1024 * W / T` elements of width `T` +/// `base` — `LANES`-element per-lane bases (already in the natural type) +/// `reference` — FoR reference added to every unpacked element before undelta +/// `output` — 1024 reconstructed values, in lane-transposed order +fn undelta_for_pack<const W: usize, const B: usize>( + input: &[Self; B], + base: &[Self; LANES], + reference: Self, + output: &mut [Self; 1024], +); +``` + +Sketch (compare with upstream `Delta::undelta_pack` and `FoR::unfor_pack`): + +```rust +for lane in 0..Self::LANES { + let mut prev = base[lane]; + unpack!(T, W, input, lane, |idx, packed_elem| { + // (1) restore FoR offset, (2) cumulative wrapping-add along the lane + let d = packed_elem.wrapping_add(reference); + let next = d.wrapping_add(prev); + output[idx] = next; + prev = next; + }); +} +``` + +Memory traffic per 1024-chunk: one read of `1024 * W / 8` bytes (packed), one +read of `LANES * size_of::<T>()` bytes (bases), one scalar `reference`, and one +write of `1024 * size_of::<T>()` bytes. For `T = i32`, `W = 8`: 1 KiB read + 128 B +read + 4 KiB write = ~5 KiB total, versus ~13 KiB for the 3-pass path. + +### Type bounds + +The kernel naturally inherits `Self: BitPacking`, which upstream restricts to +unsigned types (`u8`/`u16`/`u32`/`u64`). Signed inputs reuse the kernel via +`FastLanesComparable::Bitpacked` — the same transmute trick used by this branch's +non-fused signed-support change — so a single set of macro instantiations +(`u8`/`u16`/`u32`/`u64`) covers all eight integer types. + +## Where the kernel lives + +Two options, in increasing order of effort: + +1. **Vortex-local kernel** in `encodings/fastlanes/src/delta/undelta_for_pack.rs`, + built with the same `seq_t!` / `pack!` / `unpack!` macros that upstream exports. + Pros: lands in one PR, no upstream churn. Cons: duplicates the lane-iteration + skeleton; future upstream fixes (e.g. patches to the bit-shuffling order) have + to be mirrored. + +2. 
**Upstream `fastlanes` PR** adding `Delta::undelta_for_pack` next to + `Delta::undelta_pack`. Pros: shares the macro skeleton with the existing + fused kernels. Cons: depends on a release and a workspace pin bump. + +Option 1 is the right starting point. If benchmarks show the win we expect, the +kernel can be lifted upstream with a thin wrapper kept locally. + +## Integration into the decode path + +`delta_decompress` currently calls `array.deltas().clone().execute(ctx)?` and +then `Delta::undelta` lane-by-lane. To use the fused kernel: + +1. Inspect the `deltas` child. The fast path applies only when it is exactly + `BitPacked` *or* `FoR(BitPacked)`. +2. For `FoR(BitPacked)`: read the FoR `reference` scalar; read the packed + buffer, bit-width, and patches from the `BitPacked` child; dispatch to + `undelta_for_pack::<W, B>` for each 1024-chunk. +3. For `BitPacked` (no FoR layer): dispatch to upstream `Delta::undelta_pack` + (already exists, no new kernel needed). +4. For anything else (e.g. a generic primitive deltas slot): fall through to + the current non-fused path. +5. Handle patches (the BitPacked layer's exception store) after the fused decode, + the same way `for/array/for_decompress.rs::fused_decompress` does it today. + +The signed-vs-unsigned dispatch is the same `reinterpret_cast` trick used in +this branch: rewrap as the unsigned counterpart, call the fused kernel, rewrap +the output. The bases and the FoR reference participate in the same transmute. 
+ +## Benchmark plan (before committing to the kernel) + +A microbench in `vortex-bench/` over four sorted signed columns of 10M elements: + +| Column shape | Expected `W` | Hypothesis | +|---|---|---| +| `i32` monotone increasing from 0 | small | fused wins, no FoR step does much | +| `i32` monotone increasing from −1e9 | small | fused wins; FoR ref nontrivial | +| `i32` near-monotone with 5 % decreases | small | fused wins by larger margin | +| `i32` random in `[−100, +100]` | medium | fused ≈ 3-pass; bandwidth less dominant | + +Decode throughput on a single core; compare 3-pass vs proposed fused kernel. +Worth landing if the fused path is ≥ 1.5× on the first three rows. + +## Out of scope + +- The encoding side: `delta_compress` already runs in one pass; FoR + bit-pack + on the produced deltas is a separate sequential composition that is already + fused well enough by the existing FoR and BitPacked encoders. +- A symmetric `delta_for_pack` (fused encode) — only worth doing once the + decode-side wins are confirmed. 
diff --git a/encodings/fastlanes/src/delta/array/delta_compress.rs b/encodings/fastlanes/src/delta/array/delta_compress.rs index d51cef72b49..0a2f6f5fac6 100644 --- a/encodings/fastlanes/src/delta/array/delta_compress.rs +++ b/encodings/fastlanes/src/delta/array/delta_compress.rs @@ -9,6 +9,7 @@ use fastlanes::FastLanes; use fastlanes::Transpose; use vortex_array::ExecutionCtx; use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::primitive::PrimitiveArrayExt; use vortex_array::dtype::NativePType; use vortex_array::match_each_unsigned_integer_ptype; use vortex_buffer::Buffer; @@ -17,27 +18,46 @@ use vortex_error::VortexResult; use crate::FL_CHUNK_SIZE; use crate::bit_transpose::transpose_validity; +use crate::delta::array::unsigned_counterpart; use crate::fill_forward_nulls; pub fn delta_compress( array: &PrimitiveArray, ctx: &mut ExecutionCtx, ) -> VortexResult<(PrimitiveArray, PrimitiveArray)> { let validity = array.validity()?; - let (bases, deltas) = match_each_unsigned_integer_ptype!(array.ptype(), |T| { + let original_ptype = array.ptype(); + let unsigned_ptype = unsigned_counterpart(original_ptype); + // Signed integers are processed through their unsigned counterpart: `wrapping_sub` + // is bit-identical for signed and unsigned operands, so the encoded bytes are the + // same regardless of how the buffer's elements are interpreted. + let work = if original_ptype == unsigned_ptype { + array.clone() + } else { + array.reinterpret_cast(unsigned_ptype) + }; + + let (bases, deltas) = match_each_unsigned_integer_ptype!(work.ptype(), |T| { // Fill-forward null values so that transposed deltas at null positions remain // small. Without this, bitpacking may skip patches for null positions, and the // corrupted delta values propagate through the cumulative sum during decompression. 
- let filled = fill_forward_nulls(array.to_buffer::(), &validity, ctx)?; + let filled = fill_forward_nulls(work.to_buffer::(), &validity, ctx)?; let (bases, deltas) = compress_primitive::(&filled); // TODO(robert): This can be avoided if we add TransposedBoolArray that performs index translation when necessary. let validity = transpose_validity(&validity, ctx)?; ( - PrimitiveArray::new(bases, array.dtype().nullability().into()), + PrimitiveArray::new(bases, work.dtype().nullability().into()), PrimitiveArray::new(deltas, validity), ) }); - Ok((bases, deltas)) + Ok(if original_ptype == unsigned_ptype { + (bases, deltas) + } else { + ( + bases.reinterpret_cast(original_ptype), + deltas.reinterpret_cast(original_ptype), + ) + }) } fn compress_primitive(array: &[T]) -> (Buffer, Buffer) @@ -113,11 +133,31 @@ mod tests { LazyLock::new(|| VortexSession::empty().with::()); #[rstest] - #[case((0u32..10_000).collect())] - #[case((0..10_000).map(|i| (i % (u8::MAX as i32)) as u8).collect())] - #[case(PrimitiveArray::from_option_iter( + #[case::u32((0u32..10_000).collect())] + #[case::u8((0..10_000).map(|i| (i % (u8::MAX as i32)) as u8).collect())] + #[case::nullable_u32(PrimitiveArray::from_option_iter( (0u32..10_000).map(|i| (i % 2 == 0).then_some(i)), ))] + // Signed inputs that stay non-negative: encoded deltas are identical to the u32 case + // bit-for-bit, but the buffer's dtype carries the signedness through round-trip. + #[case::i32_non_negative((0i32..10_000).collect())] + // Signed inputs crossing zero: deltas alternate in sign, which under wrapping_sub + // populates the high bits of negative deltas. Bit-packing without preprocessing + // would explode here, but round-tripping the raw delta buffer is still correct. + #[case::i32_crossing_zero((-5_000i32..5_000).collect())] + // All-negative signed values. 
+ #[case::i32_all_negative((-10_000i32..0).collect())] + // i8 across the full type range: tests T::MIN / T::MAX boundaries and the + // remainder-padded chunk path (256 < FL_CHUNK_SIZE). + #[case::i8_full_range((i8::MIN..=i8::MAX).collect())] + // i16 crossing zero. + #[case::i16_crossing_zero((-2_000i16..2_000).collect())] + // i64 with large negative offset. + #[case::i64_large_negative((0i64..5_000).map(|i| i - 1_000_000_000_000).collect())] + // Nullable signed array with values around zero. + #[case::nullable_i32_crossing(PrimitiveArray::from_option_iter( + (-2_000i32..2_000).map(|i| (i % 3 != 0).then_some(i)), + ))] fn test_compress(#[case] array: PrimitiveArray) -> VortexResult<()> { let delta = Delta::try_from_primitive_array(&array, &mut SESSION.create_execution_ctx())?; assert_eq!(delta.len(), array.len()); diff --git a/encodings/fastlanes/src/delta/array/delta_decompress.rs b/encodings/fastlanes/src/delta/array/delta_decompress.rs index d6404dfb905..5b32434e9b6 100644 --- a/encodings/fastlanes/src/delta/array/delta_decompress.rs +++ b/encodings/fastlanes/src/delta/array/delta_decompress.rs @@ -10,6 +10,7 @@ use fastlanes::Transpose; use itertools::Itertools; use vortex_array::ExecutionCtx; use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::primitive::PrimitiveArrayExt; use vortex_array::dtype::NativePType; use vortex_array::match_each_unsigned_integer_ptype; use vortex_buffer::Buffer; @@ -19,6 +20,7 @@ use vortex_error::VortexResult; use crate::DeltaArray; use crate::bit_transpose::untranspose_validity; use crate::delta::array::DeltaArrayExt; +use crate::delta::array::unsigned_counterpart; pub fn delta_decompress( array: &DeltaArray, @@ -33,14 +35,33 @@ pub fn delta_decompress( let validity = untranspose_validity(&deltas.validity()?, ctx)?; let validity = validity.slice(start..end)?; - Ok(match_each_unsigned_integer_ptype!(deltas.ptype(), |T| { + let original_ptype = deltas.ptype(); + let unsigned_ptype = 
unsigned_counterpart(original_ptype); + // Signed inputs are processed through their unsigned counterpart; `wrapping_add` on the + // raw bytes inverts the `wrapping_sub` done at compress time regardless of signedness. + let (bases, deltas) = if original_ptype == unsigned_ptype { + (bases, deltas) + } else { + ( + bases.reinterpret_cast(unsigned_ptype), + deltas.reinterpret_cast(unsigned_ptype), + ) + }; + + let decoded = match_each_unsigned_integer_ptype!(deltas.ptype(), |T| { const LANES: usize = T::LANES; let buffer = decompress_primitive::(bases.as_slice(), deltas.as_slice()); let buffer = buffer.slice(start..end); PrimitiveArray::new(buffer, validity) - })) + }); + + Ok(if original_ptype == unsigned_ptype { + decoded + } else { + decoded.reinterpret_cast(original_ptype) + }) } /// Performs the low-level delta decompression on primitive values. diff --git a/encodings/fastlanes/src/delta/array/mod.rs b/encodings/fastlanes/src/delta/array/mod.rs index 4e96edc59ca..54313f53c93 100644 --- a/encodings/fastlanes/src/delta/array/mod.rs +++ b/encodings/fastlanes/src/delta/array/mod.rs @@ -101,5 +101,22 @@ impl DeltaData { } pub(crate) fn lane_count(ptype: PType) -> usize { - match_each_unsigned_integer_ptype!(ptype, |T| { T::LANES }) + match_each_unsigned_integer_ptype!(unsigned_counterpart(ptype), |T| { T::LANES }) +} + +/// Map a signed integer [`PType`] to its same-width unsigned counterpart; other [`PType`]s +/// pass through unchanged. +/// +/// The FastLanes kernels (`Delta::delta`, `Transpose::transpose`, ...) are only implemented +/// for unsigned integer types. Signed inputs are processed by viewing the same bytes through +/// the unsigned counterpart; `wrapping_sub` / `wrapping_add` are bit-identical for signed and +/// unsigned operands under two's-complement, so the round-trip is exact. 
+pub(crate) fn unsigned_counterpart(ptype: PType) -> PType { + match ptype { + PType::I8 => PType::U8, + PType::I16 => PType::U16, + PType::I32 => PType::U32, + PType::I64 => PType::U64, + other => other, + } } diff --git a/encodings/fastlanes/src/delta/compute/cast.rs b/encodings/fastlanes/src/delta/compute/cast.rs index 324e2fc9c22..e50c27f1a57 100644 --- a/encodings/fastlanes/src/delta/compute/cast.rs +++ b/encodings/fastlanes/src/delta/compute/cast.rs @@ -24,6 +24,12 @@ impl CastReduce for Delta { if target_ptype.is_signed_int() || source_ptype.bit_width() > target_ptype.bit_width() { return Ok(None); } + // Signed deltas widened by per-element value-preserving cast (e.g. -1i8 -> 4294967295u32) + // break the wrapping-add invariant: zero-extending the delta bytes would preserve it, + // sign-extending the deltas does not. Fall back to full decompress + re-encode. + if source_ptype.is_signed_int() { + return Ok(None); + } // Cast both bases and deltas to the target type let casted_bases = array.bases().cast(dtype.with_nullability(NonNullable))?; diff --git a/encodings/fastlanes/src/delta/vtable/mod.rs b/encodings/fastlanes/src/delta/vtable/mod.rs index 96b6ec27e0c..b5e68791ceb 100644 --- a/encodings/fastlanes/src/delta/vtable/mod.rs +++ b/encodings/fastlanes/src/delta/vtable/mod.rs @@ -4,7 +4,6 @@ use std::hash::Hash; use std::hash::Hasher; -use fastlanes::FastLanes; use prost::Message; use vortex_array::Array; use vortex_array::ArrayEq; @@ -21,7 +20,6 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::buffer::BufferHandle; use vortex_array::dtype::DType; use vortex_array::dtype::PType; -use vortex_array::match_each_unsigned_integer_ptype; use vortex_array::serde::ArrayChildren; use vortex_array::smallvec::smallvec; use vortex_array::vtable::VTable; @@ -156,7 +154,7 @@ impl VTable for Delta { ); let metadata = DeltaMetadata::decode(metadata)?; let ptype = PType::try_from(dtype)?; - let lanes = match_each_unsigned_integer_ptype!(ptype, |T| { ::LANES 
}); + let lanes = lane_count(ptype); // Compute the length of the bases array let deltas_len = usize::try_from(metadata.deltas_len) @@ -227,8 +225,8 @@ fn validate_parts( ); vortex_ensure!( - bases.dtype().is_unsigned_int(), - "DeltaArray: dtype must be an unsigned integer, got {}", + bases.dtype().is_int(), + "DeltaArray: dtype must be an integer, got {}", bases.dtype() ); From e80a13416e98993cc2aa811cbcfb7d6a2fc1de49 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 13 May 2026 21:24:13 +0000 Subject: [PATCH 2/5] fastlanes: add synthetic delta compression workload test Measures the encoded byte budget under three bit-packing strategies for four representative signed `i32` shapes (monotone, sensor-like wobble around zero, large-negative offset, near-monotone with backtracks): | Workload | range | Wnaive | Wffor | Wzz | ratio | |-----------------------------------|----------------|-------:|------:|----:|--------:| | monotone i32 (0..N) | [0, 1] | 1 | 1 | 2 | 15.97x | | sensor i32 in [-100, 100] | [-196, 199] | 32 | 9 | 9 | 3.20x | | offset i32 base=-1e9 | [0, 1] | 1 | 1 | 2 | 15.97x | | near-monotone i32 (5% backtrack) | [-2, 1] | 32 | 2 | 3 | 10.65x | The "naive" column is the OR-mask of the raw delta bit-patterns: a single negative delta sets every high bit and forces `W = T`, which is why the two workloads with negative deltas (`sensor`, `near-monotone`) blow up to 32 bits. FFoR brings them to 9 and 2 bits. ZigZag matches FFoR only on the symmetric `sensor` workload and loses on every asymmetric column. Asserts that FFoR never exceeds naive, drops below `T` whenever a negative delta is present, and beats ZigZag on the asymmetric workloads. Run with `--nocapture` to see the table. 
Signed-off-by: Claude --- .../src/delta/array/delta_compress.rs | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/encodings/fastlanes/src/delta/array/delta_compress.rs b/encodings/fastlanes/src/delta/array/delta_compress.rs index 0a2f6f5fac6..5e33777b73b 100644 --- a/encodings/fastlanes/src/delta/array/delta_compress.rs +++ b/encodings/fastlanes/src/delta/array/delta_compress.rs @@ -191,4 +191,105 @@ mod tests { assert_arrays_eq!(packed_delta_prim, array); Ok(()) } + + /// Measures compression of delta-encoded signed columns under three bit-packing strategies: + /// * `naive`: bit-packing the raw delta bytes (every negative delta sets the high bits, + /// so the OR mask forces `W = T`). + /// * `FFoR`: subtracting the per-column `min(delta)` before bit-packing + /// (`W = ceil(log2(max - min + 1))`). + /// * `zigzag`: `(n << 1) ^ (n >> 31)` before bit-packing + /// (`W = 1 + ceil(log2(max(|min|, |max|)))`). + /// + /// Asserts that FFoR beats or ties naive on every workload and beats zigzag on the + /// asymmetric workloads. Run with `--nocapture` to see the full table. + #[test] + fn synthetic_workload_compression() -> VortexResult<()> { + let mut ctx = SESSION.create_execution_ctx(); + const N: usize = 8 * 1024; // 8 full FastLanes chunks per workload + + let monotone: Vec = (0..N as i32).collect(); + // Deterministic LCG so the test is reproducible. 
+ let mut lcg = 0u32; + let mut next = || { + lcg = lcg.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + (lcg >> 16) as i32 + }; + let sensor: Vec = (0..N).map(|_| (next() % 201) - 100).collect(); + let offset: Vec = (0..N as i32).map(|i| -1_000_000_000 + i).collect(); + let mut lcg2 = 0u32; + let mut prev = 0i32; + let near_monotone: Vec = (0..N) + .map(|_| { + lcg2 = lcg2.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + let step = if (lcg2 >> 24) < 13 { -2 } else { 1 }; // ~5% backtrack + prev = prev.wrapping_add(step); + prev + }) + .collect(); + let workloads = [ + ("monotone i32 (0..N)", monotone), + ("sensor i32 in [-100, 100]", sensor), + ("offset i32 base=-1e9", offset), + ("near-monotone i32 (5% backtrack)", near_monotone), + ]; + + println!(); + println!( + "{:<36} {:>10} {:>14} {:>5} {:>5} {:>5} {:>10} {:>7}", + "workload", "raw (B)", "Δ range", "Wnaive", "Wffor", "Wzig", "FFoR (B)", "ratio" + ); + println!("{}", "-".repeat(96)); + + for (name, values) in workloads { + let raw_bytes = values.len() * size_of::(); + let array = PrimitiveArray::from_iter(values); + let (bases, deltas) = delta_compress(&array, &mut ctx)?; + let deltas_buf: &[i32] = deltas.as_slice(); + let bases_buf: &[i32] = bases.as_slice(); + + let min_d = *deltas_buf.iter().min().unwrap(); + let max_d = *deltas_buf.iter().max().unwrap(); + + // Naive width = OR of raw u32 bit-patterns of every delta. Any negative delta + // sets the high bits and forces W = 32. + let or: u32 = deltas_buf.iter().fold(0u32, |a, &d| a | (d as u32)); + let naive_w = if or == 0 { 0 } else { 32 - or.leading_zeros() as usize }; + + // FFoR width = ceil(log2(span)) where span = (max - min + 1). + let span = (max_d as i64 - min_d as i64) as u64 + 1; + let ffor_w = if span <= 1 { 0 } else { 64 - (span - 1).leading_zeros() as usize }; + + // ZigZag width = 1 + ceil(log2(max(|min|, |max|))) for any nonzero delta. 
+ let zz_mag = (min_d.unsigned_abs()).max(max_d.unsigned_abs()); + let zz_w = if zz_mag == 0 { 0 } else { 1 + (32 - zz_mag.leading_zeros() as usize) }; + + // FFoR encoded byte size: bases (already unpacked) + ref + ceil(packed bits / 8). + let bases_bytes = bases_buf.len() * size_of::(); + let ref_bytes = size_of::(); + let packed_bits = deltas_buf.len() * ffor_w; + let ffor_packed_bytes = packed_bits.div_ceil(8); + let ffor_total = bases_bytes + ref_bytes + ffor_packed_bytes; + let ratio = raw_bytes as f64 / ffor_total as f64; + + println!( + "{name:<36} {raw_bytes:>10} {:>14} {naive_w:>5} {ffor_w:>5} {zz_w:>5} {ffor_total:>10} {ratio:>6.2}x", + format!("[{min_d}, {max_d}]") + ); + + // Sanity assertions. naive_w is 32 (or near it) for any delta sequence that + // contains a negative value; FFoR/ZigZag width must be strictly smaller for these + // workloads. + assert!(ffor_w <= naive_w.max(1), "FFoR must never exceed naive for {name}"); + if min_d < 0 { + assert_eq!(naive_w, 32, "any negative delta forces naive W to 32 for {name}"); + assert!(ffor_w < 32, "FFoR must compress below T for {name}"); + } + // On the asymmetric workloads (offset, near-monotone) FFoR must beat ZigZag. + if min_d > 0 || max_d < 0 { + assert!(ffor_w < zz_w, "FFoR should beat ZigZag on asymmetric {name}"); + } + } + + Ok(()) + } } From 4cee837e1deaecd68ec1c95b41805cc5a037b2db Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 13 May 2026 21:38:59 +0000 Subject: [PATCH 3/5] fastlanes: measure bases compressibility in delta workload test Extends the synthetic workload report with two extra columns: bases byte size and the FFoR bit-width those bases would pack to. 
For 8K-element i32 inputs the bases buffer is ~50% of the FFoR total on monotone-like columns, and the bases sequence inherits the smoothness of the input, so recursively packing the bases with FoR gives a further ~1.4x on top of FFoR(deltas): workload FFoR (B) ratio bases (B) Wb +bcomp ratio monotone i32 (0..N) 2052 15.97x 1024 13 1448 22.63x sensor i32 in [-100, 100] 10244 3.20x 1024 8 9480 3.46x offset i32 base=-1e9 2052 15.97x 1024 13 1448 22.63x near-monotone i32 (5% backtrack) 3076 10.65x 1024 13 2472 13.26x This is already structurally enabled: the bases child is an `ArrayRef`, and the btrblocks compressor at vortex-btrblocks/src/schemes/integer.rs:917 already routes bases through `compress_child` so the cascading compressor picks whatever encoding fits (typically FoR + BitPacked). Signed-off-by: Claude --- .../src/delta/array/delta_compress.rs | 40 ++++++++++++++++--- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/encodings/fastlanes/src/delta/array/delta_compress.rs b/encodings/fastlanes/src/delta/array/delta_compress.rs index 5e33777b73b..7ed7dd5ae44 100644 --- a/encodings/fastlanes/src/delta/array/delta_compress.rs +++ b/encodings/fastlanes/src/delta/array/delta_compress.rs @@ -235,10 +235,21 @@ mod tests { println!(); println!( - "{:<36} {:>10} {:>14} {:>5} {:>5} {:>5} {:>10} {:>7}", - "workload", "raw (B)", "Δ range", "Wnaive", "Wffor", "Wzig", "FFoR (B)", "ratio" + "{:<36} {:>10} {:>14} {:>5} {:>5} {:>5} {:>10} {:>7} {:>10} {:>5} {:>10} {:>7}", + "workload", + "raw (B)", + "Δ range", + "Wnaive", + "Wffor", + "Wzig", + "FFoR (B)", + "ratio", + "bases (B)", + "Wb", + "+bcomp (B)", + "ratio", ); - println!("{}", "-".repeat(96)); + println!("{}", "-".repeat(140)); for (name, values) in workloads { let raw_bytes = values.len() * size_of::(); @@ -271,9 +282,23 @@ mod tests { let ffor_total = bases_bytes + ref_bytes + ffor_packed_bytes; let ratio = raw_bytes as f64 / ffor_total as f64; + // Bases compressibility: what we save if the bases 
child is recursively + // delta-encoded or FoR-encoded. The bases are the "first row of the transposed + // chunk" per lane, so they form a sub-sequence that inherits the smoothness of + // the input. We approximate with FFoR over the bases alone (no recursive Delta, + // which would force padding to 1024 elements per FastLanes chunk and could lose + // for short base sequences). + let min_b = *bases_buf.iter().min().unwrap(); + let max_b = *bases_buf.iter().max().unwrap(); + let bspan = (max_b as i64 - min_b as i64) as u64 + 1; + let bases_w = if bspan <= 1 { 0 } else { 64 - (bspan - 1).leading_zeros() as usize }; + let bases_compressed = (bases_buf.len() * bases_w).div_ceil(8) + ref_bytes; + let total_with_bcomp = bases_compressed + ref_bytes + ffor_packed_bytes; + let ratio_with_bcomp = raw_bytes as f64 / total_with_bcomp as f64; + println!( - "{name:<36} {raw_bytes:>10} {:>14} {naive_w:>5} {ffor_w:>5} {zz_w:>5} {ffor_total:>10} {ratio:>6.2}x", - format!("[{min_d}, {max_d}]") + "{name:<36} {raw_bytes:>10} {:>14} {naive_w:>5} {ffor_w:>5} {zz_w:>5} {ffor_total:>10} {ratio:>6.2}x {bases_bytes:>10} {bases_w:>5} {total_with_bcomp:>10} {ratio_with_bcomp:>6.2}x", + format!("[{min_d}, {max_d}]"), ); // Sanity assertions. naive_w is 32 (or near it) for any delta sequence that @@ -288,6 +313,11 @@ mod tests { if min_d > 0 || max_d < 0 { assert!(ffor_w < zz_w, "FFoR should beat ZigZag on asymmetric {name}"); } + // Sorted inputs => the bases inherit smoothness => the bases bit-width should be + // far smaller than `T` for sorted columns. 
+ if name.starts_with("monotone") || name.starts_with("offset") { + assert!(bases_w < 16, "sorted bases should pack below 16 bits for {name}"); + } } Ok(()) From 1a5c6393739236fa7334c431a2b559ca6ebd1c6a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 13 May 2026 21:44:53 +0000 Subject: [PATCH 4/5] fastlanes: add SPDX headers to FUSED_DECODE.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit REUSE compliance — markdown files outside the patterns in REUSE.toml need inline SPDX comments. Signed-off-by: Claude --- encodings/fastlanes/src/delta/FUSED_DECODE.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/encodings/fastlanes/src/delta/FUSED_DECODE.md b/encodings/fastlanes/src/delta/FUSED_DECODE.md index 9c461a1ec82..b9bc92015f5 100644 --- a/encodings/fastlanes/src/delta/FUSED_DECODE.md +++ b/encodings/fastlanes/src/delta/FUSED_DECODE.md @@ -1,3 +1,6 @@ + + + # Fused Delta + FFoR + BitUnpack decoding — design note Status: design only. No production kernel in this branch. From 48d5602d58712da28505f07b8c93ca1a8436ac8e Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 13 May 2026 21:58:02 +0000 Subject: [PATCH 5/5] fastlanes: polish signed Delta PR Pre-merge polish across the three things a reviewer would notice: * DeltaArray docstring: add a signed `i32` example next to the unsigned one so users see signed support is first-class. Verified by doctest. * Conformance: extend `test_delta_consistency` and `test_delta_binary_numeric` with i32 / i64 / i8 cases (crossing zero, all-negative, single-negative). These run the array-trait conformance harness, so any operation that's silently broken for signed inputs surfaces here. * cast.rs: expand the comment justifying why signed sources fall back to decompress-and-re-encode (the wrapping-add invariant breaks under value-preserving widening; the same hazard applies to cross-signedness). 
* synthetic_workload_compression table: rename duplicate "ratio" columns to `FFoR x` / `+bcomp x` so the report is unambiguous. 256 -> 263 tests, all pass. Clippy clean. Fmt clean. Signed-off-by: Claude --- .../src/delta/array/delta_compress.rs | 56 ++++++++++++++----- encodings/fastlanes/src/delta/array/mod.rs | 15 +++++ encodings/fastlanes/src/delta/compute/cast.rs | 10 +++- .../fastlanes/src/delta/vtable/operations.rs | 7 +++ 4 files changed, 71 insertions(+), 17 deletions(-) diff --git a/encodings/fastlanes/src/delta/array/delta_compress.rs b/encodings/fastlanes/src/delta/array/delta_compress.rs index 7ed7dd5ae44..daa40e9b6b8 100644 --- a/encodings/fastlanes/src/delta/array/delta_compress.rs +++ b/encodings/fastlanes/src/delta/array/delta_compress.rs @@ -235,7 +235,7 @@ mod tests { println!(); println!( - "{:<36} {:>10} {:>14} {:>5} {:>5} {:>5} {:>10} {:>7} {:>10} {:>5} {:>10} {:>7}", + "{:<36} {:>10} {:>14} {:>5} {:>5} {:>5} {:>10} {:>9} {:>10} {:>5} {:>10} {:>9}", "workload", "raw (B)", "Δ range", @@ -243,16 +243,16 @@ mod tests { "Wffor", "Wzig", "FFoR (B)", - "ratio", + "FFoR x", "bases (B)", "Wb", "+bcomp (B)", - "ratio", + "+bcomp x", ); println!("{}", "-".repeat(140)); for (name, values) in workloads { - let raw_bytes = values.len() * size_of::(); + let raw_bytes = size_of_val(values.as_slice()); let array = PrimitiveArray::from_iter(values); let (bases, deltas) = delta_compress(&array, &mut ctx)?; let deltas_buf: &[i32] = deltas.as_slice(); @@ -264,18 +264,30 @@ mod tests { // Naive width = OR of raw u32 bit-patterns of every delta. Any negative delta // sets the high bits and forces W = 32. let or: u32 = deltas_buf.iter().fold(0u32, |a, &d| a | (d as u32)); - let naive_w = if or == 0 { 0 } else { 32 - or.leading_zeros() as usize }; + let naive_w = if or == 0 { + 0 + } else { + 32 - or.leading_zeros() as usize + }; // FFoR width = ceil(log2(span)) where span = (max - min + 1). 
let span = (max_d as i64 - min_d as i64) as u64 + 1; - let ffor_w = if span <= 1 { 0 } else { 64 - (span - 1).leading_zeros() as usize }; + let ffor_w = if span <= 1 { + 0 + } else { + 64 - (span - 1).leading_zeros() as usize + }; // ZigZag width = 1 + ceil(log2(max(|min|, |max|))) for any nonzero delta. let zz_mag = (min_d.unsigned_abs()).max(max_d.unsigned_abs()); - let zz_w = if zz_mag == 0 { 0 } else { 1 + (32 - zz_mag.leading_zeros() as usize) }; + let zz_w = if zz_mag == 0 { + 0 + } else { + 1 + (32 - zz_mag.leading_zeros() as usize) + }; // FFoR encoded byte size: bases (already unpacked) + ref + ceil(packed bits / 8). - let bases_bytes = bases_buf.len() * size_of::<i32>(); + let bases_bytes = size_of_val(bases_buf); let ref_bytes = size_of::<i32>(); let packed_bits = deltas_buf.len() * ffor_w; let ffor_packed_bytes = packed_bits.div_ceil(8); @@ -291,32 +303,48 @@ mod tests { let min_b = *bases_buf.iter().min().unwrap(); let max_b = *bases_buf.iter().max().unwrap(); let bspan = (max_b as i64 - min_b as i64) as u64 + 1; - let bases_w = if bspan <= 1 { 0 } else { 64 - (bspan - 1).leading_zeros() as usize }; + let bases_w = if bspan <= 1 { + 0 + } else { + 64 - (bspan - 1).leading_zeros() as usize + }; let bases_compressed = (bases_buf.len() * bases_w).div_ceil(8) + ref_bytes; let total_with_bcomp = bases_compressed + ref_bytes + ffor_packed_bytes; let ratio_with_bcomp = raw_bytes as f64 / total_with_bcomp as f64; println!( - "{name:<36} {raw_bytes:>10} {:>14} {naive_w:>5} {ffor_w:>5} {zz_w:>5} {ffor_total:>10} {ratio:>6.2}x {bases_bytes:>10} {bases_w:>5} {total_with_bcomp:>10} {ratio_with_bcomp:>6.2}x", + "{name:<36} {raw_bytes:>10} {:>14} {naive_w:>5} {ffor_w:>5} {zz_w:>5} {ffor_total:>10} {ratio:>8.2}x {bases_bytes:>10} {bases_w:>5} {total_with_bcomp:>10} {ratio_with_bcomp:>8.2}x", format!("[{min_d}, {max_d}]"), ); // Sanity assertions.
naive_w is 32 (or near it) for any delta sequence that // contains a negative value; FFoR/ZigZag width must be strictly smaller for these // workloads. - assert!(ffor_w <= naive_w.max(1), "FFoR must never exceed naive for {name}"); + assert!( + ffor_w <= naive_w.max(1), + "FFoR must never exceed naive for {name}" + ); if min_d < 0 { - assert_eq!(naive_w, 32, "any negative delta forces naive W to 32 for {name}"); + assert_eq!( + naive_w, 32, + "any negative delta forces naive W to 32 for {name}" + ); assert!(ffor_w < 32, "FFoR must compress below T for {name}"); } // On the asymmetric workloads (offset, near-monotone) FFoR must beat ZigZag. if min_d > 0 || max_d < 0 { - assert!(ffor_w < zz_w, "FFoR should beat ZigZag on asymmetric {name}"); + assert!( + ffor_w < zz_w, + "FFoR should beat ZigZag on asymmetric {name}" + ); } // Sorted inputs => the bases inherit smoothness => the bases bit-width should be // far smaller than `T` for sorted columns. if name.starts_with("monotone") || name.starts_with("offset") { - assert!(bases_w < 16, "sorted bases should pack below 16 bits for {name}"); + assert!( + bases_w < 16, + "sorted bases should pack below 16 bits for {name}" + ); } } diff --git a/encodings/fastlanes/src/delta/array/mod.rs b/encodings/fastlanes/src/delta/array/mod.rs index 54313f53c93..6db40ddbd31 100644 --- a/encodings/fastlanes/src/delta/array/mod.rs +++ b/encodings/fastlanes/src/delta/array/mod.rs @@ -43,6 +43,21 @@ pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["bases", "deltas"]; /// let array = Delta::try_from_primitive_array(&primitive, &mut session.create_execution_ctx()).unwrap(); /// ``` /// +/// Signed inputs are also supported; deltas across negative values are encoded by +/// `wrapping_sub` and recovered by `wrapping_add` at decompress time: +/// +/// ``` +/// use vortex_array::arrays::PrimitiveArray; +/// use vortex_array::VortexSessionExecute; +/// use vortex_array::session::ArraySession; +/// use vortex_session::VortexSession; +/// use 
vortex_fastlanes::Delta; +/// +/// let session = VortexSession::empty().with::<ArraySession>(); +/// let primitive = PrimitiveArray::from_iter([-3_i32, -2, -1, 0, 1, 2]); +/// let array = Delta::try_from_primitive_array(&primitive, &mut session.create_execution_ctx()).unwrap(); +/// ``` +/// /// # Details /// /// To facilitate slicing, this array accepts an `offset` and `logical_len`. The offset must be diff --git a/encodings/fastlanes/src/delta/compute/cast.rs b/encodings/fastlanes/src/delta/compute/cast.rs index e50c27f1a57..43a247df9f0 100644 --- a/encodings/fastlanes/src/delta/compute/cast.rs +++ b/encodings/fastlanes/src/delta/compute/cast.rs @@ -24,9 +24,13 @@ impl CastReduce for Delta { if target_ptype.is_signed_int() || source_ptype.bit_width() > target_ptype.bit_width() { return Ok(None); } - // Signed deltas widened by per-element value-preserving cast (e.g. -1i8 -> 4294967295u32) - // break the wrapping-add invariant: zero-extending the delta bytes would preserve it, - // sign-extending the deltas does not. Fall back to full decompress + re-encode. + // Signed sources need a different cast policy than the lossless widening cast + // used here. The delta bytes are stored as the result of `wrapping_sub`, so e.g. + // a delta of -1i8 has the bit pattern 0xFF. Widening *as a value* (the cast op's + // semantics) sign-extends that to 0xFFFFFFFF, which means `wrapping_add(base, delta)` + // at the wider type produces a different result than at the source type — round-trip + // breaks. Cross-signedness widening has the same hazard for the same reason. Fall + // back to decompress-and-re-encode for both cases.
if source_ptype.is_signed_int() { return Ok(None); } diff --git a/encodings/fastlanes/src/delta/vtable/operations.rs b/encodings/fastlanes/src/delta/vtable/operations.rs index 943f11379ac..785cdf6aca8 100644 --- a/encodings/fastlanes/src/delta/vtable/operations.rs +++ b/encodings/fastlanes/src/delta/vtable/operations.rs @@ -248,6 +248,11 @@ mod tests { #[case::delta_large_u64((0u64..2048).collect())] // Single element #[case::delta_single(PrimitiveArray::new(buffer![42u32], Validity::NonNullable))] + // Signed inputs (added with signed-delta support). + #[case::delta_i32_crossing_zero((-100i32..100).collect())] + #[case::delta_i64_negative((0i64..100).map(|i| -i * 10).collect())] + #[case::delta_large_i32((-1024i32..1024).collect())] + #[case::delta_single_negative(PrimitiveArray::new(buffer![-42i32], Validity::NonNullable))] fn test_delta_consistency(#[case] array: PrimitiveArray) { test_array_consistency(&da(&array).into_array()); } @@ -258,6 +263,8 @@ mod tests { #[case::delta_u32_basic(PrimitiveArray::new(buffer![1u32, 1, 1, 1, 1], Validity::NonNullable))] #[case::delta_u64_basic(PrimitiveArray::new(buffer![1u64, 1, 1, 1, 1], Validity::NonNullable))] #[case::delta_u32_large(PrimitiveArray::new(buffer![1u32; 100], Validity::NonNullable))] + #[case::delta_i8_basic(PrimitiveArray::new(buffer![-1i8, -1, -1, -1, -1], Validity::NonNullable))] + #[case::delta_i32_basic(PrimitiveArray::new(buffer![-1i32, -1, -1, -1, -1], Validity::NonNullable))] fn test_delta_binary_numeric(#[case] array: PrimitiveArray) { test_binary_numeric_array(da(&array).into_array()); }