From c5f909b11ad8b2863689eae590a5d964c0fa3f6d Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 13 May 2026 21:12:42 +0000
Subject: [PATCH 1/5] fastlanes: allow signed integers in Delta encoding

Lifts the `is_unsigned_int` gate on `DeltaArray` so `i8` / `i16` / `i32` / `i64`
columns can be delta-encoded. The upstream FastLanes kernels (`Delta::delta`,
`Transpose::transpose`) are bounded on `T: FastLanes: Unsigned`, so signed
inputs are processed by reinterpret-casting the underlying buffer to the
same-width unsigned counterpart, running the existing kernel, and
reinterpret-casting back. `wrapping_sub`/`wrapping_add` are bit-identical for
signed and unsigned operands under two's-complement, so the round-trip is
exact.

Note that the encoded delta bytes for inputs that cross zero have the high
bits set (e.g. delta `-1i8` = `0xFF`); naively bit-packing those would force
the bit width to `T`. A follow-up should compose `Delta` with `FoR` so the
deltas are stored as `value - min(delta)` before bit-packing. See
encodings/fastlanes/src/delta/FUSED_DECODE.md for a design note on a fused
triple-kernel (unpack + add-reference + undelta) that addresses the decode
bandwidth.

Also guards `Delta::cast` against signed sources: value-preserving casts of
signed deltas (e.g. `-1i8` -> `4294967295u32`) break the wrapping-add
invariant during decompression, so signed sources fall back to the
decompress-and-reencode path.

Signed-off-by: Claude <noreply@anthropic.com>
---
 encodings/fastlanes/src/delta/FUSED_DECODE.md | 156 ++++++++++++++++++
 .../src/delta/array/delta_compress.rs         |  54 +++++-
 .../src/delta/array/delta_decompress.rs       |  25 ++-
 encodings/fastlanes/src/delta/array/mod.rs    |  19 ++-
 encodings/fastlanes/src/delta/compute/cast.rs |   6 +
 encodings/fastlanes/src/delta/vtable/mod.rs   |   8 +-
 6 files changed, 253 insertions(+), 15 deletions(-)
 create mode 100644 encodings/fastlanes/src/delta/FUSED_DECODE.md
diff --git a/encodings/fastlanes/src/delta/FUSED_DECODE.md b/encodings/fastlanes/src/delta/FUSED_DECODE.md
new file mode 100644
index 00000000000..9c461a1ec82
--- /dev/null
+++ b/encodings/fastlanes/src/delta/FUSED_DECODE.md
@@ -0,0 +1,156 @@
+# Fused Delta + FFoR + BitUnpack decoding — design note
+
+Status: design only. No production kernel in this branch.
+
+## Motivation
+
+With signed-integer support added to `DeltaArray`, the natural compressed shape for
+signed columns is:
+
+```
+DeltaArray {
+    bases:  PrimitiveArray<T>                   // T ∈ {i8..i64, u8..u64}
+    deltas: BitPackedArray { encoded: FoRArray { encoded: …, ref: min_d } }
+}
+```
+
+That is: delta-encode → frame-of-reference (subtract `min_d`) → bit-pack. This is the
+"DELTA + FFoR + BP" stack recommended by the FastLanes paper and the ADMS '24
+follow-up, and it is the only stack that keeps the bit-packing width small when the
+deltas can be negative (a single negative delta in two's complement otherwise
+sets every high bit, forcing `W = T`).
+
+Today, decoding such an array makes **three separate passes** over the packed
+buffer (and intermediate buffers):
+
+1. `BitPackedArray::execute` — unpack `W` bits per element into a full-width
+   primitive array.
+2. `FoRArray::execute` — element-wise `wrapping_add(reference)`.
+3. `DeltaArray::execute` → `delta_decompress` → `Delta::undelta` — element-wise
+   cumulative-sum (`wrapping_add(prev)`) within each lane.
+
+Each pass reads and writes 1024 × `size_of::<T>()` bytes per chunk. For the common
+case where `T = i32` and `W = 8`, that is 3 × 4 KiB of bandwidth per chunk to do
+work whose minimum information-theoretic cost is one read of 1 KiB (the packed
+buffer) and one write of 4 KiB (the output).
+
+## Upstream building blocks
+
+`fastlanes` 0.5.0 already ships partial fusions:
+
+| Kernel | Fuses | Type bound |
+|---|---|---|
+| `BitPacking::unpack<W, B>` | unpack only | `Self: FastLanes` (unsigned) |
+| `FoR::unfor_pack<W, B>` | unpack + `wrapping_add(ref)` | `Self: BitPacking` |
+| `Delta::undelta_pack<LANES, W, B>` | unpack + lane-cumsum undelta | `Self: BitPacking` |
+| `Delta::undelta<LANES>` | undelta on already-unpacked values | `Self: BitPacking` |
+
+What is **missing upstream** is a triple-fused kernel: unpack + `wrapping_add(ref)`
++ undelta in a single pass. The two existing fused kernels each pair *one* of the
+two reductions with `unpack`; neither pairs both.
+
+## Proposed kernel
+
+```rust
+/// Triple-fused decode: unpack W-bit values, add a FoR reference, and undo a
+/// per-lane delta in one pass.
+///
+/// `input`  — packed buffer of `B = 1024 * W / T` elements of width `T`
+/// `base`   — `LANES`-element per-lane bases (already in the natural type)
+/// `reference` — FoR reference added to every unpacked element before undelta
+/// `output` — 1024 reconstructed values, in lane-transposed order
+fn undelta_for_pack<const LANES: usize, const W: usize, const B: usize>(
+    input: &[Self; B],
+    base: &[Self; LANES],
+    reference: Self,
+    output: &mut [Self; 1024],
+);
+```
+
+Sketch (compare with upstream `Delta::undelta_pack` and `FoR::unfor_pack`):
+
+```rust
+for lane in 0..Self::LANES {
+    let mut prev = base[lane];
+    unpack!(T, W, input, lane, |idx, packed_elem| {
+        // (1) restore FoR offset, (2) cumulative wrapping-add along the lane
+        let d = packed_elem.wrapping_add(reference);
+        let next = d.wrapping_add(prev);
+        output[idx] = next;
+        prev = next;
+    });
+}
+```
+
+Memory traffic per 1024-chunk: one read of `1024 * W / 8` bytes (packed), one
+read of `LANES * size_of::<T>()` bytes (bases), one scalar `reference`, and one
+write of `1024 * size_of::<T>()` bytes. For `T = i32`, `W = 8`: 1 KiB read + 128 B
+read + 4 KiB write = ~5 KiB total, versus ~13 KiB for the 3-pass path.
+
+### Type bounds
+
+The kernel naturally inherits `Self: BitPacking`, which upstream restricts to
+unsigned types (`u8`/`u16`/`u32`/`u64`). Signed inputs reuse the kernel via
+`FastLanesComparable::Bitpacked` — the same transmute trick used by this branch's
+non-fused signed-support change — so a single set of macro instantiations
+(`u8`/`u16`/`u32`/`u64`) covers all eight integer types.
+
+## Where the kernel lives
+
+Two options, in increasing order of effort:
+
+1. **Vortex-local kernel** in `encodings/fastlanes/src/delta/undelta_for_pack.rs`,
+   built with the same `seq_t!` / `pack!` / `unpack!` macros that upstream exports.
+   Pros: lands in one PR, no upstream churn. Cons: duplicates the lane-iteration
+   skeleton; future upstream fixes (e.g. patches to the bit-shuffling order) have
+   to be mirrored.
+
+2. **Upstream `fastlanes` PR** adding `Delta::undelta_for_pack` next to
+   `Delta::undelta_pack`. Pros: shares the macro skeleton with the existing
+   fused kernels. Cons: depends on a release and a workspace pin bump.
+
+Option 1 is the right starting point. If benchmarks show the win we expect, the
+kernel can be lifted upstream with a thin wrapper kept locally.
+
+## Integration into the decode path
+
+`delta_decompress` currently calls `array.deltas().clone().execute(ctx)?` and
+then `Delta::undelta` lane-by-lane. To use the fused kernel:
+
+1. Inspect the `deltas` child. The fast path applies only when it is exactly
+   `BitPacked` *or* `FoR(BitPacked)`.
+2. For `FoR(BitPacked)`: read the FoR `reference` scalar; read the packed
+   buffer, bit-width, and patches from the `BitPacked` child; dispatch to
+   `undelta_for_pack::<LANES, W, B>` for each 1024-chunk.
+3. For `BitPacked` (no FoR layer): dispatch to upstream `Delta::undelta_pack`
+   (already exists, no new kernel needed).
+4. For anything else (e.g. a generic primitive deltas slot): fall through to
+   the current non-fused path.
+5. Handle patches (the BitPacked layer's exception store) after the fused decode,
+   the same way `for/array/for_decompress.rs::fused_decompress` does it today.
+
+The signed-vs-unsigned dispatch is the same `reinterpret_cast` trick used in
+this branch: rewrap as the unsigned counterpart, call the fused kernel, rewrap
+the output. The bases and the FoR reference participate in the same transmute.
+
+## Benchmark plan (before committing to the kernel)
+
+A microbench in `vortex-bench/` over four sorted signed columns of 10M elements:
+
+| Column shape | Expected `W` | Hypothesis |
+|---|---|---|
+| `i32` monotone increasing from 0 | small | fused wins, no FoR step does much |
+| `i32` monotone increasing from −1e9 | small | fused wins; FoR ref nontrivial |
+| `i32` near-monotone with 5 % decreases | small | fused wins by larger margin |
+| `i32` random in `[−100, +100]` | medium | fused ≈ 3-pass; bandwidth less dominant |
+
+Decode throughput on a single core; compare 3-pass vs proposed fused kernel.
+Worth landing if the fused path is ≥ 1.5× on the first three rows.
+
+## Out of scope
+
+- The encoding side: `delta_compress` already runs in one pass; FoR + bit-pack
+  on the produced deltas is a separate sequential composition that is already
+  fused well enough by the existing FoR and BitPacked encoders.
+- A symmetric `delta_for_pack` (fused encode) — only worth doing once the
+  decode-side wins are confirmed.
diff --git a/encodings/fastlanes/src/delta/array/delta_compress.rs b/encodings/fastlanes/src/delta/array/delta_compress.rs
index d51cef72b49..0a2f6f5fac6 100644
--- a/encodings/fastlanes/src/delta/array/delta_compress.rs
+++ b/encodings/fastlanes/src/delta/array/delta_compress.rs
@@ -9,6 +9,7 @@ use fastlanes::FastLanes;
 use fastlanes::Transpose;
 use vortex_array::ExecutionCtx;
 use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::primitive::PrimitiveArrayExt;
 use vortex_array::dtype::NativePType;
 use vortex_array::match_each_unsigned_integer_ptype;
 use vortex_buffer::Buffer;
@@ -17,27 +18,46 @@ use vortex_error::VortexResult;
 
 use crate::FL_CHUNK_SIZE;
 use crate::bit_transpose::transpose_validity;
+use crate::delta::array::unsigned_counterpart;
 use crate::fill_forward_nulls;
 pub fn delta_compress(
     array: &PrimitiveArray,
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<(PrimitiveArray, PrimitiveArray)> {
     let validity = array.validity()?;
-    let (bases, deltas) = match_each_unsigned_integer_ptype!(array.ptype(), |T| {
+    let original_ptype = array.ptype();
+    let unsigned_ptype = unsigned_counterpart(original_ptype);
+    // Signed integers are processed through their unsigned counterpart: `wrapping_sub`
+    // is bit-identical for signed and unsigned operands, so the encoded bytes are the
+    // same regardless of how the buffer's elements are interpreted.
+    let work = if original_ptype == unsigned_ptype {
+        array.clone()
+    } else {
+        array.reinterpret_cast(unsigned_ptype)
+    };
+
+    let (bases, deltas) = match_each_unsigned_integer_ptype!(work.ptype(), |T| {
         // Fill-forward null values so that transposed deltas at null positions remain
         // small. Without this, bitpacking may skip patches for null positions, and the
         // corrupted delta values propagate through the cumulative sum during decompression.
-        let filled = fill_forward_nulls(array.to_buffer::<T>(), &validity, ctx)?;
+        let filled = fill_forward_nulls(work.to_buffer::<T>(), &validity, ctx)?;
         let (bases, deltas) = compress_primitive::<T, { T::LANES }>(&filled);
         // TODO(robert): This can be avoided if we add TransposedBoolArray that performs index translation when necessary.
         let validity = transpose_validity(&validity, ctx)?;
         (
-            PrimitiveArray::new(bases, array.dtype().nullability().into()),
+            PrimitiveArray::new(bases, work.dtype().nullability().into()),
             PrimitiveArray::new(deltas, validity),
         )
     });
 
-    Ok((bases, deltas))
+    Ok(if original_ptype == unsigned_ptype {
+        (bases, deltas)
+    } else {
+        (
+            bases.reinterpret_cast(original_ptype),
+            deltas.reinterpret_cast(original_ptype),
+        )
+    })
 }
 
 fn compress_primitive<T, const LANES: usize>(array: &[T]) -> (Buffer<T>, Buffer<T>)
@@ -113,11 +133,31 @@ mod tests {
         LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
 
     #[rstest]
-    #[case((0u32..10_000).collect())]
-    #[case((0..10_000).map(|i| (i % (u8::MAX as i32)) as u8).collect())]
-    #[case(PrimitiveArray::from_option_iter(
+    #[case::u32((0u32..10_000).collect())]
+    #[case::u8((0..10_000).map(|i| (i % (u8::MAX as i32)) as u8).collect())]
+    #[case::nullable_u32(PrimitiveArray::from_option_iter(
             (0u32..10_000).map(|i| (i % 2 == 0).then_some(i)),
     ))]
+    // Signed inputs that stay non-negative: encoded deltas are identical to the u32 case
+    // bit-for-bit, but the buffer's dtype carries the signedness through round-trip.
+    #[case::i32_non_negative((0i32..10_000).collect())]
+    // Signed inputs crossing zero: deltas alternate in sign, which under wrapping_sub
+    // populates the high bits of negative deltas. Bit-packing without preprocessing
+    // would explode here, but round-tripping the raw delta buffer is still correct.
+    #[case::i32_crossing_zero((-5_000i32..5_000).collect())]
+    // All-negative signed values.
+    #[case::i32_all_negative((-10_000i32..0).collect())]
+    // i8 across the full type range: tests T::MIN / T::MAX boundaries and the
+    // remainder-padded chunk path (256 < FL_CHUNK_SIZE).
+    #[case::i8_full_range((i8::MIN..=i8::MAX).collect())]
+    // i16 crossing zero.
+    #[case::i16_crossing_zero((-2_000i16..2_000).collect())]
+    // i64 with large negative offset.
+    #[case::i64_large_negative((0i64..5_000).map(|i| i - 1_000_000_000_000).collect())]
+    // Nullable signed array with values around zero.
+    #[case::nullable_i32_crossing(PrimitiveArray::from_option_iter(
+            (-2_000i32..2_000).map(|i| (i % 3 != 0).then_some(i)),
+    ))]
     fn test_compress(#[case] array: PrimitiveArray) -> VortexResult<()> {
         let delta = Delta::try_from_primitive_array(&array, &mut SESSION.create_execution_ctx())?;
         assert_eq!(delta.len(), array.len());
diff --git a/encodings/fastlanes/src/delta/array/delta_decompress.rs b/encodings/fastlanes/src/delta/array/delta_decompress.rs
index d6404dfb905..5b32434e9b6 100644
--- a/encodings/fastlanes/src/delta/array/delta_decompress.rs
+++ b/encodings/fastlanes/src/delta/array/delta_decompress.rs
@@ -10,6 +10,7 @@ use fastlanes::Transpose;
 use itertools::Itertools;
 use vortex_array::ExecutionCtx;
 use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::primitive::PrimitiveArrayExt;
 use vortex_array::dtype::NativePType;
 use vortex_array::match_each_unsigned_integer_ptype;
 use vortex_buffer::Buffer;
@@ -19,6 +20,7 @@ use vortex_error::VortexResult;
 use crate::DeltaArray;
 use crate::bit_transpose::untranspose_validity;
 use crate::delta::array::DeltaArrayExt;
+use crate::delta::array::unsigned_counterpart;
 
 pub fn delta_decompress(
     array: &DeltaArray,
@@ -33,14 +35,33 @@ pub fn delta_decompress(
     let validity = untranspose_validity(&deltas.validity()?, ctx)?;
     let validity = validity.slice(start..end)?;
 
-    Ok(match_each_unsigned_integer_ptype!(deltas.ptype(), |T| {
+    let original_ptype = deltas.ptype();
+    let unsigned_ptype = unsigned_counterpart(original_ptype);
+    // Signed inputs are processed through their unsigned counterpart; `wrapping_add` on the
+    // raw bytes inverts the `wrapping_sub` done at compress time regardless of signedness.
+    let (bases, deltas) = if original_ptype == unsigned_ptype {
+        (bases, deltas)
+    } else {
+        (
+            bases.reinterpret_cast(unsigned_ptype),
+            deltas.reinterpret_cast(unsigned_ptype),
+        )
+    };
+
+    let decoded = match_each_unsigned_integer_ptype!(deltas.ptype(), |T| {
         const LANES: usize = T::LANES;
 
         let buffer = decompress_primitive::<T, LANES>(bases.as_slice(), deltas.as_slice());
         let buffer = buffer.slice(start..end);
 
         PrimitiveArray::new(buffer, validity)
-    }))
+    });
+
+    Ok(if original_ptype == unsigned_ptype {
+        decoded
+    } else {
+        decoded.reinterpret_cast(original_ptype)
+    })
 }
 
 /// Performs the low-level delta decompression on primitive values.
diff --git a/encodings/fastlanes/src/delta/array/mod.rs b/encodings/fastlanes/src/delta/array/mod.rs
index 4e96edc59ca..54313f53c93 100644
--- a/encodings/fastlanes/src/delta/array/mod.rs
+++ b/encodings/fastlanes/src/delta/array/mod.rs
@@ -101,5 +101,22 @@ impl DeltaData {
 }
 
 pub(crate) fn lane_count(ptype: PType) -> usize {
-    match_each_unsigned_integer_ptype!(ptype, |T| { T::LANES })
+    match_each_unsigned_integer_ptype!(unsigned_counterpart(ptype), |T| { T::LANES })
+}
+
+/// Map a signed integer [`PType`] to its same-width unsigned counterpart; other [`PType`]s
+/// pass through unchanged.
+///
+/// The FastLanes kernels (`Delta::delta`, `Transpose::transpose`, ...) are only implemented
+/// for unsigned integer types. Signed inputs are processed by viewing the same bytes through
+/// the unsigned counterpart; `wrapping_sub` / `wrapping_add` are bit-identical for signed and
+/// unsigned operands under two's-complement, so the round-trip is exact.
+pub(crate) fn unsigned_counterpart(ptype: PType) -> PType {
+    match ptype {
+        PType::I8 => PType::U8,
+        PType::I16 => PType::U16,
+        PType::I32 => PType::U32,
+        PType::I64 => PType::U64,
+        other => other,
+    }
 }
diff --git a/encodings/fastlanes/src/delta/compute/cast.rs b/encodings/fastlanes/src/delta/compute/cast.rs
index 324e2fc9c22..e50c27f1a57 100644
--- a/encodings/fastlanes/src/delta/compute/cast.rs
+++ b/encodings/fastlanes/src/delta/compute/cast.rs
@@ -24,6 +24,12 @@ impl CastReduce for Delta {
         if target_ptype.is_signed_int() || source_ptype.bit_width() > target_ptype.bit_width() {
             return Ok(None);
         }
+        // Signed deltas widened by per-element value-preserving cast (e.g. -1i8 -> 4294967295u32)
+        // break the wrapping-add invariant: zero-extending the delta bytes would preserve it,
+        // sign-extending the deltas does not. Fall back to full decompress + re-encode.
+        if source_ptype.is_signed_int() {
+            return Ok(None);
+        }
 
         // Cast both bases and deltas to the target type
         let casted_bases = array.bases().cast(dtype.with_nullability(NonNullable))?;
diff --git a/encodings/fastlanes/src/delta/vtable/mod.rs b/encodings/fastlanes/src/delta/vtable/mod.rs
index 96b6ec27e0c..b5e68791ceb 100644
--- a/encodings/fastlanes/src/delta/vtable/mod.rs
+++ b/encodings/fastlanes/src/delta/vtable/mod.rs
@@ -4,7 +4,6 @@
 use std::hash::Hash;
 use std::hash::Hasher;
 
-use fastlanes::FastLanes;
 use prost::Message;
 use vortex_array::Array;
 use vortex_array::ArrayEq;
@@ -21,7 +20,6 @@ use vortex_array::arrays::PrimitiveArray;
 use vortex_array::buffer::BufferHandle;
 use vortex_array::dtype::DType;
 use vortex_array::dtype::PType;
-use vortex_array::match_each_unsigned_integer_ptype;
 use vortex_array::serde::ArrayChildren;
 use vortex_array::smallvec::smallvec;
 use vortex_array::vtable::VTable;
@@ -156,7 +154,7 @@ impl VTable for Delta {
         );
         let metadata = DeltaMetadata::decode(metadata)?;
         let ptype = PType::try_from(dtype)?;
-        let lanes = match_each_unsigned_integer_ptype!(ptype, |T| { <T as FastLanes>::LANES });
+        let lanes = lane_count(ptype);
 
         // Compute the length of the bases array
         let deltas_len = usize::try_from(metadata.deltas_len)
@@ -227,8 +225,8 @@ fn validate_parts(
     );
 
     vortex_ensure!(
-        bases.dtype().is_unsigned_int(),
-        "DeltaArray: dtype must be an unsigned integer, got {}",
+        bases.dtype().is_int(),
+        "DeltaArray: dtype must be an integer, got {}",
         bases.dtype()
     );
 

From e80a13416e98993cc2aa811cbcfb7d6a2fc1de49 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 13 May 2026 21:24:13 +0000
Subject: [PATCH 2/5] fastlanes: add synthetic delta compression workload test

Measures the encoded byte budget under three bit-packing strategies for four
representative signed `i32` shapes (monotone, sensor-like wobble around zero,
large-negative offset, near-monotone with backtracks):

| Workload                          | range          | Wnaive | Wffor | Wzz | ratio   |
|-----------------------------------|----------------|-------:|------:|----:|--------:|
| monotone i32 (0..N)               | [0, 1]         |      1 |     1 |   2 | 15.97x  |
| sensor i32 in [-100, 100]         | [-196, 199]    |     32 |     9 |   9 |  3.20x  |
| offset i32 base=-1e9              | [0, 1]         |      1 |     1 |   2 | 15.97x  |
| near-monotone i32 (5% backtrack)  | [-2, 1]        |     32 |     2 |   3 | 10.65x  |

The "naive" column is the OR-mask of the raw delta bit-patterns: a single
negative delta sets every high bit and forces `W = T`, which is why the two
workloads with negative deltas (`sensor`, `near-monotone`) blow up to 32 bits.
FFoR brings them to 9 and 2 bits. ZigZag matches FFoR only on the symmetric
`sensor` workload and loses on every asymmetric column.

Asserts that FFoR never exceeds naive, drops below `T` whenever a negative
delta is present, and beats ZigZag on the asymmetric workloads. Run with
`--nocapture` to see the table.

Signed-off-by: Claude <noreply@anthropic.com>
---
 .../src/delta/array/delta_compress.rs         | 101 ++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/encodings/fastlanes/src/delta/array/delta_compress.rs b/encodings/fastlanes/src/delta/array/delta_compress.rs
index 0a2f6f5fac6..5e33777b73b 100644
--- a/encodings/fastlanes/src/delta/array/delta_compress.rs
+++ b/encodings/fastlanes/src/delta/array/delta_compress.rs
@@ -191,4 +191,105 @@ mod tests {
         assert_arrays_eq!(packed_delta_prim, array);
         Ok(())
     }
+
+    /// Measures compression of delta-encoded signed columns under three bit-packing strategies:
+    ///   * `naive`: bit-packing the raw delta bytes (every negative delta sets the high bits,
+    ///     so the OR mask forces `W = T`).
+    ///   * `FFoR`:  subtracting the per-column `min(delta)` before bit-packing
+    ///     (`W = ceil(log2(max - min + 1))`).
+    ///   * `zigzag`: `(n << 1) ^ (n >> 31)` before bit-packing
+    ///     (`W = 1 + ceil(log2(max(|min|, |max|)))`).
+    ///
+    /// Asserts that FFoR beats or ties naive on every workload and beats zigzag on the
+    /// asymmetric workloads. Run with `--nocapture` to see the full table.
+    #[test]
+    fn synthetic_workload_compression() -> VortexResult<()> {
+        let mut ctx = SESSION.create_execution_ctx();
+        const N: usize = 8 * 1024; // 8 full FastLanes chunks per workload
+
+        let monotone: Vec<i32> = (0..N as i32).collect();
+        // Deterministic LCG so the test is reproducible.
+        let mut lcg = 0u32;
+        let mut next = || {
+            lcg = lcg.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
+            (lcg >> 16) as i32
+        };
+        let sensor: Vec<i32> = (0..N).map(|_| (next() % 201) - 100).collect();
+        let offset: Vec<i32> = (0..N as i32).map(|i| -1_000_000_000 + i).collect();
+        let mut lcg2 = 0u32;
+        let mut prev = 0i32;
+        let near_monotone: Vec<i32> = (0..N)
+            .map(|_| {
+                lcg2 = lcg2.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
+                let step = if (lcg2 >> 24) < 13 { -2 } else { 1 }; // ~5% backtrack
+                prev = prev.wrapping_add(step);
+                prev
+            })
+            .collect();
+        let workloads = [
+            ("monotone i32 (0..N)", monotone),
+            ("sensor i32 in [-100, 100]", sensor),
+            ("offset i32 base=-1e9", offset),
+            ("near-monotone i32 (5% backtrack)", near_monotone),
+        ];
+
+        println!();
+        println!(
+            "{:<36} {:>10} {:>14} {:>5} {:>5} {:>5} {:>10} {:>7}",
+            "workload", "raw (B)", "Δ range", "Wnaive", "Wffor", "Wzig", "FFoR (B)", "ratio"
+        );
+        println!("{}", "-".repeat(96));
+
+        for (name, values) in workloads {
+            let raw_bytes = values.len() * size_of::<i32>();
+            let array = PrimitiveArray::from_iter(values);
+            let (bases, deltas) = delta_compress(&array, &mut ctx)?;
+            let deltas_buf: &[i32] = deltas.as_slice();
+            let bases_buf: &[i32] = bases.as_slice();
+
+            let min_d = *deltas_buf.iter().min().unwrap();
+            let max_d = *deltas_buf.iter().max().unwrap();
+
+            // Naive width = OR of raw u32 bit-patterns of every delta. Any negative delta
+            // sets the high bits and forces W = 32.
+            let or: u32 = deltas_buf.iter().fold(0u32, |a, &d| a | (d as u32));
+            let naive_w = if or == 0 { 0 } else { 32 - or.leading_zeros() as usize };
+
+            // FFoR width = ceil(log2(span)) where span = (max - min + 1).
+            let span = (max_d as i64 - min_d as i64) as u64 + 1;
+            let ffor_w = if span <= 1 { 0 } else { 64 - (span - 1).leading_zeros() as usize };
+
+            // ZigZag width = 1 + ceil(log2(max(|min|, |max|))) for any nonzero delta.
+            let zz_mag = (min_d.unsigned_abs()).max(max_d.unsigned_abs());
+            let zz_w = if zz_mag == 0 { 0 } else { 1 + (32 - zz_mag.leading_zeros() as usize) };
+
+            // FFoR encoded byte size: bases (already unpacked) + ref + ceil(packed bits / 8).
+            let bases_bytes = bases_buf.len() * size_of::<i32>();
+            let ref_bytes = size_of::<i32>();
+            let packed_bits = deltas_buf.len() * ffor_w;
+            let ffor_packed_bytes = packed_bits.div_ceil(8);
+            let ffor_total = bases_bytes + ref_bytes + ffor_packed_bytes;
+            let ratio = raw_bytes as f64 / ffor_total as f64;
+
+            println!(
+                "{name:<36} {raw_bytes:>10} {:>14} {naive_w:>5} {ffor_w:>5} {zz_w:>5} {ffor_total:>10} {ratio:>6.2}x",
+                format!("[{min_d}, {max_d}]")
+            );
+
+            // Sanity assertions. naive_w is 32 (or near it) for any delta sequence that
+            // contains a negative value; FFoR/ZigZag width must be strictly smaller for these
+            // workloads.
+            assert!(ffor_w <= naive_w.max(1), "FFoR must never exceed naive for {name}");
+            if min_d < 0 {
+                assert_eq!(naive_w, 32, "any negative delta forces naive W to 32 for {name}");
+                assert!(ffor_w < 32, "FFoR must compress below T for {name}");
+            }
+            // On the asymmetric workloads (offset, near-monotone) FFoR must beat ZigZag.
+            if min_d > 0 || max_d < 0 {
+                assert!(ffor_w < zz_w, "FFoR should beat ZigZag on asymmetric {name}");
+            }
+        }
+
+        Ok(())
+    }
 }

From 4cee837e1deaecd68ec1c95b41805cc5a037b2db Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 13 May 2026 21:38:59 +0000
Subject: [PATCH 3/5] fastlanes: measure bases compressibility in delta
 workload test

Extends the synthetic workload report with two extra columns: bases byte size
and the FFoR bit-width those bases would pack to. For 8K-element i32 inputs
the bases buffer is ~50% of the FFoR total on monotone-like columns, and the
bases sequence inherits the smoothness of the input, so recursively packing
the bases with FoR gives a further ~1.4x on top of FFoR(deltas):

  workload                              FFoR (B)  ratio   bases (B) Wb +bcomp ratio
  monotone i32 (0..N)                       2052  15.97x       1024  13   1448 22.63x
  sensor i32 in [-100, 100]                10244   3.20x       1024   8   9480  3.46x
  offset i32 base=-1e9                      2052  15.97x       1024  13   1448 22.63x
  near-monotone i32 (5% backtrack)          3076  10.65x       1024  13   2472 13.26x

This is already structurally enabled: the bases child is an `ArrayRef`, and
the btrblocks compressor at vortex-btrblocks/src/schemes/integer.rs:917
already routes bases through `compress_child` so the cascading compressor
picks whatever encoding fits (typically FoR + BitPacked).

Signed-off-by: Claude <noreply@anthropic.com>
---
 .../src/delta/array/delta_compress.rs         | 40 ++++++++++++++++---
 1 file changed, 35 insertions(+), 5 deletions(-)

diff --git a/encodings/fastlanes/src/delta/array/delta_compress.rs b/encodings/fastlanes/src/delta/array/delta_compress.rs
index 5e33777b73b..7ed7dd5ae44 100644
--- a/encodings/fastlanes/src/delta/array/delta_compress.rs
+++ b/encodings/fastlanes/src/delta/array/delta_compress.rs
@@ -235,10 +235,21 @@ mod tests {
 
         println!();
         println!(
-            "{:<36} {:>10} {:>14} {:>5} {:>5} {:>5} {:>10} {:>7}",
-            "workload", "raw (B)", "Δ range", "Wnaive", "Wffor", "Wzig", "FFoR (B)", "ratio"
+            "{:<36} {:>10} {:>14} {:>5} {:>5} {:>5} {:>10} {:>7}  {:>10} {:>5} {:>10} {:>7}",
+            "workload",
+            "raw (B)",
+            "Δ range",
+            "Wnaive",
+            "Wffor",
+            "Wzig",
+            "FFoR (B)",
+            "ratio",
+            "bases (B)",
+            "Wb",
+            "+bcomp (B)",
+            "ratio",
         );
-        println!("{}", "-".repeat(96));
+        println!("{}", "-".repeat(140));
 
         for (name, values) in workloads {
             let raw_bytes = values.len() * size_of::<i32>();
@@ -271,9 +282,23 @@ mod tests {
             let ffor_total = bases_bytes + ref_bytes + ffor_packed_bytes;
             let ratio = raw_bytes as f64 / ffor_total as f64;
 
+            // Bases compressibility: what we save if the bases child is recursively
+            // delta-encoded or FoR-encoded. The bases are the "first row of the transposed
+            // chunk" per lane, so they form a sub-sequence that inherits the smoothness of
+            // the input. We approximate with FFoR over the bases alone (no recursive Delta,
+            // which would force padding to 1024 elements per FastLanes chunk and could lose
+            // for short base sequences).
+            let min_b = *bases_buf.iter().min().unwrap();
+            let max_b = *bases_buf.iter().max().unwrap();
+            let bspan = (max_b as i64 - min_b as i64) as u64 + 1;
+            let bases_w = if bspan <= 1 { 0 } else { 64 - (bspan - 1).leading_zeros() as usize };
+            let bases_compressed = (bases_buf.len() * bases_w).div_ceil(8) + ref_bytes;
+            let total_with_bcomp = bases_compressed + ref_bytes + ffor_packed_bytes;
+            let ratio_with_bcomp = raw_bytes as f64 / total_with_bcomp as f64;
+
             println!(
-                "{name:<36} {raw_bytes:>10} {:>14} {naive_w:>5} {ffor_w:>5} {zz_w:>5} {ffor_total:>10} {ratio:>6.2}x",
-                format!("[{min_d}, {max_d}]")
+                "{name:<36} {raw_bytes:>10} {:>14} {naive_w:>5} {ffor_w:>5} {zz_w:>5} {ffor_total:>10} {ratio:>6.2}x  {bases_bytes:>10} {bases_w:>5} {total_with_bcomp:>10} {ratio_with_bcomp:>6.2}x",
+                format!("[{min_d}, {max_d}]"),
             );
 
             // Sanity assertions. naive_w is 32 (or near it) for any delta sequence that
@@ -288,6 +313,11 @@ mod tests {
             if min_d > 0 || max_d < 0 {
                 assert!(ffor_w < zz_w, "FFoR should beat ZigZag on asymmetric {name}");
             }
+            // Sorted inputs => the bases inherit smoothness => the bases bit-width should be
+            // far smaller than `T` for sorted columns.
+            if name.starts_with("monotone") || name.starts_with("offset") {
+                assert!(bases_w < 16, "sorted bases should pack below 16 bits for {name}");
+            }
         }
 
         Ok(())

From 1a5c6393739236fa7334c431a2b559ca6ebd1c6a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 13 May 2026 21:44:53 +0000
Subject: [PATCH 4/5] fastlanes: add SPDX headers to FUSED_DECODE.md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

REUSE compliance — markdown files outside the patterns in REUSE.toml need
inline SPDX comments.

Signed-off-by: Claude <noreply@anthropic.com>
---
 encodings/fastlanes/src/delta/FUSED_DECODE.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/encodings/fastlanes/src/delta/FUSED_DECODE.md b/encodings/fastlanes/src/delta/FUSED_DECODE.md
index 9c461a1ec82..b9bc92015f5 100644
--- a/encodings/fastlanes/src/delta/FUSED_DECODE.md
+++ b/encodings/fastlanes/src/delta/FUSED_DECODE.md
@@ -1,3 +1,6 @@
+<!-- SPDX-License-Identifier: Apache-2.0 -->
+<!-- SPDX-FileCopyrightText: Copyright the Vortex contributors -->
+
 # Fused Delta + FFoR + BitUnpack decoding — design note
 
 Status: design only. No production kernel in this branch.

From 48d5602d58712da28505f07b8c93ca1a8436ac8e Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 13 May 2026 21:58:02 +0000
Subject: [PATCH 5/5] fastlanes: polish signed Delta PR

Pre-merge polish across the three things a reviewer would notice:

* DeltaArray docstring: add a signed `i32` example next to the unsigned one
  so users see signed support is first-class. Verified by doctest.
* Conformance: extend `test_delta_consistency` and `test_delta_binary_numeric`
  with i32 / i64 / i8 cases (crossing zero, all-negative, single-negative).
  These run the array-trait conformance harness, so any operation that's
  silently broken for signed inputs surfaces here.
* cast.rs: expand the comment justifying why signed sources fall back to
  decompress-and-re-encode (the wrapping-add invariant breaks under
  value-preserving widening; the same hazard applies to cross-signedness).
* synthetic_workload_compression table: rename duplicate "ratio" columns
  to `FFoR x` / `+bcomp x` so the report is unambiguous.

256 -> 263 tests, all pass. Clippy clean. Fmt clean.

Signed-off-by: Claude <noreply@anthropic.com>
---
 .../src/delta/array/delta_compress.rs         | 56 ++++++++++++++-----
 encodings/fastlanes/src/delta/array/mod.rs    | 15 +++++
 encodings/fastlanes/src/delta/compute/cast.rs | 10 +++-
 .../fastlanes/src/delta/vtable/operations.rs  |  7 +++
 4 files changed, 71 insertions(+), 17 deletions(-)

diff --git a/encodings/fastlanes/src/delta/array/delta_compress.rs b/encodings/fastlanes/src/delta/array/delta_compress.rs
index 7ed7dd5ae44..daa40e9b6b8 100644
--- a/encodings/fastlanes/src/delta/array/delta_compress.rs
+++ b/encodings/fastlanes/src/delta/array/delta_compress.rs
@@ -235,7 +235,7 @@ mod tests {
 
         println!();
         println!(
-            "{:<36} {:>10} {:>14} {:>5} {:>5} {:>5} {:>10} {:>7}  {:>10} {:>5} {:>10} {:>7}",
+            "{:<36} {:>10} {:>14} {:>5} {:>5} {:>5} {:>10} {:>9}  {:>10} {:>5} {:>10} {:>9}",
             "workload",
             "raw (B)",
             "Δ range",
@@ -243,16 +243,16 @@ mod tests {
             "Wffor",
             "Wzig",
             "FFoR (B)",
-            "ratio",
+            "FFoR x",
             "bases (B)",
             "Wb",
             "+bcomp (B)",
-            "ratio",
+            "+bcomp x",
         );
         println!("{}", "-".repeat(140));
 
         for (name, values) in workloads {
-            let raw_bytes = values.len() * size_of::<i32>();
+            let raw_bytes = size_of_val(values.as_slice());
             let array = PrimitiveArray::from_iter(values);
             let (bases, deltas) = delta_compress(&array, &mut ctx)?;
             let deltas_buf: &[i32] = deltas.as_slice();
@@ -264,18 +264,30 @@ mod tests {
             // Naive width = OR of raw u32 bit-patterns of every delta. Any negative delta
             // sets the high bits and forces W = 32.
             let or: u32 = deltas_buf.iter().fold(0u32, |a, &d| a | (d as u32));
-            let naive_w = if or == 0 { 0 } else { 32 - or.leading_zeros() as usize };
+            let naive_w = if or == 0 {
+                0
+            } else {
+                32 - or.leading_zeros() as usize
+            };
 
             // FFoR width = ceil(log2(span)) where span = (max - min + 1).
             let span = (max_d as i64 - min_d as i64) as u64 + 1;
-            let ffor_w = if span <= 1 { 0 } else { 64 - (span - 1).leading_zeros() as usize };
+            let ffor_w = if span <= 1 {
+                0
+            } else {
+                64 - (span - 1).leading_zeros() as usize
+            };
 
             // ZigZag width = 1 + ceil(log2(max(|min|, |max|))) for any nonzero delta.
             let zz_mag = (min_d.unsigned_abs()).max(max_d.unsigned_abs());
-            let zz_w = if zz_mag == 0 { 0 } else { 1 + (32 - zz_mag.leading_zeros() as usize) };
+            let zz_w = if zz_mag == 0 {
+                0
+            } else {
+                1 + (32 - zz_mag.leading_zeros() as usize)
+            };
 
             // FFoR encoded byte size: bases (already unpacked) + ref + ceil(packed bits / 8).
-            let bases_bytes = bases_buf.len() * size_of::<i32>();
+            let bases_bytes = size_of_val(bases_buf);
             let ref_bytes = size_of::<i32>();
             let packed_bits = deltas_buf.len() * ffor_w;
             let ffor_packed_bytes = packed_bits.div_ceil(8);
@@ -291,32 +303,48 @@ mod tests {
             let min_b = *bases_buf.iter().min().unwrap();
             let max_b = *bases_buf.iter().max().unwrap();
             let bspan = (max_b as i64 - min_b as i64) as u64 + 1;
-            let bases_w = if bspan <= 1 { 0 } else { 64 - (bspan - 1).leading_zeros() as usize };
+            let bases_w = if bspan <= 1 {
+                0
+            } else {
+                64 - (bspan - 1).leading_zeros() as usize
+            };
             let bases_compressed = (bases_buf.len() * bases_w).div_ceil(8) + ref_bytes;
             let total_with_bcomp = bases_compressed + ref_bytes + ffor_packed_bytes;
             let ratio_with_bcomp = raw_bytes as f64 / total_with_bcomp as f64;
 
             println!(
-                "{name:<36} {raw_bytes:>10} {:>14} {naive_w:>5} {ffor_w:>5} {zz_w:>5} {ffor_total:>10} {ratio:>6.2}x  {bases_bytes:>10} {bases_w:>5} {total_with_bcomp:>10} {ratio_with_bcomp:>6.2}x",
+                "{name:<36} {raw_bytes:>10} {:>14} {naive_w:>5} {ffor_w:>5} {zz_w:>5} {ffor_total:>10} {ratio:>8.2}x  {bases_bytes:>10} {bases_w:>5} {total_with_bcomp:>10} {ratio_with_bcomp:>8.2}x",
                 format!("[{min_d}, {max_d}]"),
             );
 
             // Sanity assertions. naive_w is 32 (or near it) for any delta sequence that
             // contains a negative value; FFoR/ZigZag width must be strictly smaller for these
             // workloads.
-            assert!(ffor_w <= naive_w.max(1), "FFoR must never exceed naive for {name}");
+            assert!(
+                ffor_w <= naive_w.max(1),
+                "FFoR must never exceed naive for {name}"
+            );
             if min_d < 0 {
-                assert_eq!(naive_w, 32, "any negative delta forces naive W to 32 for {name}");
+                assert_eq!(
+                    naive_w, 32,
+                    "any negative delta forces naive W to 32 for {name}"
+                );
                 assert!(ffor_w < 32, "FFoR must compress below T for {name}");
             }
             // On the asymmetric workloads (offset, near-monotone) FFoR must beat ZigZag.
             if min_d > 0 || max_d < 0 {
-                assert!(ffor_w < zz_w, "FFoR should beat ZigZag on asymmetric {name}");
+                assert!(
+                    ffor_w < zz_w,
+                    "FFoR should beat ZigZag on asymmetric {name}"
+                );
             }
             // Sorted inputs => the bases inherit smoothness => the bases bit-width should be
             // far smaller than `T` for sorted columns.
             if name.starts_with("monotone") || name.starts_with("offset") {
-                assert!(bases_w < 16, "sorted bases should pack below 16 bits for {name}");
+                assert!(
+                    bases_w < 16,
+                    "sorted bases should pack below 16 bits for {name}"
+                );
             }
         }
 
diff --git a/encodings/fastlanes/src/delta/array/mod.rs b/encodings/fastlanes/src/delta/array/mod.rs
index 54313f53c93..6db40ddbd31 100644
--- a/encodings/fastlanes/src/delta/array/mod.rs
+++ b/encodings/fastlanes/src/delta/array/mod.rs
@@ -43,6 +43,21 @@ pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["bases", "deltas"];
 /// let array = Delta::try_from_primitive_array(&primitive, &mut session.create_execution_ctx()).unwrap();
 /// ```
 ///
+/// Signed inputs are also supported; deltas across negative values are encoded by
+/// `wrapping_sub` and recovered by `wrapping_add` at decompress time:
+///
+/// ```
+/// use vortex_array::arrays::PrimitiveArray;
+/// use vortex_array::VortexSessionExecute;
+/// use vortex_array::session::ArraySession;
+/// use vortex_session::VortexSession;
+/// use vortex_fastlanes::Delta;
+///
+/// let session = VortexSession::empty().with::<ArraySession>();
+/// let primitive = PrimitiveArray::from_iter([-3_i32, -2, -1, 0, 1, 2]);
+/// let array = Delta::try_from_primitive_array(&primitive, &mut session.create_execution_ctx()).unwrap();
+/// ```
+///
 /// # Details
 ///
 /// To facilitate slicing, this array accepts an `offset` and `logical_len`. The offset must be
diff --git a/encodings/fastlanes/src/delta/compute/cast.rs b/encodings/fastlanes/src/delta/compute/cast.rs
index e50c27f1a57..43a247df9f0 100644
--- a/encodings/fastlanes/src/delta/compute/cast.rs
+++ b/encodings/fastlanes/src/delta/compute/cast.rs
@@ -24,9 +24,13 @@ impl CastReduce for Delta {
         if target_ptype.is_signed_int() || source_ptype.bit_width() > target_ptype.bit_width() {
             return Ok(None);
         }
-        // Signed deltas widened by per-element value-preserving cast (e.g. -1i8 -> 4294967295u32)
-        // break the wrapping-add invariant: zero-extending the delta bytes would preserve it,
-        // sign-extending the deltas does not. Fall back to full decompress + re-encode.
+        // Signed sources need a different cast policy than the lossless widening cast
+        // used here. The delta bytes are stored as the result of `wrapping_sub`, so e.g.
+        // a delta of -1i8 has the bit pattern 0xFF. Widening *as a value* (the cast op's
+        // semantics) sign-extends that to 0xFFFFFFFF, which means `wrapping_add(base, delta)`
+        // at the wider type produces a different result than at the source type — round-trip
+        // breaks. Cross-signedness widening has the same hazard for the same reason. Fall
+        // back to decompress-and-re-encode for both cases.
         if source_ptype.is_signed_int() {
             return Ok(None);
         }
diff --git a/encodings/fastlanes/src/delta/vtable/operations.rs b/encodings/fastlanes/src/delta/vtable/operations.rs
index 943f11379ac..785cdf6aca8 100644
--- a/encodings/fastlanes/src/delta/vtable/operations.rs
+++ b/encodings/fastlanes/src/delta/vtable/operations.rs
@@ -248,6 +248,11 @@ mod tests {
     #[case::delta_large_u64((0u64..2048).collect())]
     // Single element
     #[case::delta_single(PrimitiveArray::new(buffer![42u32], Validity::NonNullable))]
+    // Signed inputs (added with signed-delta support).
+    #[case::delta_i32_crossing_zero((-100i32..100).collect())]
+    #[case::delta_i64_negative((0i64..100).map(|i| -i * 10).collect())]
+    #[case::delta_large_i32((-1024i32..1024).collect())]
+    #[case::delta_single_negative(PrimitiveArray::new(buffer![-42i32], Validity::NonNullable))]
     fn test_delta_consistency(#[case] array: PrimitiveArray) {
         test_array_consistency(&da(&array).into_array());
     }
@@ -258,6 +263,8 @@ mod tests {
     #[case::delta_u32_basic(PrimitiveArray::new(buffer![1u32, 1, 1, 1, 1], Validity::NonNullable))]
     #[case::delta_u64_basic(PrimitiveArray::new(buffer![1u64, 1, 1, 1, 1], Validity::NonNullable))]
     #[case::delta_u32_large(PrimitiveArray::new(buffer![1u32; 100], Validity::NonNullable))]
+    #[case::delta_i8_basic(PrimitiveArray::new(buffer![-1i8, -1, -1, -1, -1], Validity::NonNullable))]
+    #[case::delta_i32_basic(PrimitiveArray::new(buffer![-1i32, -1, -1, -1, -1], Validity::NonNullable))]
     fn test_delta_binary_numeric(#[case] array: PrimitiveArray) {
         test_binary_numeric_array(da(&array).into_array());
     }