alp_rd: speedup (#7064)

joseph-isaacs · web-flow · commit 59b0083ec8dc · 2026-04-01T17:46:38.000-04:00
## Summary  Closes: #000  ## Testing  --------- Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
diff --git a/encodings/alp/benches/alp_compress.rs b/encodings/alp/benches/alp_compress.rs
@@ -103,20 +103,47 @@ fn decompress_alp<T: ALPFloat + NativePType>(bencher: Bencher, args: (usize, f64
         .bench_values(|(v, mut ctx)| decompress_into_array(v, &mut ctx));
 }
 
-#[divan::bench(types = [f32, f64], args = [10_000, 100_000])]
-fn compress_rd<T: ALPRDFloat>(bencher: Bencher, n: usize) {
-    let primitive = PrimitiveArray::new(buffer![T::from(1.23).unwrap(); n], Validity::NonNullable);
-    let encoder = RDEncoder::new(&[T::from(1.23).unwrap()]);
+const RD_BENCH_ARGS: &[(usize, f64)] = &[
+    // (length, fraction_patch): fraction_patch is the per-element chance of being made an outlier.
+    (10_000, 0.0),
+    (10_000, 0.01),
+    (10_000, 0.1),
+    (100_000, 0.0),
+    (100_000, 0.01),
+    (100_000, 0.1),
+];
+
+/// Build `n` copies of 1.23, each swapped for the outlier 1000.0 with probability `fraction_patch`.
+fn make_rd_array<T: ALPRDFloat + NativePType>(n: usize, fraction_patch: f64) -> PrimitiveArray {
+    let mut rng = StdRng::seed_from_u64(42); // fixed seed: the same input on every run
+    let mut data = buffer![T::from(1.23).unwrap(); n].into_mut();
+    if fraction_patch > 0.0 {
+        let spike = T::from(1000.0).unwrap();
+        for slot in data.as_mut_slice() {
+            if rng.random_bool(fraction_patch) {
+                *slot = spike;
+            }
+        }
+    }
+    PrimitiveArray::new(data.freeze(), Validity::NonNullable)
+}
+
+#[divan::bench(types = [f32, f64], args = RD_BENCH_ARGS)]
+fn compress_rd<T: ALPRDFloat + NativePType>(bencher: Bencher, args: (usize, f64)) {
+    let (n, fraction_patch) = args; // one RD_BENCH_ARGS entry: (length, fraction_patch)
+    let primitive = make_rd_array::<T>(n, fraction_patch);
+    let encoder = RDEncoder::new(primitive.as_slice::<T>()); // built once, outside `bench_refs`
 
     bencher
         .with_inputs(|| (&primitive, &encoder))
         .bench_refs(|(primitive, encoder)| encoder.encode(primitive))
 }
 
-#[divan::bench(types = [f32, f64], args = [10_000, 100_000])]
-fn decompress_rd<T: ALPRDFloat>(bencher: Bencher, n: usize) {
-    let primitive = PrimitiveArray::new(buffer![T::from(1.23).unwrap(); n], Validity::NonNullable);
-    let encoder = RDEncoder::new(&[T::from(1.23).unwrap()]);
+#[divan::bench(types = [f32, f64], args = RD_BENCH_ARGS)]
+fn decompress_rd<T: ALPRDFloat + NativePType>(bencher: Bencher, args: (usize, f64)) {
+    let (n, fraction_patch) = args;
+    let primitive = make_rd_array::<T>(n, fraction_patch);
+    let encoder = RDEncoder::new(primitive.as_slice::<T>());
     let encoded = encoder.encode(&primitive);
 
     bencher
diff --git a/encodings/alp/public-api.lock b/encodings/alp/public-api.lock
@@ -582,7 +582,7 @@ pub fn f64::to_u16(bits: Self::UINT) -> u16
 
 pub fn vortex_alp::alp_encode(parray: &vortex_array::arrays::primitive::array::PrimitiveArray, exponents: core::option::Option<vortex_alp::Exponents>) -> vortex_error::VortexResult<vortex_alp::ALPArray>
 
-pub fn vortex_alp::alp_rd_decode<T: vortex_alp::ALPRDFloat>(left_parts: vortex_buffer::buffer::Buffer<u16>, left_parts_dict: &[u16], right_bit_width: u8, right_parts: vortex_buffer::buffer::Buffer<<T as vortex_alp::ALPRDFloat>::UINT>, left_parts_patches: core::option::Option<vortex_array::patches::Patches>, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_buffer::buffer::Buffer<T>>
+pub fn vortex_alp::alp_rd_decode<T: vortex_alp::ALPRDFloat>(left_parts: vortex_buffer::buffer_mut::BufferMut<u16>, left_parts_dict: &[u16], right_bit_width: u8, right_parts: vortex_buffer::buffer_mut::BufferMut<<T as vortex_alp::ALPRDFloat>::UINT>, left_parts_patches: core::option::Option<vortex_array::patches::Patches>, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_buffer::buffer::Buffer<T>>
 
 pub fn vortex_alp::decompress_into_array(array: vortex_alp::ALPArray, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::arrays::primitive::array::PrimitiveArray>
 
diff --git a/encodings/alp/src/alp_rd/array.rs b/encodings/alp/src/alp_rd/array.rs
@@ -307,10 +307,10 @@ impl VTable for ALPRD {
         let decoded_array = if ptype == PType::F32 {
             PrimitiveArray::new(
                 alp_rd_decode::<f32>(
-                    left_parts.into_buffer::<u16>(),
+                    left_parts.into_buffer_mut::<u16>(),
                     &left_parts_dict,
                     right_bit_width,
-                    right_parts.into_buffer::<u32>(),
+                    right_parts.into_buffer_mut::<u32>(),
                     left_parts_patches,
                     ctx,
                 )?,
@@ -319,10 +319,10 @@ impl VTable for ALPRD {
         } else {
             PrimitiveArray::new(
                 alp_rd_decode::<f64>(
-                    left_parts.into_buffer::<u16>(),
+                    left_parts.into_buffer_mut::<u16>(),
                     &left_parts_dict,
                     right_bit_width,
-                    right_parts.into_buffer::<u64>(),
+                    right_parts.into_buffer_mut::<u64>(),
                     left_parts_patches,
                     ctx,
                 )?,
diff --git a/encodings/alp/src/alp_rd/mod.rs b/encodings/alp/src/alp_rd/mod.rs
@@ -284,44 +284,68 @@ impl RDEncoder {
     }
 }
 
-/// Decode a vector of ALP-RD encoded values back into their original floating point format.
+/// Decode ALP-RD encoded values back into their original floating point format.
 ///
 /// # Panics
 ///
-/// The function panics if the provided `left_parts` and `right_parts` differ in length.
+/// Panics if the part lengths differ, or if a code in `left_parts` indexes past the dictionary table.
 pub fn alp_rd_decode<T: ALPRDFloat>(
-    left_parts: Buffer<u16>,
+    mut left_parts: BufferMut<u16>,
     left_parts_dict: &[u16],
     right_bit_width: u8,
-    right_parts: Buffer<T::UINT>,
+    right_parts: BufferMut<T::UINT>,
     left_parts_patches: Option<Patches>,
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<Buffer<T>> {
     if left_parts.len() != right_parts.len() {
         vortex_panic!("alp_rd_decode: left_parts.len != right_parts.len");
     }
 
-    // Decode the left-parts dictionary
-    let mut values = BufferMut::<u16>::from_iter(
-        left_parts
-            .iter()
-            .map(|code| left_parts_dict[*code as usize]),
-    );
+    let shift = right_bit_width as usize;
 
-    // Apply any patches
     if let Some(patches) = left_parts_patches {
+        // Patched path: some left-part codes map to exception values that live outside
+        // the dictionary. We must dictionary-decode first, then overwrite the exceptions,
+        // before we can combine with right-parts.
+
+        // Dictionary-decode every code in-place (code → actual left bit-pattern).
+        for code in left_parts.iter_mut() {
+            *code = left_parts_dict[*code as usize];
+        }
+
+        // Overwrite exception positions with their true left bit-patterns.
         let indices = patches.indices().clone().execute::<PrimitiveArray>(ctx)?;
         let patch_values = patches.values().clone().execute::<PrimitiveArray>(ctx)?;
-        alp_rd_apply_patches(&mut values, &indices, &patch_values, patches.offset());
-    }
+        alp_rd_apply_patches(&mut left_parts, &indices, &patch_values, patches.offset());
+
+        // Reconstruct floats by shifting each decoded left value into the MSBs
+        // and OR-ing with the corresponding right value.
+        alp_rd_combine_inplace::<T>(
+            right_parts,
+            |right, &left| {
+                *right = (<T as ALPRDFloat>::from_u16(left) << shift) | *right;
+            },
+            left_parts.as_ref(),
+        )
+    } else {
+        // Non-patched fast path: every code maps through the dictionary, so we can
+        // pre-shift the entire dictionary once and reduce the per-element hot loop to
+        // a single table lookup + OR.
+        let mut shifted_dict = [T::UINT::default(); MAX_DICT_SIZE as usize];
+        for (i, &entry) in left_parts_dict.iter().enumerate() {
+            shifted_dict[i] = <T as ALPRDFloat>::from_u16(entry) << shift;
+        }
 
-    // Shift the left-parts and add in the right-parts.
-    Ok(alp_rd_decode_core(
-        left_parts_dict,
-        right_bit_width,
-        right_parts,
-        values,
-    ))
+        // Each element: look up the pre-shifted left value by code, OR with right-parts.
+        alp_rd_combine_inplace::<T>(
+            right_parts,
+            |right, &code| {
+                // Bounds-checked: `left_parts` codes are caller-supplied and not validated here.
+                *right = shifted_dict[code as usize] | *right;
+            },
+            left_parts.as_ref(),
+        )
+    }
 }
 
 /// Apply patches to the decoded left-parts values.
@@ -342,23 +366,18 @@ fn alp_rd_apply_patches(
     })
 }
 
-/// Core decode logic shared between `alp_rd_decode` and `execute_alp_rd_decode`.
-fn alp_rd_decode_core<T: ALPRDFloat>(
-    _left_parts_dict: &[u16],
-    right_bit_width: u8,
-    right_parts: Buffer<T::UINT>,
-    values: BufferMut<u16>,
-) -> Buffer<T> {
-    // Shift the left-parts and add in the right-parts.
-    let mut index = 0;
-    right_parts
-        .map_each_in_place(|right| {
-            let left = values[index];
-            index += 1;
-            let left = <T as ALPRDFloat>::from_u16(left);
-            T::from_bits((left << (right_bit_width as usize)) | right)
-        })
-        .freeze()
+/// Run `combine_fn` over each `(right, left)` pair drawn in order from `right_parts` and
+/// `left_data`, then hand back the `right_parts` storage reinterpreted as a buffer of `T`.
+fn alp_rd_combine_inplace<T: ALPRDFloat>(
+    mut right_parts: BufferMut<T::UINT>,
+    combine_fn: impl Fn(&mut T::UINT, &u16),
+    left_data: &[u16],
+) -> VortexResult<Buffer<T>> {
+    let pairs = right_parts.as_mut_slice().iter_mut().zip(left_data);
+    pairs.for_each(|(right, left)| combine_fn(right, left));
+    // Pairing stops at the shorter input; `alp_rd_decode` checks the lengths match first.
+    // SAFETY: all bit patterns of T::UINT are valid T (u32↔f32 or u64↔f64).
+    Ok(unsafe { right_parts.transmute::<T>() }.freeze())
 }
 /// Find the best "cut point" for a set of floating point values such that we can
 /// cast them all to the relevant value instead.