Skip to content

Commit d036fc1

Browse files
authored
feat(cuda): support f64 in alp dyn dispatch (#7666)
Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 77e9bf0 commit d036fc1

14 files changed

Lines changed: 330 additions & 105 deletions

File tree

vortex-cuda/benches/dict_cuda.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
mod common;
1010

11+
use std::fmt::Debug;
1112
use std::mem::size_of;
1213
use std::sync::Arc;
1314
use std::sync::atomic::Ordering;
@@ -48,7 +49,7 @@ fn make_dict_array_typed<V, C>(len: usize, dict_size: usize) -> DictArray
4849
where
4950
V: NativePType + From<u32>,
5051
C: NativePType + TryFrom<usize>,
51-
<C as TryFrom<usize>>::Error: std::fmt::Debug,
52+
<C as TryFrom<usize>>::Error: Debug,
5253
{
5354
// Dictionary values
5455
let values: Vec<V> = (0..dict_size)
@@ -71,7 +72,7 @@ fn benchmark_dict_typed<V, C>(c: &mut Criterion, config: &DictBenchConfig)
7172
where
7273
V: NativePType + DeviceRepr + From<u32>,
7374
C: NativePType + DeviceRepr + TryFrom<usize>,
74-
<C as TryFrom<usize>>::Error: std::fmt::Debug,
75+
<C as TryFrom<usize>>::Error: Debug,
7576
{
7677
let mut group = c.benchmark_group("dict_cuda");
7778

vortex-cuda/benches/dynamic_dispatch_cuda.rs

Lines changed: 93 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#![expect(clippy::cast_possible_truncation)]
66
#![expect(clippy::expect_used)]
77

8+
use std::marker::PhantomData;
89
use std::mem::size_of;
910
use std::sync::Arc;
1011
use std::time::Duration;
@@ -14,19 +15,22 @@ use criterion::Criterion;
1415
use criterion::Throughput;
1516
use cudarc::driver::CudaSlice;
1617
use cudarc::driver::DevicePtr;
18+
use cudarc::driver::DeviceRepr;
1719
use cudarc::driver::LaunchConfig;
1820
use cudarc::driver::PushKernelArg;
1921
use cudarc::driver::sys::CUevent_flags;
2022
use futures::executor::block_on;
23+
use vortex::array::ArrayRef;
2124
use vortex::array::IntoArray;
2225
use vortex::array::LEGACY_SESSION;
2326
use vortex::array::VortexSessionExecute;
2427
use vortex::array::arrays::DictArray;
2528
use vortex::array::arrays::PrimitiveArray;
29+
use vortex::array::buffer;
2630
use vortex::array::scalar::Scalar;
2731
use vortex::array::validity::Validity::NonNullable;
2832
use vortex::buffer::Buffer;
29-
use vortex::dtype::PType;
33+
use vortex::dtype::NativePType;
3034
use vortex::encodings::alp::ALP;
3135
use vortex::encodings::alp::ALPArrayExt;
3236
use vortex::encodings::alp::ALPArraySlotsExt;
@@ -59,16 +63,16 @@ const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M"), (100_000_000, "100M"
5963
/// This deliberately does not use `CudaDispatchPlan::execute` because the
6064
/// benchmark pre-allocates the output buffer and device plan once, then reuses
6165
/// them across iterations.
62-
fn run_timed(
66+
fn run_timed<T: DeviceRepr + NativePType>(
6367
cuda_ctx: &mut CudaExecutionCtx,
6468
array_len: usize,
6569
output_buf: &CudaDeviceBuffer,
6670
device_plan: &Arc<CudaSlice<u8>>,
6771
shared_mem_bytes: u32,
6872
) -> VortexResult<Duration> {
69-
let cuda_function = cuda_ctx.load_function("dynamic_dispatch", &[PType::U32])?;
73+
let cuda_function = cuda_ctx.load_function("dynamic_dispatch", &[T::PTYPE])?;
7074
let array_len_u64 = array_len as u64;
71-
let output_view = output_buf.as_view::<u32>();
75+
let output_view = output_buf.as_view::<T>();
7276
let (output_ptr, record_output) = output_view.device_ptr(cuda_ctx.stream());
7377
let (plan_ptr, record_plan) = device_plan.device_ptr(cuda_ctx.stream());
7478

@@ -115,17 +119,21 @@ fn run_timed(
115119
}
116120

117121
/// Benchmark runner: builds a dynamic plan and launches the kernel.
118-
struct BenchRunner {
122+
///
123+
/// `T` is the unsigned integer type matching the output element width
124+
/// (e.g. `u32` for f32/i32/u32, `u64` for f64/i64/u64).
125+
struct BenchRunner<T> {
119126
_plan: CudaDispatchPlan,
120127
smem_bytes: u32,
121128
len: usize,
122129
device_plan: Arc<CudaSlice<u8>>,
123130
output_buf: CudaDeviceBuffer,
124-
_plan_buffers: Vec<vortex::array::buffer::BufferHandle>,
131+
_plan_buffers: Vec<buffer::BufferHandle>,
132+
_phantom: PhantomData<T>,
125133
}
126134

127-
impl BenchRunner {
128-
fn new(array: &vortex::array::ArrayRef, len: usize, cuda_ctx: &mut CudaExecutionCtx) -> Self {
135+
impl<T: DeviceRepr + NativePType> BenchRunner<T> {
136+
fn new(array: &ArrayRef, len: usize, cuda_ctx: &mut CudaExecutionCtx) -> Self {
129137
let plan = match DispatchPlan::new(array, CudaDispatchMode::DynDispatchOnly)
130138
.vortex_expect("build_dyn_dispatch_plan")
131139
{
@@ -153,16 +161,17 @@ impl BenchRunner {
153161
device_plan,
154162
output_buf: CudaDeviceBuffer::new(
155163
cuda_ctx
156-
.device_alloc::<u32>(len.next_multiple_of(1024))
164+
.device_alloc::<T>(len.next_multiple_of(1024))
157165
.expect("alloc output"),
158166
),
159167
_plan_buffers: device_buffers,
168+
_phantom: PhantomData,
160169
}
161170
}
162171

163172
fn run(&self, cuda_ctx: &mut CudaExecutionCtx) -> Duration {
164173
cuda_ctx.stream().synchronize().unwrap();
165-
run_timed(
174+
run_timed::<T>(
166175
cuda_ctx,
167176
self.len,
168177
&self.output_buf,
@@ -205,7 +214,7 @@ fn bench_for_bitpacked(c: &mut Criterion) {
205214
let mut cuda_ctx =
206215
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
207216

208-
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
217+
let bench_runner = BenchRunner::<u32>::new(&array, n, &mut cuda_ctx);
209218

210219
b.iter_custom(|iters| {
211220
let mut total_time = Duration::ZERO;
@@ -250,7 +259,7 @@ fn bench_dict_bp_codes(c: &mut Criterion) {
250259
let mut cuda_ctx =
251260
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
252261

253-
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
262+
let bench_runner = BenchRunner::<u32>::new(&array, n, &mut cuda_ctx);
254263

255264
b.iter_custom(|iters| {
256265
let mut total_time = Duration::ZERO;
@@ -294,7 +303,72 @@ fn bench_runend(c: &mut Criterion) {
294303
let mut cuda_ctx =
295304
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
296305

297-
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
306+
let bench_runner = BenchRunner::<u32>::new(&array, n, &mut cuda_ctx);
307+
308+
b.iter_custom(|iters| {
309+
let mut total_time = Duration::ZERO;
310+
for _ in 0..iters {
311+
total_time += bench_runner.run(&mut cuda_ctx);
312+
}
313+
total_time
314+
});
315+
},
316+
);
317+
}
318+
319+
group.finish();
320+
}
321+
322+
// ---------------------------------------------------------------------------
323+
// Benchmark: ALP(FoR(BitPacked)) — f64
324+
// ---------------------------------------------------------------------------
325+
fn bench_alp_for_bitpacked_f64(c: &mut Criterion) {
326+
let mut ctx = LEGACY_SESSION.create_execution_ctx();
327+
let mut group = c.benchmark_group("alp_for_bp_6bw_f64");
328+
329+
let exponents = Exponents { e: 2, f: 0 };
330+
let bit_width: u8 = 6;
331+
332+
for (len, len_str) in BENCH_ARGS {
333+
group.throughput(Throughput::Bytes((len * size_of::<f64>()) as u64));
334+
335+
// Generate f64 values that ALP-encode without patches.
336+
let floats: Vec<f64> = (0..*len)
337+
.map(|i| <f64 as ALPFloat>::decode_single(10 + (i as i64 % 64), exponents))
338+
.collect();
339+
let float_prim = PrimitiveArray::new(Buffer::from(floats), NonNullable);
340+
341+
// Encode: ALP → FoR → BitPacked
342+
let alp =
343+
alp_encode(float_prim.as_view(), Some(exponents), &mut ctx).vortex_expect("alp_encode");
344+
assert!(alp.patches().is_none());
345+
let for_arr = FoRData::encode(
346+
alp.encoded()
347+
.clone()
348+
.execute::<PrimitiveArray>(&mut ctx)
349+
.vortex_expect("to primitive"),
350+
)
351+
.vortex_expect("for encode");
352+
let bp = BitPackedData::encode(for_arr.encoded(), bit_width, &mut ctx)
353+
.vortex_expect("bitpack encode");
354+
355+
let tree = ALP::new(
356+
FoR::try_new(bp.into_array(), for_arr.reference_scalar().clone())
357+
.vortex_expect("for_new")
358+
.into_array(),
359+
exponents,
360+
None,
361+
);
362+
let array = tree.into_array();
363+
364+
group.bench_with_input(
365+
BenchmarkId::new("dynamic_dispatch_f64", len_str),
366+
len,
367+
|b, &n| {
368+
let mut cuda_ctx =
369+
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
370+
371+
let bench_runner = BenchRunner::<u64>::new(&array, n, &mut cuda_ctx);
298372

299373
b.iter_custom(|iters| {
300374
let mut total_time = Duration::ZERO;
@@ -348,7 +422,7 @@ fn bench_dict_bp_codes_bp_for_values(c: &mut Criterion) {
348422
let mut cuda_ctx =
349423
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
350424

351-
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
425+
let bench_runner = BenchRunner::<u32>::new(&array, n, &mut cuda_ctx);
352426

353427
b.iter_custom(|iters| {
354428
let mut total_time = Duration::ZERO;
@@ -413,7 +487,7 @@ fn bench_alp_for_bitpacked(c: &mut Criterion) {
413487
let mut cuda_ctx =
414488
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
415489

416-
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
490+
let bench_runner = BenchRunner::<u32>::new(&array, n, &mut cuda_ctx);
417491

418492
b.iter_custom(|iters| {
419493
let mut total_time = Duration::ZERO;
@@ -460,7 +534,7 @@ fn bench_dict_bp_u8_codes_u32_values(c: &mut Criterion) {
460534
let mut cuda_ctx =
461535
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
462536

463-
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
537+
let bench_runner = BenchRunner::<u32>::new(&array, n, &mut cuda_ctx);
464538

465539
b.iter_custom(|iters| {
466540
let mut total_time = Duration::ZERO;
@@ -503,7 +577,7 @@ fn bench_dict_bp_u16_codes_u32_values(c: &mut Criterion) {
503577
let mut cuda_ctx =
504578
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
505579

506-
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
580+
let bench_runner = BenchRunner::<u32>::new(&array, n, &mut cuda_ctx);
507581

508582
b.iter_custom(|iters| {
509583
let mut total_time = Duration::ZERO;
@@ -546,7 +620,7 @@ fn bench_dict_bp_u32_codes_u32_values(c: &mut Criterion) {
546620
let mut cuda_ctx =
547621
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
548622

549-
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
623+
let bench_runner = BenchRunner::<u32>::new(&array, n, &mut cuda_ctx);
550624

551625
b.iter_custom(|iters| {
552626
let mut total_time = Duration::ZERO;
@@ -568,6 +642,7 @@ fn benchmark_dynamic_dispatch(c: &mut Criterion) {
568642
bench_runend(c);
569643
bench_dict_bp_codes_bp_for_values(c);
570644
bench_alp_for_bitpacked(c);
645+
bench_alp_for_bitpacked_f64(c);
571646
bench_dict_bp_u8_codes_u32_values(c);
572647
bench_dict_bp_u16_codes_u32_values(c);
573648
bench_dict_bp_u32_codes_u32_values(c);

vortex-cuda/benches/filter_cuda.rs

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ use cudarc::driver::CudaSlice;
1818
use cudarc::driver::CudaView;
1919
use cudarc::driver::DevicePtr;
2020
use cudarc::driver::DevicePtrMut;
21+
use cudarc::driver::DeviceRepr;
2122
use cudarc::driver::sys::CUevent_flags;
2223
use futures::executor::block_on;
2324
use vortex::error::VortexExpect;
@@ -64,7 +65,7 @@ fn make_bitmask(len: usize, selectivity: f64) -> (Vec<u8>, usize) {
6465

6566
/// Runs the CUB filter kernel and returns elapsed GPU time.
6667
#[expect(clippy::too_many_arguments)]
67-
async fn run_filter_timed<T: CubFilterable + cudarc::driver::DeviceRepr>(
68+
async fn run_filter_timed<T: CubFilterable + DeviceRepr>(
6869
d_input: CudaView<'_, T>,
6970
d_bitmask: CudaView<'_, u8>,
7071
d_output: &mut CudaSlice<T>,
@@ -132,14 +133,7 @@ async fn run_filter_timed<T: CubFilterable + cudarc::driver::DeviceRepr>(
132133
/// Benchmark filter for a specific type.
133134
fn benchmark_filter_type<T>(c: &mut Criterion, type_name: &str)
134135
where
135-
T: CubFilterable
136-
+ cudarc::driver::DeviceRepr
137-
+ From<u8>
138-
+ Debug
139-
+ Clone
140-
+ Send
141-
+ Sync
142-
+ 'static,
136+
T: CubFilterable + DeviceRepr + From<u8> + Debug + Clone + Send + Sync + 'static,
143137
{
144138
let mut group = c.benchmark_group(format!("filter_cuda_{type_name}"));
145139

vortex-cuda/benches/runend_cuda.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ use criterion::Criterion;
1818
use criterion::Throughput;
1919
use cudarc::driver::DeviceRepr;
2020
use futures::executor::block_on;
21+
use vortex::array::ExecutionCtx;
2122
use vortex::array::IntoArray;
2223
use vortex::array::arrays::PrimitiveArray;
2324
use vortex::array::validity::Validity;
@@ -37,7 +38,7 @@ use crate::common::TimedLaunchStrategy;
3738
fn make_runend_array_typed<T>(
3839
output_len: usize,
3940
avg_run_len: usize,
40-
ctx: &mut vortex::array::ExecutionCtx,
41+
ctx: &mut ExecutionCtx,
4142
) -> RunEndArray
4243
where
4344
T: NativePType + From<u8>,

vortex-cuda/kernels/src/dynamic_dispatch.cu

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,11 +140,30 @@ scalar_op(T *values, const struct ScalarOp &op, char *__restrict smem, uint64_t
140140
break;
141141
}
142142
case ScalarOp::ALP: {
143-
const float f = op.params.alp.f, e = op.params.alp.e;
143+
if constexpr (sizeof(T) == 4) {
144+
// The plan builder stores f32 F10/IF10 table entries as f64
145+
// in AlpParams. The round-trip f32→f64→f32 is exact per the
146+
// C++ standard: [conv.fpprom] guarantees the widening is
147+
// value-preserving, and [conv.double] guarantees the narrowing
148+
// recovers the original value when it is exactly representable
149+
// in the destination type (which it is, having originated as f32).
150+
const float f = static_cast<float>(op.params.alp.f);
151+
const float e = static_cast<float>(op.params.alp.e);
144152
#pragma unroll
145-
for (uint32_t i = 0; i < N; ++i) {
146-
float r = static_cast<float>(static_cast<int32_t>(values[i])) * f * e;
147-
values[i] = static_cast<T>(__float_as_uint(r));
153+
for (uint32_t i = 0; i < N; ++i) {
154+
float r = static_cast<float>(static_cast<int32_t>(values[i])) * f * e;
155+
values[i] = static_cast<T>(__float_as_uint(r));
156+
}
157+
} else if constexpr (sizeof(T) == 8) {
158+
const double f = op.params.alp.f, e = op.params.alp.e;
159+
#pragma unroll
160+
for (uint32_t i = 0; i < N; ++i) {
161+
double r = static_cast<double>(static_cast<int64_t>(values[i])) * f * e;
162+
// __double_as_longlong reinterprets f64 bits as int64, and
163+
// static_cast to T (uint64_t) preserves the bit pattern
164+
// under C++20's two's complement guarantee.
165+
values[i] = static_cast<T>(__double_as_longlong(r));
166+
}
148167
}
149168
// Apply ALP patches: override positions whose float value couldn't
150169
// be reconstructed through the ALP encode/decode cycle.

vortex-cuda/kernels/src/dynamic_dispatch.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
///
2121
/// Each source op and scalar op may produce a different PType than its input.
2222
/// For example, DICT transforms codes (e.g. u8) into values (e.g. f32), and
23-
/// ALP transforms encoded integers (i32) into floats (f32).
23+
/// ALP transforms encoded integers into floats (e.g. i32 → f32, i64 → f64).
2424
///
2525
/// `PTypeTag` is a compact enum that identifies the primitive type at each
2626
/// point in the pipeline. The kernel uses it to dispatch typed memory
@@ -171,7 +171,7 @@ struct SourceOp {
171171
/// Each scalar op declares its `output_ptype` — the PType of the values it
172172
/// produces. Most ops preserve the input type (FOR, ZIGZAG), but some
173173
/// change it:
174-
/// - ALP: encoded int → float (e.g. i32 → f32)
174+
/// - ALP: encoded int → float (e.g. i32 → f32, i64 → f64)
175175
/// - DICT: codes type → values type (e.g. u8 → u32)
176176
///
177177
/// The plan builder uses `output_ptype` to determine the element width
@@ -183,8 +183,8 @@ union ScalarParams {
183183
} frame_of_ref;
184184

185185
struct AlpParams {
186-
float f;
187-
float e;
186+
double f;
187+
double e;
188188
uint64_t patches_ptr; // device pointer to GPUPatches struct (0 = none)
189189
} alp;
190190

0 commit comments

Comments (0)