vortex-data
diff --git a/‎vortex-cuda/kernels/src/alp.cu‎
Lines changed: 68 additions & 29 deletions b/‎vortex-cuda/kernels/src/alp.cu‎
Lines changed: 68 additions & 29 deletions
diff --git a/‎vortex-cuda/src/kernel/encodings/alp.rs‎
Lines changed: 138 additions & 27 deletions b/‎vortex-cuda/src/kernel/encodings/alp.rs‎
Lines changed: 138 additions & 27 deletions
@@ -1,36 +1,75 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-#include "scalar_kernel.cuh"
-
-// ALP (Adaptive Lossless floating-Point) decode operation.
-// Converts integers to floats by multiplying by precomputed exponent factors.
-// Formula: decoded = (float)encoded * f * e
-// Where f = F10[exponents.f] and e = IF10[exponents.e] are passed directly.
-template <typename EncodedT, typename FloatT>
-struct AlpOp {
-    FloatT f; // F10[exponents.f] - power of 10
-    FloatT e; // IF10[exponents.e] - inverse power of 10
-
-    __device__ inline FloatT operator()(EncodedT value) const {
-        return static_cast<FloatT>(value) * f * e;
+#include "patches.cuh"
+
+// ALP (Adaptive Lossless floating-Point) decode: out[i] = (FloatT)in[i] * f * e,
+// with f = F10[exponents.f] and e = IF10[exponents.e] supplied by the caller.
+//
+// Each block processes one 1024-element chunk cooperatively and applies patches
+// into shared memory before writing to global memory, mirroring the strategy
+// used by bit_unpack.
+//
+// Parameters:
+//   in, out    - pointers already advanced to the first element of this block's chunk.
+//   f, e       - decode factors multiplied into every element.
+//   array_len  - element count of the whole array; only the last chunk may be partial.
+//   thread_idx - calling thread's index within the block; must be in [0, ThreadCount).
+//   patches    - patch set handed to PatchesCursor together with blockIdx.x.
+//
+// Launch requirements:
+//   - blockDim.x == ThreadCount, so the strided loops cover all 1024 slots.
+//   - `out` has room for a full chunk even in the tail block, because step 3
+//     stores all 1024 slots unconditionally.
+template <typename EncT, typename FloatT, int ThreadCount>
+__device__ void _alp_device(const EncT *__restrict in, FloatT *__restrict out, FloatT f,
+                            FloatT e, uint64_t array_len, int thread_idx, GPUPatches &patches) {
+    // __syncwarp() is the only barrier below; it orders shared-memory accesses
+    // for the whole block only while the block is a single warp.
+    static_assert(ThreadCount > 0 && ThreadCount <= 32,
+                  "_alp_device synchronizes with __syncwarp(); the block must fit in one warp");
+    // per_thread is an integer quotient; a remainder would leave slots undecoded.
+    static_assert(1024 % ThreadCount == 0,
+                  "ThreadCount must evenly divide the 1024-element chunk");
+
+    __shared__ FloatT shared_out[1024];
+
+    constexpr int per_thread = 1024 / ThreadCount;
+    // Widened before scaling so the product cannot wrap in 32 bits.
+    uint64_t chunk_base = static_cast<uint64_t>(blockIdx.x) * 1024;
+
+    // Step 1: decode the chunk into shared memory. The tail block is bounds-checked;
+    // all interior blocks take the fast path with no per-element branch.
+    if (chunk_base + 1024 <= array_len) {
+        #pragma unroll
+        for (int i = 0; i < per_thread; i++) {
+            // Consecutive threads touch consecutive slots within each iteration.
+            int idx = i * ThreadCount + thread_idx;
+            shared_out[idx] = static_cast<FloatT>(in[idx]) * f * e;
+        }
+    } else {
+        #pragma unroll
+        for (int i = 0; i < per_thread; i++) {
+            int idx = i * ThreadCount + thread_idx;
+            uint64_t global_idx = chunk_base + static_cast<uint64_t>(idx);
+            if (global_idx < array_len) {
+                shared_out[idx] = static_cast<FloatT>(in[idx]) * f * e;
+            } else {
+                // Past the end of the input: store a defined value instead of
+                // reading out of bounds, since step 3 copies every slot.
+                shared_out[idx] = FloatT{};
+            }
+        }
+    }
+    __syncwarp();
+
+    // Step 2: apply patches in parallel across the warp. The loop ends when the
+    // cursor returns index 1024, i.e. one past the last valid slot of the chunk.
+    PatchesCursor<FloatT> cursor(patches, blockIdx.x, thread_idx, ThreadCount);
+    auto patch = cursor.next();
+    while (patch.index != 1024) {
+        shared_out[patch.index] = patch.value;
+        patch = cursor.next();
+    }
+    __syncwarp();
+
+    // Step 3: coalesced write-out. Slop past `array_len` in the tail chunk is
+    // overwritten harmlessly; the caller slices the final buffer to `array_len`.
+    #pragma unroll
+    for (int i = 0; i < per_thread; i++) {
+        int idx = i * ThreadCount + thread_idx;
+        out[idx] = shared_out[idx];
     }
-};
-
-// Macro to generate ALP kernel for each type combination.
-// Input is integer (encoded), output is float (decoded).
-#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType)                                    \
-    extern "C" __global__ void alp_##enc_suffix##_##float_suffix(const EncType *__restrict encoded,          \
-                                                                 FloatType *__restrict decoded,              \
-                                                                 FloatType f,                                \
-                                                                 FloatType e,                                \
-                                                                 uint64_t array_len) {                       \
-        scalar_kernel(encoded, decoded, array_len, AlpOp<EncType, FloatType> {f, e});                        \
+}
+
+// Emits the extern "C" entry point `alp_<enc>_<float>_<N>t` for one
+// (encoded type, float type, thread count) combination. Each block offsets the
+// input/output pointers to its own 1024-element chunk and hands off to
+// _alp_device. The launch must use THREAD_COUNT threads per block.
+#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncT, FloatT, THREAD_COUNT)                    \
+    extern "C" __global__ void alp_##enc_suffix##_##float_suffix##_##THREAD_COUNT##t(                \
+        const EncT *__restrict full_in, FloatT *__restrict full_out, FloatT f, FloatT e,             \
+        uint64_t array_len, GPUPatches patches) {                                                    \
+        int thread_idx = threadIdx.x;                                                                \
+        /* Widen to 64 bits before scaling: blockIdx.x * 1024 wraps at 2^32 elements. */             \
+        const uint64_t chunk_offset = static_cast<uint64_t>(blockIdx.x) * 1024;                      \
+        auto in = full_in + chunk_offset;                                                            \
+        auto out = full_out + chunk_offset;                                                          \
+        _alp_device<EncT, FloatT, THREAD_COUNT>(in, out, f, e, array_len, thread_idx, patches);      \
     }
 
-// f32 variants (ALP for f32 encodes as i32 or i64)
-GENERATE_ALP_KERNEL(i32, f32, int32_t, float)
-GENERATE_ALP_KERNEL(i64, f32, int64_t, float)
+// f32 from i32/i64 encoded: 32 threads x 32 elements; keep in sync with alp_thread_count (alp.rs).
+GENERATE_ALP_KERNEL(i32, f32, int32_t, float, 32)
+GENERATE_ALP_KERNEL(i64, f32, int64_t, float, 32)
 
-// f64 variants (ALP for f64 encodes as i64)
-GENERATE_ALP_KERNEL(i64, f64, int64_t, double)
+// f64 from i64 encoded: 16 threads x 64 elements, matching the lane count bit_unpack
+// uses for 64-bit output widths; keep in sync with alp_thread_count (alp.rs).
+GENERATE_ALP_KERNEL(i64, f64, int64_t, double, 16)
@@ -2,18 +2,18 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 use std::fmt::Debug;
-use std::sync::Arc;
 
 use async_trait::async_trait;
 use cudarc::driver::DeviceRepr;
+use cudarc::driver::LaunchConfig;
 use cudarc::driver::PushKernelArg;
 use tracing::instrument;
 use vortex::array::ArrayRef;
 use vortex::array::Canonical;
 use vortex::array::arrays::PrimitiveArray;
 use vortex::array::arrays::primitive::PrimitiveDataParts;
 use vortex::array::buffer::BufferHandle;
-use vortex::array::match_each_unsigned_integer_ptype;
+use vortex::array::buffer::DeviceBufferExt;
 use vortex::dtype::NativePType;
 use vortex::encodings::alp::ALP;
 use vortex::encodings::alp::ALPArray;
@@ -30,7 +30,8 @@ use crate::CudaDeviceBuffer;
 use crate::executor::CudaArrayExt;
 use crate::executor::CudaExecute;
 use crate::executor::CudaExecutionCtx;
-use crate::kernel::patches::execute_patches;
+use crate::kernel::patches::build_gpu_patches;
+use crate::kernel::patches::types::load_patches;
 
 /// CUDA decoder for ALP (Adaptive Lossless floating-Point) decompression.
 #[derive(Debug)]
@@ -54,6 +55,13 @@ impl CudaExecute for ALPExecutor {
     }
 }
 
+/// Number of CUDA threads launched per block when decoding into values of type `A`.
+///
+/// Follows the `bit_unpack` convention: 8-byte (64-bit) outputs use 16 threads
+/// handling 64 elements each; every other width uses 32 threads.
+const fn alp_thread_count<A>() -> u32 {
+    match size_of::<A>() {
+        8 => 16,
+        _ => 32,
+    }
+}
+
+#[instrument(skip_all)]
 async fn decode_alp<A>(array: ALPArray, ctx: &mut CudaExecutionCtx) -> VortexResult<Canonical>
 where
     A: ALPFloat + NativePType + DeviceRepr + Send + Sync + 'static,
@@ -67,50 +75,69 @@ where
     let f: A = A::F10[exponents.f as usize];
     let e: A = A::IF10[exponents.e as usize];
 
-    // Execute child and copy to device
+    // Execute child and copy to device.
     let canonical = array.encoded().clone().execute_cuda(ctx).await?;
     let primitive = canonical.into_primitive();
     let PrimitiveDataParts {
         buffer, validity, ..
     } = primitive.into_data_parts();
 
     let device_input = ctx.ensure_on_device(buffer).await?;
-
-    // Get CUDA view of input
     let input_view = device_input.cuda_view::<A::ALPInt>()?;
 
-    // Allocate output buffer
-    let output_slice = ctx.device_alloc::<A>(array_len)?;
+    // Allocate output rounded up to a full chunk: the fused kernel writes a
+    // whole 1024-element chunk per block, and we slice off any padding below.
+    let output_slice = ctx.device_alloc::<A>(array_len.next_multiple_of(1024))?;
     let output_buf = CudaDeviceBuffer::new(output_slice);
     let output_view = output_buf.as_view::<A>();
 
-    let array_len_u64 = array_len as u64;
-
-    // Load kernel function
-    let kernel_ptypes = [A::ALPInt::PTYPE, A::PTYPE];
-    let cuda_function = ctx.load_function("alp", &kernel_ptypes)?;
+    // Patch validity does not need to be scattered: the ALP encoder strips null
+    // positions from the exception list, so patches only exist at valid
+    // positions. load_patches additionally rejects patches without
+    // chunk_offsets (required by the fused kernel's PatchesCursor).
+    let device_patches = if let Some(patches) = array.patches() {
+        Some(load_patches(&patches, ctx).await?)
+    } else {
+        None
+    };
+    let patches_arg = build_gpu_patches(device_patches.as_ref())?;
+
+    // Load the kernel: alp_{enc}_{float}_{threads}t
+    let thread_count = alp_thread_count::<A>();
+    let thread_suffix = format!("{thread_count}t");
+    let enc_suffix = A::ALPInt::PTYPE.to_string();
+    let float_suffix = A::PTYPE.to_string();
+    let cuda_function = ctx.load_function_with_suffixes(
+        "alp",
+        &[
+            enc_suffix.as_str(),
+            float_suffix.as_str(),
+            thread_suffix.as_str(),
+        ],
+    )?;
+
+    let num_blocks = u32::try_from(array_len.div_ceil(1024))?;
+    let config = LaunchConfig {
+        grid_dim: (num_blocks, 1, 1),
+        block_dim: (thread_count, 1, 1),
+        shared_mem_bytes: 0,
+    };
 
-    ctx.launch_kernel(&cuda_function, array_len, |args| {
+    let array_len_u64 = array_len as u64;
+    ctx.launch_kernel_config(&cuda_function, config, array_len, |args| {
         args.arg(&input_view)
             .arg(&output_view)
             .arg(&f)
             .arg(&e)
-            .arg(&array_len_u64);
+            .arg(&array_len_u64)
+            .arg(&patches_arg);
     })?;
 
-    // Check if there are any patches to decode here. Patch validity does not
-    // need to be scattered: the ALP encoder strips null positions from the
-    // exception list, so patches only exist at valid positions. execute_patches
-    // additionally guards against nullable patch values at runtime.
-    let output_buf = if let Some(patches) = array.patches() {
-        match_each_unsigned_integer_ptype!(patches.indices_ptype()?, |I| {
-            execute_patches::<A, I>(patches.clone(), output_buf, ctx).await?
-        })
-    } else {
-        output_buf
-    };
+    // Synchronize so the device patches buffers remain alive for the kernel.
+    ctx.synchronize_stream()?;
+    drop(device_patches);
 
-    let output_handle = BufferHandle::new_device(Arc::new(output_buf));
+    let output_handle = BufferHandle::new_device(output_buf.slice_typed::<A>(0..array_len));
     Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle(
         output_handle,
         A::PTYPE,
@@ -257,4 +284,88 @@ mod tests {
         assert_arrays_eq!(cpu_result, gpu_result);
         Ok(())
     }
+
+    /// Decodes a three-chunk (3072-element) f32 ALP array whose exceptions fall
+    /// in every chunk, including both sides of the first chunk boundary, and
+    /// checks the GPU result against CPU canonicalization. More than one block
+    /// is launched, which exercises the fused kernel's per-block patches cursor.
+    #[crate::test]
+    async fn test_cuda_alp_multi_chunk_with_patches() -> VortexResult<()> {
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+            .vortex_expect("failed to create execution context");
+
+        // 3072 values = 3 chunks. The selected positions get a 1/7 offset, a
+        // value ALP can't encode losslessly, so they end up as exceptions.
+        let values: Vec<f32> = (0u32..3072)
+            .map(|i| {
+                if matches!(i, 0 | 100 | 1023 | 1024 | 2000 | 3071) {
+                    1.0_f32 / 7.0 + i as f32
+                } else {
+                    i as f32
+                }
+            })
+            .collect();
+
+        let source = PrimitiveArray::new(Buffer::from(values), Validity::NonNullable);
+        let encoded = alp_encode(
+            source.as_view(),
+            None,
+            &mut LEGACY_SESSION.create_execution_ctx(),
+        )?;
+        assert!(
+            encoded.patches().is_some(),
+            "expected patches from ALP exceptions"
+        );
+
+        // Reference result from the CPU path.
+        let cpu_result = crate::canonicalize_cpu(encoded.clone())?.into_array();
+
+        // Same array decoded on the GPU and copied back to the host.
+        let decoded_on_device = encoded.into_array().execute_cuda(&mut cuda_ctx).await?;
+        let gpu_result = decoded_on_device.into_host().await?.into_array();
+
+        assert_arrays_eq!(cpu_result, gpu_result);
+        Ok(())
+    }
+
+    /// Decodes a 1500-element f64 ALP array. The length is not a multiple of
+    /// 1024, so the last chunk is partial and the kernel's tail-block path has
+    /// to bounds-check its decode loop. One exception (index 1400) sits in
+    /// that tail chunk.
+    #[crate::test]
+    async fn test_cuda_alp_partial_tail_chunk() -> VortexResult<()> {
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+            .vortex_expect("failed to create execution context");
+
+        // Index 1400 holds 1/3; every other slot holds its own index.
+        let values: Vec<f64> = (0u32..1500)
+            .map(|i| if i == 1400 { 1.0_f64 / 3.0 } else { i as f64 })
+            .collect();
+
+        let source = PrimitiveArray::new(Buffer::from(values), Validity::NonNullable);
+        let encoded = alp_encode(
+            source.as_view(),
+            None,
+            &mut LEGACY_SESSION.create_execution_ctx(),
+        )?;
+        assert!(
+            encoded.patches().is_some(),
+            "expected patches from ALP exceptions"
+        );
+
+        // Reference result from the CPU path.
+        let cpu_result = crate::canonicalize_cpu(encoded.clone())?.into_array();
+
+        // Same array decoded on the GPU and copied back to the host.
+        let decoded_on_device = encoded.into_array().execute_cuda(&mut cuda_ctx).await?;
+        let gpu_result = decoded_on_device.into_host().await?.into_array();
+
+        assert_arrays_eq!(cpu_result, gpu_result);
+        Ok(())
+    }
 }