Skip to content

Commit 30042ee

Browse files
authored
data-parallel patched ALP standalone kernel (#7576)
## Summary Follow-up to #7440. This changes ALP execution on CUDA. Previously, we'd execute two kernel passes: one to perform ALP decoding to global memory, and a second to apply patches. Similar to the prior work on bit unpacking, this PR pushes patching into the decoding kernel. We assign a FastLanes 1024-element block to each warp (32 threads), and then perform decoding and patching in a single kernel pass. ## Testing Unit tests were added to check for simple and edge cases (multi-chunk, mix of chunks with/without patches). Signed-off-by: Andrew Duffy <andrew@a10y.dev>
1 parent 9e261cc commit 30042ee

4 files changed

Lines changed: 390 additions & 98 deletions

File tree

vortex-cuda/kernels/src/alp.cu

Lines changed: 84 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,92 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4-
#include "scalar_kernel.cuh"
5-
6-
// ALP (Adaptive Lossless floating-Point) decode operation.
7-
// Converts integers to floats by multiplying by precomputed exponent factors.
8-
// Formula: decoded = (float)encoded * f * e
9-
// Where f = F10[exponents.f] and e = IF10[exponents.e] are passed directly.
10-
template <typename EncodedT, typename FloatT>
11-
struct AlpOp {
12-
FloatT f; // F10[exponents.f] - power of 10
13-
FloatT e; // IF10[exponents.e] - inverse power of 10
14-
15-
__device__ inline FloatT operator()(EncodedT value) const {
16-
return static_cast<FloatT>(value) * f * e;
4+
#include "patches.cuh"
5+
6+
// ALP (Adaptive Lossless floating-Point) decode: out[i] = (FloatT)in[i] * f * e.
//
// Each block processes one 1024-element chunk cooperatively and applies patches
// into shared memory before writing to global memory, mirroring the strategy
// used by bit_unpack. f = F10[exponents.f], e = IF10[exponents.e].
//
// The cast from EncT to FloatT must preserve ALP's lossless contract: f32 is
// only encoded as i32, and f64 is only encoded as i64. The i64 → double cast
// is lossless for all values ALP can produce.
//
// Launch contract (see GENERATE_ALP_KERNEL): blockDim.x == 32 (one warp),
// one block per 1024-element chunk.
//
// Parameters:
//   in         - base of this block's chunk (full_in + blockIdx.x * 1024)
//   out        - base of this block's output chunk; always written in full
//                1024-element units (see Step 3 for why that is in bounds)
//   f, e       - precomputed decode factors; decoded = (FloatT)encoded * f * e
//   array_len  - total element count of the whole array, not just this chunk
//   thread_idx - lane id within the single warp, 0..31
//   patches    - exception values spliced over the decoded chunk in Step 2
template <typename EncT, typename FloatT>
__device__ void alp_device(const EncT *__restrict in,
                           FloatT *__restrict out,
                           FloatT f,
                           FloatT e,
                           uint64_t array_len,
                           int thread_idx,
                           GPUPatches &patches) {
    constexpr int ThreadCount = 32;
    // ThreadCount == 32 (one warp) is baked into this kernel:
    // - __syncwarp() below is only sufficient because all threads live in one warp.
    // - per_thread must evenly divide 1024 so the unrolled loops cover the chunk.
    static_assert(ThreadCount == 32, "alp kernel requires exactly one warp per block");
    static_assert(1024 % ThreadCount == 0, "ThreadCount must evenly divide 1024");
    __shared__ FloatT shared_out[1024];

    constexpr int per_thread = 1024 / ThreadCount;
    // 64-bit on purpose: blockIdx.x * 1024 would wrap in 32-bit arithmetic once
    // the array exceeds 2^32 elements, and array_len itself is 64-bit.
    uint64_t chunk_base = static_cast<uint64_t>(blockIdx.x) * 1024;

    // Step 1: decode the chunk into shared memory. The tail block is bounds-checked;
    // all interior blocks take the fast path with no per-element branch.
    if (chunk_base + 1024 <= array_len) {
#pragma unroll
        for (int i = 0; i < per_thread; i++) {
            // idx = i * 32 + lane: adjacent lanes touch adjacent elements, so each
            // warp-wide load/store of `in`/`shared_out` is contiguous.
            int idx = i * ThreadCount + thread_idx;
            shared_out[idx] = static_cast<FloatT>(in[idx]) * f * e;
        }
    } else {
#pragma unroll
        for (int i = 0; i < per_thread; i++) {
            int idx = i * ThreadCount + thread_idx;
            uint64_t global_idx = chunk_base + static_cast<uint64_t>(idx);
            if (global_idx < array_len) {
                shared_out[idx] = static_cast<FloatT>(in[idx]) * f * e;
            } else {
                // Zero-fill past the end so Step 3 never copies uninitialized
                // shared memory to global memory.
                shared_out[idx] = FloatT {};
            }
        }
    }
    // Make all decoded values visible across the warp before patching.
    __syncwarp();

    // Step 2: apply patches in parallel across the warp.
    // NOTE(review): relies on cursor.next() returning a patch with index == 1024
    // as the end-of-patches sentinel — confirm against PatchesCursor in patches.cuh.
    PatchesCursor<FloatT> cursor(patches, blockIdx.x, thread_idx, static_cast<uint32_t>(ThreadCount));
    auto patch = cursor.next();
    while (patch.index != 1024) {
        shared_out[patch.index] = patch.value;
        patch = cursor.next();
    }
    __syncwarp();

    // Step 3: coalesced write-out of the full 1024-element chunk. The caller
    // allocates `full_out` rounded up to a multiple of 1024, so every block
    // writes entirely within bounds. Positions in `[array_len, rounded_len)`
    // of the tail chunk hold don't-care values; the caller slices them off.
#pragma unroll
    for (int i = 0; i < per_thread; i++) {
        int idx = i * ThreadCount + thread_idx;
        out[idx] = shared_out[idx];
    }
}
75+
76+
// Generates one extern "C" kernel entry point, alp_<enc>_<float>_32t, per
// (encoded, float) type pair. Launch contract: blockDim.x == 32 (one warp per
// block) and one block per 1024-element FastLanes chunk; each block decodes
// and patches its chunk via alp_device.
//
// The chunk offset is computed in 64 bits before the pointer arithmetic:
// `blockIdx.x * 1024` evaluates in 32-bit unsigned arithmetic and would wrap
// for blockIdx.x >= 2^22 (arrays of 2^32+ elements), even though `array_len`
// itself is 64-bit. This matches the 64-bit `chunk_base` used inside
// alp_device.
#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncT, FloatT)                                    \
    extern "C" __global__ void alp_##enc_suffix##_##float_suffix##_32t(const EncT *__restrict full_in, \
                                                                       FloatT *__restrict full_out,    \
                                                                       FloatT f,                       \
                                                                       FloatT e,                       \
                                                                       uint64_t array_len,             \
                                                                       GPUPatches patches) {           \
        int thread_idx = threadIdx.x;                                                                  \
        uint64_t chunk_base = static_cast<uint64_t>(blockIdx.x) * 1024;                                \
        auto in = full_in + chunk_base;                                                                \
        auto out = full_out + chunk_base;                                                              \
        alp_device<EncT, FloatT>(in, out, f, e, array_len, thread_idx, patches);                       \
    }
3488

// The only ALPInt bindings produced by the encoder are (f32, i32) and (f64, i64).
// i64 → double is lossless; i32 → float is lossless for all values ALP emits.
// Each invocation below expands GENERATE_ALP_KERNEL (defined above) into an
// extern "C" __global__ entry point named alp_<enc>_<float>_32t.
GENERATE_ALP_KERNEL(i32, f32, int32_t, float)
GENERATE_ALP_KERNEL(i64, f64, int64_t, double)

0 commit comments

Comments
 (0)