@@ -162,11 +162,8 @@ __device__ inline void bitunpack(const T *__restrict packed,
162162 uint64_t chunk_start,
163163 uint32_t chunk_len,
164164 const struct SourceOp &src) {
165- constexpr uint32_t T_BITS = sizeof (T) * 8 ;
166- constexpr uint32_t FL_CHUNK = 1024 ;
167- constexpr uint32_t LANES = FL_CHUNK / T_BITS;
168165 const uint32_t bw = src.params .bitunpack .bit_width ;
169- const uint32_t words_per_block = LANES * bw;
166+ const uint32_t words_per_block = FL_LANES<T> * bw;
170167 const uint32_t elem_off = src.params .bitunpack .element_offset ;
171168 const uint32_t dst_off = (chunk_start + elem_off) % FL_CHUNK;
172169 const uint64_t first_block = (chunk_start + elem_off) / FL_CHUNK;
@@ -177,12 +174,77 @@ __device__ inline void bitunpack(const T *__restrict packed,
177174 for (uint32_t c = 0 ; c < n_chunks; ++c) {
178175 const T *src_chunk = packed + (first_block + c) * words_per_block;
179176 T *chunk_dst = dst + c * FL_CHUNK;
180- for (uint32_t lane = threadIdx .x ; lane < LANES ; lane += blockDim .x ) {
177+ for (uint32_t lane = threadIdx .x ; lane < FL_LANES<T> ; lane += blockDim .x ) {
181178 bit_unpack_lane<T>(src_chunk, chunk_dst, 0 , lane, bw);
182179 }
183180 }
184181}
185182
// ═══════════════════════════════════════════════════════════════════════════
// Patches
// ═══════════════════════════════════════════════════════════════════════════

/// Parsed view into a packed patches buffer (the fused-dispatch counterpart
/// of GPUPatches, which is used by the standalone per-bitwidth kernels).
/// Each op with patches gets its own contiguous device allocation holding
/// chunk_offsets, indices, and values, referenced by a single uint64_t
/// pointer (patches_ptr in BitunpackParams); see PackedPatchesHeader in
/// patches.h for the layout.
///
/// All pointers alias into that single allocation; the view owns nothing.
/// Field order matters: parse_patches() aggregate-initializes this struct.
template <typename T>
struct PackedPatchesView {
  const uint32_t *chunk_offsets; // n_chunks+1 entries; entry [c] is the start of
                                 // chunk c's patch run, [c+1] is its end (sentinel)
  uint32_t n_chunks;             // number of chunks described by chunk_offsets
  const uint16_t *indices;       // within-chunk positions (0–1023)
  const T *values;               // exception values, parallel to `indices`
};
200+
/// Parse a packed patches buffer into its component arrays.
///
/// `patches_ptr` is the device address of a PackedPatchesHeader followed by
/// the chunk_offsets / indices / values arrays at the byte offsets the header
/// records (see patches.h). Returns a non-owning view into that allocation.
template <typename T>
__device__ inline PackedPatchesView<T> parse_patches(uint64_t patches_ptr) {
  const auto *bytes = reinterpret_cast<const uint8_t *>(patches_ptr);
  const auto *hdr = reinterpret_cast<const PackedPatchesHeader *>(bytes);

  PackedPatchesView<T> view;
  // chunk_offsets sits immediately after the header; the other two arrays
  // are located via byte offsets stored in the header itself.
  view.chunk_offsets =
      reinterpret_cast<const uint32_t *>(bytes + sizeof(PackedPatchesHeader));
  view.n_chunks = hdr->n_chunks;
  view.indices = reinterpret_cast<const uint16_t *>(bytes + hdr->indices_byte_offset);
  view.values = reinterpret_cast<const T *>(bytes + hdr->values_byte_offset);
  return view;
}
213+
/// Overwrite exception positions in `out` for a single chunk.
/// All threads in the block cooperate (each thread strides the chunk's patch
/// run by blockDim.x). Caller must issue __syncthreads() afterward if other
/// threads read from `out`.
///
/// \param patches_ptr  device address of a packed patches buffer
///                     (PackedPatchesHeader layout; see patches.h)
/// \param out          chunk-local output base; `indices` are relative to it
/// \param chunk        chunk ordinal within the patches buffer
template <typename T>
__device__ __noinline__ void apply_patches(uint64_t patches_ptr, T *__restrict out, uint32_t chunk) {
  const auto patches = parse_patches<T>(patches_ptr);
  // `chunk < n_chunks` rather than `chunk + 1 <= n_chunks`: the latter is
  // defeated by unsigned wraparound (chunk == UINT32_MAX makes chunk + 1 == 0,
  // so the bound check would falsely pass and chunk_offsets would be read
  // out of bounds below).
  assert(chunk < patches.n_chunks);
  const uint32_t start = patches.chunk_offsets[chunk];
  const uint32_t end = patches.chunk_offsets[chunk + 1]; // sentinel entry
  for (uint32_t i = start + threadIdx.x; i < end; i += blockDim.x) {
    out[patches.indices[i]] = patches.values[i];
  }
}
227+
/// Overwrite exception positions in `out` for a range of chunks.
/// All threads in the block cooperate. Caller must issue __syncthreads()
/// afterward if other threads read from `out`.
///
/// NOTE(review): chunk c is addressed as `out + c * FL_CHUNK` (the same
/// convention bitunpack uses for its destination), and first-chunk indices
/// are not adjusted by `element_offset % FL_CHUNK` — presumably callers
/// decompress whole chunks into `out`; confirm against the dispatch loop.
template <typename T>
__device__ __noinline__ void
apply_patches_range(uint64_t patches_ptr, T *__restrict out, uint32_t stage_len, uint32_t element_offset) {
  const auto view = parse_patches<T>(patches_ptr);
  const uint32_t base_chunk = element_offset / FL_CHUNK;
  const uint32_t misalign = element_offset % FL_CHUNK;
  // Ceil-divide so a stage that straddles chunk boundaries covers them all.
  const uint32_t span = (stage_len + misalign + FL_CHUNK - 1) / FL_CHUNK;
  assert(base_chunk + span <= view.n_chunks);

  for (uint32_t rel = 0; rel < span; ++rel) {
    // chunk_offsets carries a trailing sentinel, so offs[1] is always valid.
    const uint32_t *offs = view.chunk_offsets + base_chunk + rel;
    T *dst = out + rel * FL_CHUNK;
    for (uint32_t p = offs[0] + threadIdx.x; p < offs[1]; p += blockDim.x) {
      dst[view.indices[p]] = view.values[p];
    }
  }
}
247+
/// Read N values from a source op into `out`.
///
/// Dispatches on `src.op_code` to handle each encoding:
0 commit comments