@@ -162,11 +162,8 @@ __device__ inline void bitunpack(const T *__restrict packed,
162162 uint64_t chunk_start,
163163 uint32_t chunk_len,
164164 const struct SourceOp &src) {
165- constexpr uint32_t T_BITS = sizeof (T) * 8 ;
166- constexpr uint32_t FL_CHUNK = 1024 ;
167- constexpr uint32_t LANES = FL_CHUNK / T_BITS;
168165 const uint32_t bw = src.params .bitunpack .bit_width ;
169- const uint32_t words_per_block = LANES * bw;
166+ const uint32_t words_per_block = FL_LANES<T> * bw;
170167 const uint32_t elem_off = src.params .bitunpack .element_offset ;
171168 const uint32_t dst_off = (chunk_start + elem_off) % FL_CHUNK;
172169 const uint64_t first_block = (chunk_start + elem_off) / FL_CHUNK;
@@ -177,12 +174,86 @@ __device__ inline void bitunpack(const T *__restrict packed,
177174 for (uint32_t c = 0 ; c < n_chunks; ++c) {
178175 const T *src_chunk = packed + (first_block + c) * words_per_block;
179176 T *chunk_dst = dst + c * FL_CHUNK;
180- for (uint32_t lane = threadIdx .x ; lane < LANES ; lane += blockDim .x ) {
177+ for (uint32_t lane = threadIdx .x ; lane < FL_LANES<T> ; lane += blockDim .x ) {
181178 bit_unpack_lane<T>(src_chunk, chunk_dst, 0 , lane, bw);
182179 }
183180 }
184181}
185182
183+ // ═══════════════════════════════════════════════════════════════════════════
184+ // Patches
185+ // ═══════════════════════════════════════════════════════════════════════════
186+
/// Parsed view into a packed patches buffer (the fused-dispatch counterpart
/// of GPUPatches, which is used by the standalone per-bitwidth kernels).
/// Each op with patches gets its own contiguous device allocation holding
/// lane_offsets, indices, and values, referenced by a single uint64_t pointer
/// (patches_ptr in BitunpackParams / AlpParams); see PackedPatchesHeader in
/// patches.h for the layout.
template <typename T>
struct PackedPatchesView {
  // CSR-style offsets into indices/values; slot (chunk * FL_LANES + lane)
  // owns the half-open range [lane_offsets[slot], lane_offsets[slot + 1]).
  const uint32_t *lane_offsets;
  // Entry count of lane_offsets (used only for bounds asserts).
  uint32_t num_lane_offsets;
  // Chunk-relative element positions to overwrite.
  const uint16_t *indices;
  // Replacement values, parallel to indices.
  const T *values;
};
200+
/// Parse a packed patches buffer into its component arrays.
///
/// `patches_ptr` is the device address of a PackedPatchesHeader immediately
/// followed by the lane_offsets array; indices and values live at the byte
/// offsets recorded in the header. num_lane_offsets is derived from the gap
/// between the header and indices_byte_offset — NOTE(review): this assumes
/// the packer places lane_offsets flush against the header with no padding;
/// confirm against the layout in patches.h.
template <typename T>
__device__ inline PackedPatchesView<T> parse_patches(uint64_t patches_ptr) {
  const auto *bytes = reinterpret_cast<const uint8_t *>(patches_ptr);
  const auto *hdr = reinterpret_cast<const PackedPatchesHeader *>(bytes);

  PackedPatchesView<T> view;
  view.lane_offsets =
      reinterpret_cast<const uint32_t *>(bytes + sizeof(PackedPatchesHeader));
  view.num_lane_offsets = static_cast<uint32_t>(
      (hdr->indices_byte_offset - sizeof(PackedPatchesHeader)) / sizeof(uint32_t));
  view.indices = reinterpret_cast<const uint16_t *>(bytes + hdr->indices_byte_offset);
  view.values = reinterpret_cast<const T *>(bytes + hdr->values_byte_offset);
  return view;
}
213+
/// Apply source patches for a single FL chunk.
///
/// Lanes are distributed across the block's threads; each lane reads its
/// half-open exception range from lane_offsets for slot
/// `fl_chunk * FL_LANES<T> + lane` and overwrites the chunk-relative
/// positions in `out` with the patch values. Ends with a block-wide
/// __syncthreads() so callers may read `out` immediately after; must
/// therefore be reached by every thread of the block.
template <typename T>
__device__ inline void patch_fl_chunk(uint64_t patches_ptr, T *__restrict out, uint32_t fl_chunk) {
  const PackedPatchesView<T> view = parse_patches<T>(patches_ptr);
  const uint32_t slot_base = fl_chunk * FL_LANES<T>;

  for (uint32_t lane = threadIdx.x; lane < FL_LANES<T>; lane += blockDim.x) {
    const uint32_t slot = slot_base + lane;
    assert(slot + 1 < view.num_lane_offsets);
    const uint32_t first = view.lane_offsets[slot];
    const uint32_t last = view.lane_offsets[slot + 1];
    for (uint32_t i = first; i < last; ++i) {
      out[view.indices[i]] = view.values[i];
    }
  }
  // Write barrier: all patched positions visible block-wide.
  __syncthreads();
}
232+
/// Apply source patches for all FL chunks in a contiguous region.
///
/// `element_offset` locates the region within the encoded stream: slot
/// lookup starts at chunk `element_offset / FL_CHUNK`, and the chunk count
/// is rounded up so a region of `stage_len` elements beginning
/// `element_offset % FL_CHUNK` into its first chunk is fully covered.
/// Patch indices are chunk-relative and applied at `out + c * FL_CHUNK`.
/// Ends with a block-wide __syncthreads(); must be reached by every thread
/// of the block.
template <typename T>
__device__ inline void
patch_all_fl_chunks(uint64_t patches_ptr, T *__restrict out, uint32_t stage_len, uint32_t element_offset) {
  const PackedPatchesView<T> view = parse_patches<T>(patches_ptr);

  const uint32_t base_chunk = element_offset / FL_CHUNK;
  const uint32_t lead_in = element_offset % FL_CHUNK;
  const uint32_t chunk_count = (stage_len + lead_in + FL_CHUNK - 1) / FL_CHUNK;

  for (uint32_t c = 0; c < chunk_count; ++c) {
    T *chunk_out = out + c * FL_CHUNK;
    const uint32_t slot_base = (base_chunk + c) * FL_LANES<T>;
    for (uint32_t lane = threadIdx.x; lane < FL_LANES<T>; lane += blockDim.x) {
      const uint32_t slot = slot_base + lane;
      assert(slot + 1 < view.num_lane_offsets);
      for (uint32_t i = view.lane_offsets[slot]; i < view.lane_offsets[slot + 1]; ++i) {
        chunk_out[view.indices[i]] = view.values[i];
      }
    }
  }
  // Write barrier: all patched positions visible block-wide.
  __syncthreads();
}
256+
186257// / Read N values from a source op into `out`.
187258// /
188259// / Dispatches on `src.op_code` to handle each encoding:
@@ -313,11 +384,17 @@ __device__ void execute_output_stage(T *__restrict output,
313384 block_start + elem_idx,
314385 chunk_len,
315386 src);
316- constexpr uint32_t FL_CHUNK = 1024 ; // FastLanes chunk size
317387 const uint32_t align = (block_start + elem_idx + src.params .bitunpack .element_offset ) % FL_CHUNK;
318388 smem_src = scratch + align;
319389 // Write barrier: all threads finished bitunpack, safe to read from scratch.
320390 __syncthreads ();
391+
392+ // Overwrite patched positions in the decoded scratch buffer.
393+ if (src.params .bitunpack .patches_ptr != 0 ) {
394+ const uint32_t fl_chunk = static_cast <uint32_t >(
395+ (block_start + elem_idx + src.params .bitunpack .element_offset ) / FL_CHUNK);
396+ patch_fl_chunk<T>(src.params .bitunpack .patches_ptr , scratch, fl_chunk);
397+ }
321398 } else {
322399 chunk_len = block_len;
323400 }
@@ -392,12 +469,22 @@ __device__ void execute_input_stage(const Stage &stage, char *__restrict smem) {
392469 const auto &src = stage.source ;
393470
394471 if (src.op_code == SourceOp::BITUNPACK) {
472+ T *raw_smem = smem_out;
395473 bitunpack<T>(reinterpret_cast <const T *>(stage.input_ptr ), smem_out, 0 , stage.len , src);
396- smem_out += src.params .bitunpack .element_offset % SMEM_TILE_SIZE;
397474 // Write barrier: cooperative bitunpack finished, safe to read
398475 // decoded elements in the scalar-op loop below.
399476 __syncthreads ();
400477
478+ // Overwrite exception positions in the decoded buffer with patch values.
479+ if (src.params .bitunpack .patches_ptr != 0 ) {
480+ patch_all_fl_chunks<T>(src.params .bitunpack .patches_ptr ,
481+ raw_smem,
482+ stage.len ,
483+ src.params .bitunpack .element_offset );
484+ }
485+
486+ smem_out += src.params .bitunpack .element_offset % SMEM_TILE_SIZE;
487+
401488 if (stage.num_scalar_ops > 0 ) {
402489 for (uint32_t i = threadIdx .x ; i < stage.len ; i += blockDim .x ) {
403490 T val = smem_out[i];
0 commit comments