style(cuda): replace bare 1024 with FL_CHUNK in patches.cuh, dynamic_dispatch.cu, and bit_unpack_gen.rs

0ax1 · 0ax1 · commit 296b541c7d38 · 2026-04-21T14:04:48.000Z
Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
diff --git a/vortex-cuda/kernels/src/bit_unpack_16.cu b/vortex-cuda/kernels/src/bit_unpack_16.cu
@@ -4,7 +4,7 @@
 
 template <int BW>
 __device__ void _bit_unpack_16_device(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx, GPUPatches& patches) {
-    __shared__ uint16_t shared_out[1024];
+    __shared__ uint16_t shared_out[FL_CHUNK];
 
     // Step 1: Unpack into shared memory
     #pragma unroll
@@ -16,7 +16,7 @@ __device__ void _bit_unpack_16_device(const uint16_t *__restrict in, uint16_t *_
     // Step 2: Apply patches to shared memory in parallel
     PatchesCursor<uint16_t> cursor(patches, blockIdx.x, thread_idx, 32);
     auto patch = cursor.next();
-    while (patch.index != 1024) {
+    while (patch.index != FL_CHUNK) {
         shared_out[patch.index] = patch.value;
         patch = cursor.next();
     }
diff --git a/vortex-cuda/kernels/src/bit_unpack_32.cu b/vortex-cuda/kernels/src/bit_unpack_32.cu
@@ -4,7 +4,7 @@
 
 template <int BW>
 __device__ void _bit_unpack_32_device(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx, GPUPatches& patches) {
-    __shared__ uint32_t shared_out[1024];
+    __shared__ uint32_t shared_out[FL_CHUNK];
 
     // Step 1: Unpack into shared memory
     #pragma unroll
@@ -16,7 +16,7 @@ __device__ void _bit_unpack_32_device(const uint32_t *__restrict in, uint32_t *_
     // Step 2: Apply patches to shared memory in parallel
     PatchesCursor<uint32_t> cursor(patches, blockIdx.x, thread_idx, 32);
     auto patch = cursor.next();
-    while (patch.index != 1024) {
+    while (patch.index != FL_CHUNK) {
         shared_out[patch.index] = patch.value;
         patch = cursor.next();
     }
diff --git a/vortex-cuda/kernels/src/bit_unpack_64.cu b/vortex-cuda/kernels/src/bit_unpack_64.cu
@@ -4,7 +4,7 @@
 
 template <int BW>
 __device__ void _bit_unpack_64_device(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx, GPUPatches& patches) {
-    __shared__ uint64_t shared_out[1024];
+    __shared__ uint64_t shared_out[FL_CHUNK];
 
     // Step 1: Unpack into shared memory
     #pragma unroll
@@ -16,7 +16,7 @@ __device__ void _bit_unpack_64_device(const uint64_t *__restrict in, uint64_t *_
     // Step 2: Apply patches to shared memory in parallel
     PatchesCursor<uint64_t> cursor(patches, blockIdx.x, thread_idx, 16);
     auto patch = cursor.next();
-    while (patch.index != 1024) {
+    while (patch.index != FL_CHUNK) {
         shared_out[patch.index] = patch.value;
         patch = cursor.next();
     }
diff --git a/vortex-cuda/kernels/src/bit_unpack_8.cu b/vortex-cuda/kernels/src/bit_unpack_8.cu
@@ -4,7 +4,7 @@
 
 template <int BW>
 __device__ void _bit_unpack_8_device(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, int thread_idx, GPUPatches& patches) {
-    __shared__ uint8_t shared_out[1024];
+    __shared__ uint8_t shared_out[FL_CHUNK];
 
     // Step 1: Unpack into shared memory
     #pragma unroll
@@ -16,7 +16,7 @@ __device__ void _bit_unpack_8_device(const uint8_t *__restrict in, uint8_t *__re
     // Step 2: Apply patches to shared memory in parallel
     PatchesCursor<uint8_t> cursor(patches, blockIdx.x, thread_idx, 32);
     auto patch = cursor.next();
-    while (patch.index != 1024) {
+    while (patch.index != FL_CHUNK) {
         shared_out[patch.index] = patch.value;
         patch = cursor.next();
     }
diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.cu b/vortex-cuda/kernels/src/dynamic_dispatch.cu
@@ -147,16 +147,16 @@ scalar_op(T *values, const struct ScalarOp &op, char *__restrict smem, uint64_t
             // chunk_start is the first original chunk covered by the sliced
             // chunk_offsets array. PatchesCursor indexes from 0 into that
             // array, so we subtract chunk_start from the absolute chunk.
-            const uint32_t chunk_start = patches.offset / 1024;
+            const uint32_t chunk_start = patches.offset / FL_CHUNK;
 #pragma unroll
             for (uint32_t i = 0; i < N; ++i) {
                 uint64_t my_pos = (N > 1) ? abs_pos + i * blockDim.x + threadIdx.x : abs_pos;
                 uint64_t orig = my_pos + patches.offset;
-                uint32_t chunk = static_cast<uint32_t>(orig / 1024) - chunk_start;
-                uint32_t within = static_cast<uint32_t>(orig % 1024);
+                uint32_t chunk = static_cast<uint32_t>(orig / FL_CHUNK) - chunk_start;
+                uint32_t within = static_cast<uint32_t>(orig % FL_CHUNK);
                 PatchesCursor<T> cursor(patches, chunk, 0, 1);
                 auto patch = cursor.next();
-                while (patch.index != 1024) {
+                while (patch.index != FL_CHUNK) {
                     if (patch.index == within) {
                         values[i] = patch.value;
                         break;
@@ -192,7 +192,7 @@ __device__ __forceinline__ void
 scatter_patches_chunk(const GPUPatches &patches, T *__restrict out, uint32_t chunk) {
     PatchesCursor<T> cursor(patches, chunk, threadIdx.x, blockDim.x);
     auto patch = cursor.next();
-    while (patch.index != 1024) {
+    while (patch.index != FL_CHUNK) {
         out[patch.index] = patch.value;
         patch = cursor.next();
     }
diff --git a/vortex-cuda/kernels/src/patches.cuh b/vortex-cuda/kernels/src/patches.cuh
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include "fastlanes_common.cuh"
 #include "patches.h"
 
 /// Load a chunk offset value, dispatching on the runtime type.
@@ -21,8 +22,8 @@ __device__ inline uint32_t load_chunk_offset(const GPUPatches &patches, uint32_t
 }
 
 /// A single patch: a within-chunk index and its replacement value.
-/// A sentinel patch has index == 1024, which can never match a valid
-/// within-chunk position (0–1023).
+/// A sentinel patch has index == FL_CHUNK, which can never match a valid
+/// within-chunk position (0 to FL_CHUNK - 1).
 template <typename T>
 struct Patch {
     uint16_t index;
@@ -38,7 +39,7 @@ struct Patch {
 ///
 ///     PatchesCursor<uint32_t> cursor(patches, blockIdx.x, thread_idx, 32);
 ///     auto patch = cursor.next();
-///     while (patch.index != 1024) {
+///     while (patch.index != FL_CHUNK) {
 ///         shared_out[patch.index] = patch.value;
 ///         patch = cursor.next();
 ///     }
@@ -89,15 +90,15 @@ public:
         // The iterator returns indices relative to the start of the chunk.
         // `chunk_base` is the index of the first element within a chunk, accounting
         // for the slice offset.
-        chunk_base = chunk * 1024 + patches.offset;
-        chunk_base -= min(chunk_base, patches.offset % 1024);
+        chunk_base = chunk * FL_CHUNK + patches.offset;
+        chunk_base -= min(chunk_base, patches.offset % FL_CHUNK);
     }
 
     /// Return the current patch (with within-chunk index) and advance,
-    /// or a sentinel {1024, 0} if exhausted.
+    /// or a sentinel {FL_CHUNK, 0} if exhausted.
     __device__ Patch<T> next() {
         if (remaining == 0) {
-            return {1024, T {}};
+            return {FL_CHUNK, T {}};
         }
         uint16_t within_chunk = static_cast<uint16_t>(*indices - chunk_base);
         Patch<T> patch = {within_chunk, *values};
diff --git a/vortex-cuda/src/bit_unpack_gen.rs b/vortex-cuda/src/bit_unpack_gen.rs
@@ -143,7 +143,7 @@ fn generate_device_kernel_template(
         output,
         r#"template <int BW>
 __device__ void _bit_unpack_{bits}_device(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, uint{bits}_t reference, int thread_idx, GPUPatches& patches) {{
-    __shared__ uint{bits}_t shared_out[1024];
+    __shared__ uint{bits}_t shared_out[FL_CHUNK];
 
     // Step 1: Unpack into shared memory
     #pragma unroll
@@ -155,7 +155,7 @@ __device__ void _bit_unpack_{bits}_device(const uint{bits}_t *__restrict in, uin
     // Step 2: Apply patches to shared memory in parallel
     PatchesCursor<uint{bits}_t> cursor(patches, blockIdx.x, thread_idx, {thread_count});
     auto patch = cursor.next();
-    while (patch.index != 1024) {{
+    while (patch.index != FL_CHUNK) {{
         shared_out[patch.index] = patch.value;
         patch = cursor.next();
     }}