perf: reduce register pressure in dyn dispatch

0ax1 · 0ax1 · commit 84ec712da77c · 2026-04-16T16:04:58.000Z
We decrease the number of values per tile in the output stage each
GPU thread uses, as well as limit the register count to 32 in the
launch bounds.

Signed-off-by: Alexander Droste &lt;alexander.droste@protonmail.com&gt;
diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.cu b/vortex-cuda/kernels/src/dynamic_dispatch.cu
@@ -279,7 +279,8 @@ __device__ void execute_output_stage(T *__restrict output,
                                      char *__restrict smem,
                                      uint64_t block_start,
                                      uint32_t block_len) {
-    constexpr uint32_t VALUES_PER_TILE = 32 / sizeof(T);
+    // Cap at 4 values per thread per tile to minimise register pressure.
+    constexpr uint32_t VALUES_PER_TILE = (32 / sizeof(T)) < 4 ? (32 / sizeof(T)) : 4;
     const uint32_t tile_size = blockDim.x * VALUES_PER_TILE;
     const auto &src = stage.source;
     const void *raw_input = reinterpret_cast<const void *>(stage.input_ptr);
@@ -472,9 +473,10 @@ dynamic_dispatch(T *__restrict output, uint64_t array_len, const uint8_t *__rest
 // matters is load_element(), which dispatches on the per-op PTypeTag to
 // sign-extend or zero-extend when widening a narrow source to T.
 #define GENERATE_KERNEL(suffix, Type)                                                                        \
-    extern "C" __global__ void dynamic_dispatch_##suffix(Type *__restrict output,                            \
-                                                         uint64_t array_len,                                 \
-                                                         const uint8_t *__restrict packed_plan) {            \
+    extern "C" __global__ void __launch_bounds__(BLOCK_SIZE, 32)                                             \
+        dynamic_dispatch_##suffix(Type *__restrict output,                                                   \
+                                 uint64_t array_len,                                                         \
+                                 const uint8_t *__restrict packed_plan) {                                    \
         dynamic_dispatch<Type>(output, array_len, packed_plan);                                              \
     }