From 84ec712da77c4981cc1caec3928e83c936e18e1d Mon Sep 17 00:00:00 2001
From: Alexander Droste <alexander.droste@protonmail.com>
Date: Thu, 16 Apr 2026 13:39:28 +0000
Subject: [PATCH] perf: reduce register pressure in dyn dispatch

We decrease the number of values per tile in the output stage each
GPU thread uses, as well as limit the register count to 32 in the
launch bounds.

Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
---
 vortex-cuda/kernels/src/dynamic_dispatch.cu | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.cu b/vortex-cuda/kernels/src/dynamic_dispatch.cu
index 29bf26efefb..e16e685a1af 100644
--- a/vortex-cuda/kernels/src/dynamic_dispatch.cu
+++ b/vortex-cuda/kernels/src/dynamic_dispatch.cu
@@ -279,7 +279,8 @@ __device__ void execute_output_stage(T *__restrict output,
                                      char *__restrict smem,
                                      uint64_t block_start,
                                      uint32_t block_len) {
-    constexpr uint32_t VALUES_PER_TILE = 32 / sizeof(T);
+    // Cap at 4 values per thread per tile to minimise register pressure.
+    constexpr uint32_t VALUES_PER_TILE = (32 / sizeof(T)) < 4 ? (32 / sizeof(T)) : 4;
     const uint32_t tile_size = blockDim.x * VALUES_PER_TILE;
     const auto &src = stage.source;
     const void *raw_input = reinterpret_cast<const void *>(stage.input_ptr);
@@ -472,9 +473,10 @@ dynamic_dispatch(T *__restrict output, uint64_t array_len, const uint8_t *__rest
 // matters is load_element(), which dispatches on the per-op PTypeTag to
 // sign-extend or zero-extend when widening a narrow source to T.
 #define GENERATE_KERNEL(suffix, Type)                                                                        \
-    extern "C" __global__ void dynamic_dispatch_##suffix(Type *__restrict output,                            \
-                                                         uint64_t array_len,                                 \
-                                                         const uint8_t *__restrict packed_plan) {            \
+    extern "C" __global__ void __launch_bounds__(BLOCK_SIZE, 32)                                             \
+        dynamic_dispatch_##suffix(Type *__restrict output,                                                   \
+                                 uint64_t array_len,                                                         \
+                                 const uint8_t *__restrict packed_plan) {                                    \
         dynamic_dispatch<Type>(output, array_len, packed_plan);                                              \
     }