diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.cu b/vortex-cuda/kernels/src/dynamic_dispatch.cu index 29bf26efefb..d3950485435 100644 --- a/vortex-cuda/kernels/src/dynamic_dispatch.cu +++ b/vortex-cuda/kernels/src/dynamic_dispatch.cu @@ -279,7 +279,8 @@ __device__ void execute_output_stage(T *__restrict output, char *__restrict smem, uint64_t block_start, uint32_t block_len) { - constexpr uint32_t VALUES_PER_TILE = 32 / sizeof(T); + // Cap at 4 values per thread per tile to minimise register pressure. + constexpr uint32_t VALUES_PER_TILE = (32 / sizeof(T)) < 4 ? (32 / sizeof(T)) : 4; const uint32_t tile_size = blockDim.x * VALUES_PER_TILE; const auto &src = stage.source; const void *raw_input = reinterpret_cast(stage.input_ptr); @@ -472,9 +473,10 @@ dynamic_dispatch(T *__restrict output, uint64_t array_len, const uint8_t *__rest // matters is load_element(), which dispatches on the per-op PTypeTag to // sign-extend or zero-extend when widening a narrow source to T. #define GENERATE_KERNEL(suffix, Type) \ - extern "C" __global__ void dynamic_dispatch_##suffix(Type *__restrict output, \ - uint64_t array_len, \ - const uint8_t *__restrict packed_plan) { \ + extern "C" __global__ void __launch_bounds__(BLOCK_SIZE, 32) \ + dynamic_dispatch_##suffix(Type *__restrict output, \ + uint64_t array_len, \ + const uint8_t *__restrict packed_plan) { \ dynamic_dispatch(output, array_len, packed_plan); \ }