Skip to content

Commit 84ec712

Browse files
committed
perf: reduce register pressure in dyn dispatch
We decrease the number of values per tile in the output stage each GPU thread uses, as well as limit the register count to 32 in the launch bounds. Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 5e5475a commit 84ec712

1 file changed

Lines changed: 6 additions & 4 deletions

File tree

vortex-cuda/kernels/src/dynamic_dispatch.cu

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,8 @@ __device__ void execute_output_stage(T *__restrict output,
279279
char *__restrict smem,
280280
uint64_t block_start,
281281
uint32_t block_len) {
282-
constexpr uint32_t VALUES_PER_TILE = 32 / sizeof(T);
282+
// Cap at 4 values per thread per tile to minimise register pressure.
283+
constexpr uint32_t VALUES_PER_TILE = (32 / sizeof(T)) < 4 ? (32 / sizeof(T)) : 4;
283284
const uint32_t tile_size = blockDim.x * VALUES_PER_TILE;
284285
const auto &src = stage.source;
285286
const void *raw_input = reinterpret_cast<const void *>(stage.input_ptr);
@@ -472,9 +473,10 @@ dynamic_dispatch(T *__restrict output, uint64_t array_len, const uint8_t *__rest
472473
// matters is load_element(), which dispatches on the per-op PTypeTag to
473474
// sign-extend or zero-extend when widening a narrow source to T.
474475
#define GENERATE_KERNEL(suffix, Type) \
475-
extern "C" __global__ void dynamic_dispatch_##suffix(Type *__restrict output, \
476-
uint64_t array_len, \
477-
const uint8_t *__restrict packed_plan) { \
476+
extern "C" __global__ void __launch_bounds__(BLOCK_SIZE, 32) \
477+
dynamic_dispatch_##suffix(Type *__restrict output, \
478+
uint64_t array_len, \
479+
const uint8_t *__restrict packed_plan) { \
478480
dynamic_dispatch<Type>(output, array_len, packed_plan); \
479481
}
480482

0 commit comments

Comments
 (0)