Skip to content

Commit 2f07feb

Browse files
authored
chore[gpu]: don't inline bitunpack lane impls (#7441)
Fully inlining the bitunpack kernels adds memory pressure such that the dynamic dispatch kernel previously operated at the 32 registers per thread max. Preventing to inline the bit unpack line impls drops the register count per thread to 24. This is relevant for future changes which will requires more registers such as patches support on the GPU. The performance hit we take for bitunpack kernels here is 10%. It's probably worthwhile to investigate whether there might a tradeoff here to get similar perf with less aggressive inlining in the future. One thing we could also look at is trading in register spills via launch bounds for more occupancy. Splitting out the lane implementations into headers is needed such that the dynamic dispatch kernel ptx can be compiled without the standalone bitunpack kernels which are not relevant for the dynamic dispatch kernel. This reduces the amount of assembly for the dynamic dispatch to 48k lines from 128k lines. Besides nvcc compile times, this is relevant for the dynamic dispatch kernel in terms of ptx to device compilation which should be as fast as possible. --------- Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 4a5b7d7 commit 2f07feb

12 files changed

Lines changed: 17012 additions & 16952 deletions

.github/workflows/ci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,7 @@ jobs:
277277
git ls-files vortex-cuda vortex-cxx vortex-duckdb vortex-ffi \
278278
| grep -E '\.(cpp|hpp|cu|cuh|h)$' \
279279
| grep -v 'kernels/src/bit_unpack_.*\.cu$' \
280+
| grep -v 'kernels/src/bit_unpack_.*_lanes\.cuh$' \
280281
| xargs clang-format --dry-run --Werror --style=file
281282
282283
rust-lint-no-default:

vortex-cuda/build.rs

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ use std::process::Command;
1414

1515
use fastlanes::FastLanes;
1616

17-
use crate::bit_unpack_gen::generate_cuda_unpack;
17+
use crate::bit_unpack_gen::generate_cuda_unpack_kernels;
18+
use crate::bit_unpack_gen::generate_cuda_unpack_lanes;
1819

1920
#[path = "src/bit_unpack_gen.rs"]
2021
pub mod bit_unpack_gen;
@@ -72,7 +73,11 @@ fn main() {
7273

7374
match path.extension().and_then(|e| e.to_str()) {
7475
Some("cuh") | Some("h") => {
75-
println!("cargo:rerun-if-changed={}", path.display())
76+
// Only watch hand-written .cuh/.h files, not generated ones
77+
// (generated files are rebuilt when cuda_kernel_generator changes)
78+
if !is_generated {
79+
println!("cargo:rerun-if-changed={}", path.display());
80+
}
7681
}
7782
Some("cu") => {
7883
// Only watch hand-written .cu files, not generated ones
@@ -94,10 +99,19 @@ fn main() {
9499
}
95100

96101
fn generate_unpack<T: FastLanes>(output_dir: &Path, thread_count: usize) -> io::Result<PathBuf> {
97-
let path = output_dir.join(format!("bit_unpack_{}.cu", T::T));
98-
let mut cu_file = File::create(&path)?;
99-
generate_cuda_unpack::<T>(&mut cu_file, thread_count)?;
100-
Ok(path)
102+
// Generate the lanes header (.cuh) — device functions only, no __global__ kernels.
103+
// This is what dynamic_dispatch.cu includes (via bit_unpack.cuh).
104+
let cuh_path = output_dir.join(format!("bit_unpack_{}_lanes.cuh", T::T));
105+
let mut cuh_file = File::create(&cuh_path)?;
106+
generate_cuda_unpack_lanes::<T>(&mut cuh_file)?;
107+
108+
// Generate the standalone kernels (.cu) — includes the lanes header,
109+
// adds _device template + __global__ wrappers. Compiled to its own PTX.
110+
let cu_path = output_dir.join(format!("bit_unpack_{}.cu", T::T));
111+
let mut cu_file = File::create(&cu_path)?;
112+
generate_cuda_unpack_kernels::<T>(&mut cu_file, thread_count)?;
113+
114+
Ok(cu_path)
101115
}
102116

103117
fn nvcc_compile_ptx(

vortex-cuda/kernels/src/bit_unpack.cuh

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77
#include <cuda_runtime.h>
88
#include <stdint.h>
99

10-
#include "bit_unpack_8.cu"
11-
#include "bit_unpack_16.cu"
12-
#include "bit_unpack_32.cu"
13-
#include "bit_unpack_64.cu"
10+
#include "bit_unpack_8_lanes.cuh"
11+
#include "bit_unpack_16_lanes.cuh"
12+
#include "bit_unpack_32_lanes.cuh"
13+
#include "bit_unpack_64_lanes.cuh"
1414
#include "patches.h"
1515

1616
/// Decodes a single lane of packed data.
@@ -26,22 +26,22 @@
2626
/// * `lane` - Lane index within the block (used to determine which packed words to process)
2727
/// * `bit_width` - Number of bits with which each value is encoded
2828
template <typename T>
29-
__device__ inline void bit_unpack_lane(const T *__restrict packed_chunk,
30-
T *__restrict output_buffer,
31-
T reference,
32-
unsigned int lane,
33-
uint32_t bit_width);
29+
__device__ __noinline__ void bit_unpack_lane(const T *__restrict packed_chunk,
30+
T *__restrict output_buffer,
31+
T reference,
32+
unsigned int lane,
33+
uint32_t bit_width);
3434

3535
/// Template specializations for `bitunpack_lane_to_smem` for different integer types.
3636
///
3737
/// Generates template specializations for each supported integer size (8, 16, 32, 64 bits).
3838
#define BIT_UNPACK_LANE(bits) \
3939
template <> \
40-
__device__ inline void bit_unpack_lane<uint##bits##_t>(const uint##bits##_t *in, \
41-
uint##bits##_t *out, \
42-
uint##bits##_t reference, \
43-
unsigned int lane, \
44-
uint32_t bw) { \
40+
__device__ __noinline__ void bit_unpack_lane<uint##bits##_t>(const uint##bits##_t *in, \
41+
uint##bits##_t *out, \
42+
uint##bits##_t reference, \
43+
unsigned int lane, \
44+
uint32_t bw) { \
4545
bit_unpack_##bits##_lane(in, out, reference, lane, bw); \
4646
}
4747

0 commit comments

Comments
 (0)