Skip to content

Commit 7b150ad

Browse files
committed
chore[gpu]: don't inline bitunpack lane impls
Fully inlining adds memory pressure such that the dynamic dispatch kernel operate at the 32 registers per thread max. Preventing to inline the bit unpack line impls drops the register count per thread to 24. This is relevant for future changes which will requires more registers such as patches support on the GPU. Splitting out the lane implementations into headers is needed such that the dynamic dispatch kernel ptx can be compiled without the standalone bitunpack kernels. This reduces the amount of assembly for the dynamic dispatch to 48k lines from 128k lines. Besides static compile times, this is relevant for the dynamic dispatch kernel as the ptx to device compilation should be as fast as possible. For full JIT static in the background, longer compiler times are fine. Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 4a5b7d7 commit 7b150ad

12 files changed

Lines changed: 17007 additions & 16951 deletions

.github/workflows/ci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,7 @@ jobs:
277277
git ls-files vortex-cuda vortex-cxx vortex-duckdb vortex-ffi \
278278
| grep -E '\.(cpp|hpp|cu|cuh|h)$' \
279279
| grep -v 'kernels/src/bit_unpack_.*\.cu$' \
280+
| grep -v 'kernels/src/bit_unpack_.*_lanes\.cuh$' \
280281
| xargs clang-format --dry-run --Werror --style=file
281282
282283
rust-lint-no-default:

vortex-cuda/build.rs

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ use std::process::Command;
1414

1515
use fastlanes::FastLanes;
1616

17-
use crate::bit_unpack_gen::generate_cuda_unpack;
17+
use crate::bit_unpack_gen::generate_cuda_unpack_kernels;
18+
use crate::bit_unpack_gen::generate_cuda_unpack_lanes;
1819

1920
#[path = "src/bit_unpack_gen.rs"]
2021
pub mod bit_unpack_gen;
@@ -94,10 +95,19 @@ fn main() {
9495
}
9596

9697
fn generate_unpack<T: FastLanes>(output_dir: &Path, thread_count: usize) -> io::Result<PathBuf> {
97-
let path = output_dir.join(format!("bit_unpack_{}.cu", T::T));
98-
let mut cu_file = File::create(&path)?;
99-
generate_cuda_unpack::<T>(&mut cu_file, thread_count)?;
100-
Ok(path)
98+
// Generate the lanes header (.cuh) — device functions only, no __global__ kernels.
99+
// This is what dynamic_dispatch.cu includes (via bit_unpack.cuh).
100+
let cuh_path = output_dir.join(format!("bit_unpack_{}_lanes.cuh", T::T));
101+
let mut cuh_file = File::create(&cuh_path)?;
102+
generate_cuda_unpack_lanes::<T>(&mut cuh_file)?;
103+
104+
// Generate the standalone kernels (.cu) — includes the lanes header,
105+
// adds _device template + __global__ wrappers. Compiled to its own PTX.
106+
let cu_path = output_dir.join(format!("bit_unpack_{}.cu", T::T));
107+
let mut cu_file = File::create(&cu_path)?;
108+
generate_cuda_unpack_kernels::<T>(&mut cu_file, thread_count)?;
109+
110+
Ok(cu_path)
101111
}
102112

103113
fn nvcc_compile_ptx(

vortex-cuda/kernels/src/bit_unpack.cuh

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77
#include <cuda_runtime.h>
88
#include <stdint.h>
99

10-
#include "bit_unpack_8.cu"
11-
#include "bit_unpack_16.cu"
12-
#include "bit_unpack_32.cu"
13-
#include "bit_unpack_64.cu"
10+
#include "bit_unpack_8_lanes.cuh"
11+
#include "bit_unpack_16_lanes.cuh"
12+
#include "bit_unpack_32_lanes.cuh"
13+
#include "bit_unpack_64_lanes.cuh"
1414
#include "patches.h"
1515

1616
/// Decodes a single lane of packed data.
@@ -26,22 +26,22 @@
2626
/// * `lane` - Lane index within the block (used to determine which packed words to process)
2727
/// * `bit_width` - Number of bits with which each value is encoded
2828
template <typename T>
29-
__device__ inline void bit_unpack_lane(const T *__restrict packed_chunk,
30-
T *__restrict output_buffer,
31-
T reference,
32-
unsigned int lane,
33-
uint32_t bit_width);
29+
__device__ __noinline__ void bit_unpack_lane(const T *__restrict packed_chunk,
30+
T *__restrict output_buffer,
31+
T reference,
32+
unsigned int lane,
33+
uint32_t bit_width);
3434

3535
/// Template specializations for `bitunpack_lane_to_smem` for different integer types.
3636
///
3737
/// Generates template specializations for each supported integer size (8, 16, 32, 64 bits).
3838
#define BIT_UNPACK_LANE(bits) \
3939
template <> \
40-
__device__ inline void bit_unpack_lane<uint##bits##_t>(const uint##bits##_t *in, \
41-
uint##bits##_t *out, \
42-
uint##bits##_t reference, \
43-
unsigned int lane, \
44-
uint32_t bw) { \
40+
__device__ __noinline__ void bit_unpack_lane<uint##bits##_t>(const uint##bits##_t *in, \
41+
uint##bits##_t *out, \
42+
uint##bits##_t reference, \
43+
unsigned int lane, \
44+
uint32_t bw) { \
4545
bit_unpack_##bits##_lane(in, out, reference, lane, bw); \
4646
}
4747

0 commit comments

Comments
 (0)