Skip to content

Commit 6ab6b5f

Browse files
authored
feature: Add fastlanes bit unpacking cuda kernels (#6145)
Signed-off-by: Robert Kruszewski <github@robertk.io>
1 parent 68130ce commit 6ab6b5f

17 files changed

Lines changed: 19101 additions & 7 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

REUSE.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,9 @@ path = ["**/.gitignore", ".gitmodules", ".python-version", "**/*.lock", "**/*.lo
3535
precedence = "override"
3636
SPDX-FileCopyrightText = "Copyright the Vortex contributors"
3737
SPDX-License-Identifier = "Apache-2.0"
38+
39+
[[annotations]]
40+
path = ["vortex-cuda/kernels/bit_unpack_*"]
41+
precedence = "override"
42+
SPDX-FileCopyrightText = "Copyright the Vortex contributors"
43+
SPDX-License-Identifier = "Apache-2.0"

encodings/fastlanes/src/bitpacking/array/mod.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,15 @@ use crate::bitpack_compress::bitpack_encode;
2323
use crate::unpack_iter::BitPacked;
2424
use crate::unpack_iter::BitUnpackedChunks;
2525

26+
pub struct BitPackedArrayParts {
27+
pub offset: u16,
28+
pub bit_width: u8,
29+
pub len: usize,
30+
pub packed: BufferHandle,
31+
pub patches: Option<Patches>,
32+
pub validity: Validity,
33+
}
34+
2635
#[derive(Clone, Debug)]
2736
pub struct BitPackedArray {
2837
/// The offset within the first block (created with a slice).
@@ -275,6 +284,17 @@ impl BitPackedArray {
275284
pub fn max_packed_value(&self) -> usize {
276285
(1 << self.bit_width()) - 1
277286
}
287+
288+
pub fn into_parts(self) -> BitPackedArrayParts {
289+
BitPackedArrayParts {
290+
offset: self.offset,
291+
bit_width: self.bit_width,
292+
len: self.len,
293+
packed: self.packed,
294+
patches: self.patches,
295+
validity: self.validity,
296+
}
297+
}
278298
}
279299

280300
#[cfg(test)]

encodings/fastlanes/src/bitpacking/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
mod array;
55
pub use array::BitPackedArray;
6+
pub use array::BitPackedArrayParts;
67
pub use array::bitpack_compress;
78
pub use array::bitpack_decompress;
89
pub use array::unpack_iter;

vortex-cuda/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ _test-harness = []
2323
[dependencies]
2424
async-trait = { workspace = true }
2525
cudarc = { workspace = true }
26+
fastlanes = { workspace = true }
2627
futures = { workspace = true, features = ["executor"] }
2728
kanal = { workspace = true }
2829
tracing = { workspace = true }
@@ -52,6 +53,7 @@ vortex-dtype = { workspace = true, features = ["cudarc"] }
5253
vortex-scalar = { workspace = true }
5354

5455
[build-dependencies]
56+
fastlanes = { workspace = true }
5557

5658
[[bench]]
5759
name = "for_cuda"

vortex-cuda/build.rs

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,18 @@
66
#![allow(clippy::use_debug)]
77

88
use std::env;
9+
use std::fs::File;
10+
use std::io;
911
use std::path::Path;
1012
use std::process::Command;
1113

14+
use fastlanes::FastLanes;
15+
16+
use crate::cuda_kernel_generator::IndentedWriter;
17+
use crate::cuda_kernel_generator::generate_cuda_unpack_for_width;
18+
19+
pub mod cuda_kernel_generator;
20+
1221
fn main() {
1322
let manifest_dir = env::var("CARGO_MANIFEST_DIR").expect("Failed to get manifest dir");
1423
let kernels_dir = Path::new(&manifest_dir).join("kernels");
@@ -22,12 +31,17 @@ fn main() {
2231
kernels_dir.display()
2332
);
2433

34+
println!("cargo:rerun-if-changed={}", kernels_dir.to_str().unwrap());
35+
36+
generate_unpack::<u8>(&kernels_dir, 32).expect("Failed to generate unpack for u8");
37+
generate_unpack::<u16>(&kernels_dir, 32).expect("Failed to generate unpack for u16");
38+
generate_unpack::<u32>(&kernels_dir, 32).expect("Failed to generate unpack for u32");
39+
generate_unpack::<u64>(&kernels_dir, 16).expect("Failed to generate unpack for u64");
40+
2541
if !is_cuda_available() {
2642
return;
2743
}
2844

29-
println!("cargo:rerun-if-changed={}", kernels_dir.to_str().unwrap());
30-
3145
if let Ok(entries) = std::fs::read_dir(&kernels_dir) {
3246
for path in entries.flatten().map(|entry| entry.path()) {
3347
match path.extension().and_then(|e| e.to_str()) {
@@ -47,7 +61,13 @@ fn main() {
4761
}
4862
}
4963

50-
fn nvcc_compile_ptx(kernel_dir: &Path, cu_path: &Path) -> std::io::Result<()> {
64+
fn generate_unpack<T: FastLanes>(output_dir: &Path, thread_count: usize) -> io::Result<()> {
65+
let mut cu_file = File::create(output_dir.join(format!("bit_unpack_{}.cu", T::T)))?;
66+
let mut cu_writer = IndentedWriter::new(&mut cu_file);
67+
generate_cuda_unpack_for_width::<T, _>(&mut cu_writer, thread_count)
68+
}
69+
70+
fn nvcc_compile_ptx(kernel_dir: &Path, cu_path: &Path) -> io::Result<()> {
5171
// https://doc.rust-lang.org/cargo/reference/environment-variables.html#environment-variables-cargo-sets-for-build-scripts
5272
let profile = env::var("PROFILE").unwrap();
5373

@@ -114,7 +134,7 @@ fn nvcc_compile_ptx(kernel_dir: &Path, cu_path: &Path) -> std::io::Result<()> {
114134
}
115135
}
116136

117-
return Err(std::io::Error::other(format!(
137+
return Err(io::Error::other(format!(
118138
"nvcc compilation failed for {}",
119139
cu_path.display()
120140
)));
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
use std::fmt;
5+
use std::io;
6+
use std::io::Write;
7+
8+
pub struct IndentedWriter<W: Write> {
9+
write: W,
10+
indent: String,
11+
}
12+
13+
impl<W: Write> IndentedWriter<W> {
14+
pub fn new(write: W) -> Self {
15+
Self {
16+
write,
17+
indent: String::new(),
18+
}
19+
}
20+
21+
/// # Errors
22+
///
23+
/// Will return Err if writing to the underlying writer fails.
24+
pub fn indent<F>(&mut self, indented: F) -> io::Result<()>
25+
where
26+
F: FnOnce(&mut IndentedWriter<W>) -> io::Result<()>,
27+
{
28+
let original_ident = self.indent.clone();
29+
self.indent += " ";
30+
let res = indented(self);
31+
self.indent = original_ident;
32+
res
33+
}
34+
35+
/// # Errors
36+
///
37+
/// Will return Err if writing to the underlying writer fails.
38+
pub fn write_fmt(&mut self, fmt: fmt::Arguments<'_>) -> io::Result<()> {
39+
write!(self.write, "{}{}", self.indent, fmt)
40+
}
41+
}
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
mod indent;
5+
6+
use std::io;
7+
use std::io::Write;
8+
9+
use fastlanes::FastLanes;
10+
pub use indent::IndentedWriter;
11+
12+
fn generate_lane_decoder<T: FastLanes, W: Write>(
13+
output: &mut IndentedWriter<W>,
14+
bit_width: usize,
15+
) -> io::Result<()> {
16+
let bits = <T>::T;
17+
let lanes = T::LANES;
18+
19+
let func_name = format!("bit_unpack_{bits}_{bit_width}bw_lane");
20+
21+
writeln!(
22+
output,
23+
"__device__ void _{func_name}(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, unsigned int lane) {{"
24+
)?;
25+
26+
output.indent(|output| {
27+
writeln!(output, "unsigned int LANE_COUNT = {lanes};")?;
28+
if bit_width == 0 {
29+
writeln!(output, "uint{bits}_t zero = 0ULL;")?;
30+
writeln!(output)?;
31+
for row in 0..bits {
32+
writeln!(output, "out[INDEX({row}, lane)] = zero;")?;
33+
}
34+
} else if bit_width == bits {
35+
writeln!(output)?;
36+
for row in 0..bits {
37+
writeln!(
38+
output,
39+
"out[INDEX({row}, lane)] = in[LANE_COUNT * {row} + lane];",
40+
)?;
41+
}
42+
} else {
43+
writeln!(output, "uint{bits}_t src;")?;
44+
writeln!(output, "uint{bits}_t tmp;")?;
45+
46+
writeln!(output)?;
47+
writeln!(output, "src = in[lane];")?;
48+
for row in 0..bits {
49+
let curr_word = (row * bit_width) / bits;
50+
let next_word = ((row + 1) * bit_width) / bits;
51+
let shift = (row * bit_width) % bits;
52+
53+
if next_word > curr_word {
54+
let remaining_bits = ((row + 1) * bit_width) % bits;
55+
let current_bits = bit_width - remaining_bits;
56+
writeln!(
57+
output,
58+
"tmp = (src >> {shift}) & MASK(uint{bits}_t, {current_bits});"
59+
)?;
60+
61+
if next_word < bit_width {
62+
writeln!(output, "src = in[lane + LANE_COUNT * {next_word}];")?;
63+
writeln!(
64+
output,
65+
"tmp |= (src & MASK(uint{bits}_t, {remaining_bits})) << {current_bits};"
66+
)?;
67+
}
68+
} else {
69+
writeln!(
70+
output,
71+
"tmp = (src >> {shift}) & MASK(uint{bits}_t, {bit_width});"
72+
)?;
73+
}
74+
75+
writeln!(output, "out[INDEX({row}, lane)] = tmp;")?;
76+
}
77+
}
78+
Ok(())
79+
})?;
80+
81+
writeln!(output, "}}")
82+
}
83+
84+
fn generate_device_kernel_for_width<T: FastLanes, W: Write>(
85+
output: &mut IndentedWriter<W>,
86+
bit_width: usize,
87+
thread_count: usize,
88+
) -> io::Result<()> {
89+
let bits = <T>::T;
90+
let lanes = T::LANES;
91+
let per_thread_loop_count = lanes / thread_count;
92+
93+
let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t");
94+
95+
let local_func_params = format!(
96+
"(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, int thread_idx)"
97+
);
98+
99+
writeln!(output, "__device__ void _{func_name}{local_func_params} {{")?;
100+
101+
output.indent(|output| {
102+
for thread_lane in 0..per_thread_loop_count {
103+
writeln!(output, "_bit_unpack_{bits}_{bit_width}bw_lane(in, out, thread_idx * {per_thread_loop_count} + {thread_lane});")?;
104+
}
105+
Ok(())
106+
})?;
107+
108+
writeln!(output, "}}")
109+
}
110+
111+
fn generate_global_kernel_for_width<T: FastLanes, W: Write>(
112+
output: &mut IndentedWriter<W>,
113+
bit_width: usize,
114+
thread_count: usize,
115+
) -> io::Result<()> {
116+
let bits = <T>::T;
117+
118+
let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t");
119+
let func_params =
120+
format!("(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out)");
121+
122+
writeln!(
123+
output,
124+
"extern \"C\" __global__ void {func_name}{func_params} {{"
125+
)?;
126+
127+
output.indent(|output| {
128+
writeln!(output, "int thread_idx = threadIdx.x;")?;
129+
writeln!(
130+
output,
131+
"auto in = full_in + (blockIdx.x * (128 * {bit_width} / sizeof(uint{bits}_t)));"
132+
)?;
133+
writeln!(output, "auto out = full_out + (blockIdx.x * 1024);")?;
134+
135+
writeln!(output, "_{func_name}(in, out, thread_idx);")
136+
})?;
137+
138+
writeln!(output, "}}")
139+
}
140+
141+
/// # Errors
142+
///
143+
/// Will return Err if writing to the underlying writer fails.
144+
pub fn generate_cuda_unpack_for_width<T: FastLanes, W: Write>(
145+
output: &mut IndentedWriter<W>,
146+
thread_count: usize,
147+
) -> io::Result<()> {
148+
writeln!(output, "// AUTO-GENERATED. Do not edit by hand!")?;
149+
writeln!(output, "#include <cuda.h>")?;
150+
writeln!(output, "#include <cuda_runtime.h>")?;
151+
writeln!(output, "#include <stdint.h>")?;
152+
writeln!(output, "#include \"fastlanes_common.cuh\"")?;
153+
writeln!(output)?;
154+
155+
for bit_width in 0..=<T>::T {
156+
generate_lane_decoder::<T, _>(output, bit_width)?;
157+
writeln!(output)?;
158+
generate_device_kernel_for_width::<T, _>(output, bit_width, thread_count)?;
159+
writeln!(output)?;
160+
161+
generate_global_kernel_for_width::<T, _>(output, bit_width, thread_count)?;
162+
writeln!(output)?;
163+
}
164+
165+
Ok(())
166+
}

0 commit comments

Comments
 (0)