Skip to content

Commit 68dc253

Browse files
authored
Revert "fix: build CUDA kernels as multi-arch fatbin with PTX fallback (#8047)"
This reverts commit f852d72.
1 parent c54ce7e commit 68dc253

5 files changed

Lines changed: 36 additions & 40 deletions

File tree

.gitignore

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -194,9 +194,6 @@ benchmarks/.out
194194
*.parquet
195195
!docs/_static/example.parquet
196196

197-
# Generated CUDA kernel artifacts
198-
*.fatbin
199-
200197
# TPC-H benchmarking data
201198
/data/
202199

vortex-cuda/build.rs

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#![expect(clippy::use_debug)]
77

88
use std::env;
9-
use std::fs;
9+
use std::fs::File;
1010
use std::io;
1111
use std::path::Path;
1212
use std::path::PathBuf;
@@ -27,13 +27,13 @@ fn main() {
2727

2828
// Source directory for kernels (hand-written and generated .cu/.cuh files)
2929
let kernels_src = Path::new(&manifest_dir).join("kernels/src");
30-
// Output directory for compiled CUDA module files - separate by profile.
30+
// Output directory for compiled .ptx files - separate by profile.
3131
let kernels_gen = Path::new(&manifest_dir).join("kernels/gen").join(&profile);
3232

33-
fs::create_dir_all(&kernels_gen).expect("Failed to create kernels/gen directory");
33+
std::fs::create_dir_all(&kernels_gen).expect("Failed to create kernels/gen directory");
3434

3535
// Always emit the kernels output directory path as a compile-time env var so any binary
36-
// linking against vortex-cuda can find the CUDA module files. This must be set regardless
36+
// linking against vortex-cuda can find the PTX files. This must be set regardless
3737
// of CUDA availability since the code using env!() is always compiled.
3838
// At runtime, VORTEX_CUDA_KERNELS_DIR can be set to override this path.
3939
println!(
@@ -64,8 +64,8 @@ fn main() {
6464
return;
6565
}
6666

67-
// Watch and compile .cu and .cuh files from kernels/src to CUDA modules in kernels/gen
68-
if let Ok(entries) = fs::read_dir(&kernels_src) {
67+
// Watch and compile .cu and .cuh files from kernels/src to PTX in kernels/gen
68+
if let Ok(entries) = std::fs::read_dir(&kernels_src) {
6969
for path in entries.flatten().map(|entry| entry.path()) {
7070
let is_generated = path
7171
.file_name()
@@ -86,8 +86,8 @@ fn main() {
8686
if !is_generated {
8787
println!("cargo:rerun-if-changed={}", path.display());
8888
}
89-
// Compile all .cu files to CUDA fatbins in gen directory
90-
nvcc_compile_fatbin(&kernels_src, &kernels_gen, &path, &profile)
89+
// Compile all .cu files to PTX in gen directory
90+
nvcc_compile_ptx(&kernels_src, &kernels_gen, &path, &profile)
9191
.map_err(|e| {
9292
format!("Failed to compile CUDA kernel {}: {}", path.display(), e)
9393
})
@@ -103,19 +103,19 @@ fn generate_unpack<T: FastLanes>(output_dir: &Path, thread_count: usize) -> io::
103103
// Generate the lanes header (.cuh) — device functions only, no __global__ kernels.
104104
// This is what dynamic_dispatch.cu includes (via bit_unpack.cuh).
105105
let cuh_path = output_dir.join(format!("bit_unpack_{}_lanes.cuh", T::T));
106-
let mut cuh_file = fs::File::create(&cuh_path)?;
106+
let mut cuh_file = File::create(&cuh_path)?;
107107
generate_cuda_unpack_lanes::<T>(&mut cuh_file)?;
108108

109109
// Generate the standalone kernels (.cu) — includes the lanes header,
110-
// adds _device template + __global__ wrappers. Compiled to its own CUDA module.
110+
// adds _device template + __global__ wrappers. Compiled to its own PTX.
111111
let cu_path = output_dir.join(format!("bit_unpack_{}.cu", T::T));
112-
let mut cu_file = fs::File::create(&cu_path)?;
112+
let mut cu_file = File::create(&cu_path)?;
113113
generate_cuda_unpack_kernels::<T>(&mut cu_file, thread_count)?;
114114

115115
Ok(cu_path)
116116
}
117117

118-
fn nvcc_compile_fatbin(
118+
fn nvcc_compile_ptx(
119119
include_dir: &Path,
120120
output_dir: &Path,
121121
cu_path: &Path,
@@ -148,24 +148,23 @@ fn nvcc_compile_fatbin(
148148
cmd.arg("-O3");
149149
}
150150

151-
// Output CUDA fatbin file goes to output_dir with same base name.
152-
let fatbin_path = output_dir
151+
// Output PTX file goes to output_dir with same base name
152+
let ptx_path = output_dir
153153
.join(cu_path.file_name().unwrap())
154-
.with_extension("fatbin");
154+
.with_extension("ptx");
155155

156-
// Embed a single PTX image for Ampere and newer GPUs. The driver JIT-compiles
157-
// PTX to the target GPU's SASS at runtime.
158156
cmd.arg("-std=c++20")
159-
.arg("-gencode=arch=compute_80,code=compute_80")
157+
.arg("-arch=native")
160158
// Flags forwarded to Clang.
161159
.arg("--compiler-options=-Wall -Wextra -Wpedantic -Werror")
162160
.arg("--restrict")
163-
.arg("--fatbin")
161+
.arg("--ptx")
164162
.arg("--include-path")
165163
.arg(include_dir)
164+
.arg("-c")
166165
.arg(cu_path)
167166
.arg("-o")
168-
.arg(&fatbin_path);
167+
.arg(&ptx_path);
169168

170169
let res = cmd.output()?;
171170

vortex-cuda/src/executor.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ impl CudaExecutionCtx {
187187
///
188188
/// # Arguments
189189
///
190-
/// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`)
190+
/// * `module_name` - Name of the module (`kernels/{module_name}.ptx`)
191191
/// * `ptypes` - List of ptype strings for the kernel name
192192
///
193193
/// # Errors
@@ -212,7 +212,7 @@ impl CudaExecutionCtx {
212212
///
213213
/// # Arguments
214214
///
215-
/// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`)
215+
/// * `module_name` - Name of the module (`kernels/{module_name}.ptx`)
216216
/// * `type_suffixes` - List of type suffix strings for the kernel name
217217
///
218218
/// # Errors

vortex-cuda/src/kernel/mod.rs

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -191,9 +191,9 @@ pub(crate) fn launch_cuda_kernel_with_config(
191191
}
192192
}
193193

194-
/// Loader for CUDA kernels with module caching.
194+
/// Loader for CUDA kernels with PTX caching.
195195
///
196-
/// Handles loading CUDA module files and functions.
196+
/// Handles loading PTX files, compiling modules, and loading functions.
197197
#[derive(Debug)]
198198
pub(crate) struct KernelLoader {
199199
/// Cache of loaded CUDA modules, keyed by module name
@@ -215,7 +215,7 @@ impl KernelLoader {
215215
///
216216
/// # Arguments
217217
///
218-
/// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`)
218+
/// * `module_name` - Name of the module (`kernels/{module_name}.ptx`)
219219
/// * `type_suffixes` - List of type suffix strings for the kernel name (`kernel_i128`)
220220
/// * `cuda_context` - CUDA context for loading the module
221221
pub fn load_function(
@@ -235,16 +235,16 @@ impl KernelLoader {
235235
let module = if let Some(entry) = self.modules.get(module_name) {
236236
Arc::clone(entry.value())
237237
} else {
238-
let module_path = Self::path_for_module(module_name);
238+
let ptx_path = Self::ptx_path_for_module(module_name);
239239

240-
// Load the CUDA module.
240+
// Compile and load the CUDA module.
241241
let module = cuda_context
242-
.load_module(Ptx::from_file(&module_path))
242+
.load_module(Ptx::from_file(&ptx_path))
243243
.map_err(|e| {
244244
vortex_err!(
245-
"Failed to load CUDA module {}, module path {}: {}",
245+
"Failed to load CUDA module {}, ptx path {}: {}",
246246
module_name,
247-
module_path.display(),
247+
ptx_path.display(),
248248
e
249249
)
250250
})?;
@@ -262,7 +262,7 @@ impl KernelLoader {
262262
.map_err(|e| vortex_err!("Failed to load kernel function '{}': {}", kernel_name, e))
263263
}
264264

265-
/// Returns the CUDA module file path for a given module name.
265+
/// Returns the PTX file path for a given module name.
266266
///
267267
/// Checks for `VORTEX_CUDA_KERNELS_DIR` environment variable at runtime first,
268268
/// falling back to the path baked in at compile time by build.rs.
@@ -273,11 +273,11 @@ impl KernelLoader {
273273
///
274274
/// # Returns
275275
///
276-
/// The full path to the CUDA module file
277-
fn path_for_module(module_name: &str) -> PathBuf {
276+
/// The full path to the PTX file
277+
fn ptx_path_for_module(module_name: &str) -> PathBuf {
278278
let kernels_dir = std::env::var("VORTEX_CUDA_KERNELS_DIR")
279279
.unwrap_or_else(|_| env!("VORTEX_CUDA_KERNELS_DIR").to_string());
280-
Path::new(&kernels_dir).join(format!("{}.fatbin", module_name))
280+
Path::new(&kernels_dir).join(format!("{}.ptx", module_name))
281281
}
282282
}
283283

vortex-cuda/src/session.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ const DEFAULT_STREAM_POOL_CAPACITY: usize = 4;
2929
/// CUDA session for GPU accelerated execution.
3030
///
3131
/// Maintains a registry of CUDA kernel implementations for array encodings.
32-
/// Holds the CUDA context for all GPU operations and caches loaded CUDA modules.
32+
/// Holds the CUDA context for all GPU operations and caches compiled PTX modules.
3333
#[derive(Clone, Debug)]
3434
pub struct CudaSession {
3535
context: Arc<CudaContext>,
@@ -113,12 +113,12 @@ impl CudaSession {
113113
///
114114
/// # Arguments
115115
///
116-
/// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`)
116+
/// * `module_name` - Name of the module (`kernels/{module_name}.ptx`)
117117
/// * `type_suffixes` - List of type suffix strings to generate kernel name
118118
///
119119
/// # Errors
120120
///
121-
/// Returns an error if the CUDA module file cannot be read or kernel cannot be loaded.
121+
/// Returns an error if PTX file cannot be read or kernel cannot be loaded.
122122
pub fn load_function_with_suffixes(
123123
&self,
124124
module_name: &str,

0 commit comments

Comments
 (0)