chore: speed up CUDA kernel compilation (#7509)

0ax1 · web-flow · commit 6ebb797cbd60 · 2026-04-17T11:02:31.000Z
Previously, changing a .cu file triggered cargo to rerun vortex-cuda's
build script, which recompiled all CUDA kernels and regenerated bindgen
output, causing a full Rust recompilation (~20s). Now CUDA kernel
compilation lives in a standalone vortex-cuda-kernel-build workspace
member with no dependency link to vortex-cuda, so .cu changes only
rebuild PTX without touching Rust.

Changes:
- New vortex-cuda/kernel-build crate owns .cu/.cuh file watching, nvcc
compilation, and bit_unpack kernel generation
- Timestamp-based PTX skipping: only recompiles .cu files whose PTX is
older than the source or any header

---------

Signed-off-by: Alexander Droste &lt;alexander.droste@protonmail.com&gt;
diff --git a/vortex-cuda/build.rs b/vortex-cuda/build.rs
@@ -3,10 +3,11 @@
 
 #![expect(clippy::unwrap_used)]
 #![expect(clippy::expect_used)]
+#![expect(clippy::panic)]
 #![expect(clippy::use_debug)]
 
 use std::env;
-use std::fs::File;
+use std::fs;
 use std::io;
 use std::path::Path;
 use std::path::PathBuf;
@@ -30,11 +31,10 @@ fn main() {
     // Output directory for compiled .ptx files - separate by profile.
     let kernels_gen = Path::new(&manifest_dir).join("kernels/gen").join(&profile);
 
-    std::fs::create_dir_all(&kernels_gen).expect("Failed to create kernels/gen directory");
+    fs::create_dir_all(&kernels_gen).expect("Failed to create kernels/gen directory");
 
-    // Always emit the kernels output directory path as a compile-time env var so any binary
-    // linking against vortex-cuda can find the PTX files. This must be set regardless
-    // of CUDA availability since the code using env!() is always compiled.
+    // Emit the kernels output directory path as a compile-time env var so any binary
+    // linking against vortex-cuda can find the PTX files.
     // At runtime, VORTEX_CUDA_KERNELS_DIR can be set to override this path.
     println!(
         "cargo:rustc-env=VORTEX_CUDA_KERNELS_DIR={}",
@@ -43,7 +43,7 @@ fn main() {
 
     println!("cargo:rerun-if-env-changed=PROFILE");
 
-    // Regenerate bit_unpack kernels only when the generator changes
+    // Regenerate bit_unpack kernels only when the generator changes.
     println!(
         "cargo:rerun-if-changed={}",
         Path::new(&manifest_dir)
@@ -63,55 +63,73 @@ fn main() {
         return;
     }
 
-    // Watch and compile .cu and .cuh files from kernels/src to PTX in kernels/gen
-    if let Ok(entries) = std::fs::read_dir(&kernels_src) {
-        for path in entries.flatten().map(|entry| entry.path()) {
-            let is_generated = path
-                .file_name()
-                .and_then(|n| n.to_str())
-                .is_some_and(|n| n.starts_with("bit_unpack_"));
+    // Compile .cu files to PTX. We deliberately do NOT register .cu/.cuh files
+    // with rerun-if-changed so that editing a .cu file does not trigger Rust
+    // recompilation.
+    let mut cu_files = Vec::new();
+    let mut newest_header = std::time::SystemTime::UNIX_EPOCH;
 
+    if let Ok(entries) = fs::read_dir(&kernels_src) {
+        for path in entries.flatten().map(|entry| entry.path()) {
             match path.extension().and_then(|e| e.to_str()) {
                 Some("cuh") | Some("h") => {
-                    // Only watch hand-written .cuh/.h files, not generated ones
-                    // (generated files are rebuilt when cuda_kernel_generator changes)
-                    if !is_generated {
-                        println!("cargo:rerun-if-changed={}", path.display());
+                    if let Ok(mtime) = fs::metadata(&path).and_then(|m| m.modified()) {
+                        newest_header = newest_header.max(mtime);
                     }
                 }
                 Some("cu") => {
-                    // Only watch hand-written .cu files, not generated ones
-                    // (generated files are rebuilt when cuda_kernel_generator changes)
-                    if !is_generated {
-                        println!("cargo:rerun-if-changed={}", path.display());
-                    }
-                    // Compile all .cu files to PTX in gen directory
-                    nvcc_compile_ptx(&kernels_src, &kernels_gen, &path, &profile)
-                        .map_err(|e| {
-                            format!("Failed to compile CUDA kernel {}: {}", path.display(), e)
-                        })
-                        .unwrap();
+                    cu_files.push(path);
                 }
                 _ => {}
             }
         }
     }
+
+    // Only compile .cu files whose PTX is stale (older than the source or any header).
+    for cu_path in &cu_files {
+        let ptx_path = kernels_gen
+            .join(cu_path.file_name().unwrap())
+            .with_extension("ptx");
+
+        let cu_mtime = fs::metadata(cu_path)
+            .and_then(|m| m.modified())
+            .unwrap_or(std::time::SystemTime::UNIX_EPOCH);
+        let newest_input = cu_mtime.max(newest_header);
+
+        let ptx_mtime = fs::metadata(&ptx_path).and_then(|m| m.modified()).ok();
+        if ptx_mtime.is_some_and(|t| t >= newest_input) {
+            continue;
+        }
+
+        nvcc_compile_ptx(&kernels_src, &kernels_gen, cu_path, &profile)
+            .map_err(|e| format!("Failed to compile CUDA kernel {}: {}", cu_path.display(), e))
+            .unwrap();
+    }
 }
 
-fn generate_unpack<T: FastLanes>(output_dir: &Path, thread_count: usize) -> io::Result<PathBuf> {
+fn generate_unpack<T: FastLanes>(output_dir: &Path, thread_count: usize) -> io::Result<()> {
     // Generate the lanes header (.cuh) — device functions only, no __global__ kernels.
     // This is what dynamic_dispatch.cu includes (via bit_unpack.cuh).
     let cuh_path = output_dir.join(format!("bit_unpack_{}_lanes.cuh", T::T));
-    let mut cuh_file = File::create(&cuh_path)?;
-    generate_cuda_unpack_lanes::<T>(&mut cuh_file)?;
+    let mut cuh_buf = Vec::new();
+    generate_cuda_unpack_lanes::<T>(&mut cuh_buf)?;
+    write_if_changed(&cuh_path, &cuh_buf);
 
     // Generate the standalone kernels (.cu) — includes the lanes header,
     // adds _device template + __global__ wrappers. Compiled to its own PTX.
     let cu_path = output_dir.join(format!("bit_unpack_{}.cu", T::T));
-    let mut cu_file = File::create(&cu_path)?;
-    generate_cuda_unpack_kernels::<T>(&mut cu_file, thread_count)?;
+    let mut cu_buf = Vec::new();
+    generate_cuda_unpack_kernels::<T>(&mut cu_buf, thread_count)?;
+    write_if_changed(&cu_path, &cu_buf);
 
-    Ok(cu_path)
+    Ok(())
+}
+
+fn write_if_changed(path: &Path, content: &[u8]) {
+    if fs::read(path).is_ok_and(|existing| existing == content) {
+        return;
+    }
+    fs::write(path, content).unwrap_or_else(|e| panic!("Failed to write {}: {e}", path.display()));
 }
 
 fn nvcc_compile_ptx(
@@ -223,14 +241,13 @@ fn generate_patches_bindings(kernels_src: &Path, out_dir: &Path) {
         .derive_copy(true)
         .derive_debug(true)
         .generate()
-        .expect("Failed to generate dynamic_dispatch bindings");
+        .expect("Failed to generate patches bindings");
 
     bindings
         .write_to_file(out_dir.join("patches.rs"))
         .expect("Failed to write patches.rs");
 }
 
-/// Check if CUDA is available based on nvcc.
 fn is_cuda_available() -> bool {
     Command::new("nvcc")
         .arg("--version")