vortex-data
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎vortex-cuda/build.rs‎
Lines changed: 15 additions & 5 deletions b/‎vortex-cuda/build.rs‎
Lines changed: 15 additions & 5 deletions
diff --git a/‎vortex-cuda/kernels/src/bit_unpack.cuh‎
Lines changed: 14 additions & 14 deletions b/‎vortex-cuda/kernels/src/bit_unpack.cuh‎
Lines changed: 14 additions & 14 deletions
@@ -277,6 +277,7 @@ jobs:
           git ls-files vortex-cuda vortex-cxx vortex-duckdb vortex-ffi \
             | grep -E '\.(cpp|hpp|cu|cuh|h)$' \
             | grep -v 'kernels/src/bit_unpack_.*\.cu$' \
+            | grep -v 'kernels/src/bit_unpack_.*_lanes\.cuh$' \
             | xargs clang-format --dry-run --Werror --style=file
 
   rust-lint-no-default:
 
@@ -14,7 +14,8 @@ use std::process::Command;
 
 use fastlanes::FastLanes;
 
-use crate::bit_unpack_gen::generate_cuda_unpack;
+use crate::bit_unpack_gen::generate_cuda_unpack_kernels;
+use crate::bit_unpack_gen::generate_cuda_unpack_lanes;
 
 #[path = "src/bit_unpack_gen.rs"]
 pub mod bit_unpack_gen;
@@ -94,10 +95,19 @@ fn main() {
 }
 
 fn generate_unpack<T: FastLanes>(output_dir: &Path, thread_count: usize) -> io::Result<PathBuf> {
-    let path = output_dir.join(format!("bit_unpack_{}.cu", T::T));
-    let mut cu_file = File::create(&path)?;
-    generate_cuda_unpack::<T>(&mut cu_file, thread_count)?;
-    Ok(path)
+    // Generate the lanes header (.cuh) — device functions only, no __global__ kernels.
+    // This is what dynamic_dispatch.cu includes (via bit_unpack.cuh).
+    let cuh_path = output_dir.join(format!("bit_unpack_{}_lanes.cuh", T::T));
+    let mut cuh_file = File::create(&cuh_path)?;
+    generate_cuda_unpack_lanes::<T>(&mut cuh_file)?;
+
+    // Generate the standalone kernels (.cu) — includes the lanes header,
+    // adds _device template + __global__ wrappers. Compiled to its own PTX.
+    let cu_path = output_dir.join(format!("bit_unpack_{}.cu", T::T));
+    let mut cu_file = File::create(&cu_path)?;
+    generate_cuda_unpack_kernels::<T>(&mut cu_file, thread_count)?;
+
+    Ok(cu_path)
 }
 
 fn nvcc_compile_ptx(
 
@@ -7,10 +7,10 @@
 #include <cuda_runtime.h>
 #include <stdint.h>
 
-#include "bit_unpack_8.cu"
-#include "bit_unpack_16.cu"
-#include "bit_unpack_32.cu"
-#include "bit_unpack_64.cu"
+#include "bit_unpack_8_lanes.cuh"
+#include "bit_unpack_16_lanes.cuh"
+#include "bit_unpack_32_lanes.cuh"
+#include "bit_unpack_64_lanes.cuh"
 #include "patches.h"
 
 /// Decodes a single lane of packed data.
@@ -26,22 +26,22 @@
 /// * `lane` - Lane index within the block (used to determine which packed words to process)
 /// * `bit_width` - Number of bits with which each value is encoded
 template <typename T>
-__device__ inline void bit_unpack_lane(const T *__restrict packed_chunk,
-                                       T *__restrict output_buffer,
-                                       T reference,
-                                       unsigned int lane,
-                                       uint32_t bit_width);
+__device__ __noinline__ void bit_unpack_lane(const T *__restrict packed_chunk,
+                                             T *__restrict output_buffer,
+                                             T reference,
+                                             unsigned int lane,
+                                             uint32_t bit_width);
 
 /// Template specializations for `bit_unpack_lane` for different integer types.
 ///
 /// Generates template specializations for each supported integer size (8, 16, 32, 64 bits).
 #define BIT_UNPACK_LANE(bits)                                                                                \
     template <>                                                                                              \
-    __device__ inline void bit_unpack_lane<uint##bits##_t>(const uint##bits##_t *in,                         \
-                                                           uint##bits##_t *out,                              \
-                                                           uint##bits##_t reference,                         \
-                                                           unsigned int lane,                                \
-                                                           uint32_t bw) {                                    \
+    __device__ __noinline__ void bit_unpack_lane<uint##bits##_t>(const uint##bits##_t *in,                   \
+                                                                  uint##bits##_t *out,                       \
+                                                                  uint##bits##_t reference,                  \
+                                                                  unsigned int lane,                         \
+                                                                  uint32_t bw) {                             \
         bit_unpack_##bits##_lane(in, out, reference, lane, bw);                                              \
     }