cuda ctx bind to thread before unsafe (#6681)

onursatici · web-flow · commit ee2916957cb0 · 2026-02-26T12:29:46.000Z
Bind the CUDA context to the current thread before making unsafe cudarc
calls. These lower-level calls do not set the thread-local CUDA context that
some calls expect; cudarc's safe methods do this internally.

This PR also wraps the zstd buffers' nvcomp decompression call in
`launch_external`, matching the other kernels.

---------

Signed-off-by: Onur Satici <onur@spiraldb.com>
diff --git a/vortex-cuda/src/device_buffer.rs b/vortex-cuda/src/device_buffer.rs
@@ -244,6 +244,11 @@ impl DeviceBuffer for CudaDeviceBuffer {
             ByteBufferMut::with_capacity_aligned(self.len, alignment);
         let len = self.len;
 
+        stream
+            .context()
+            .bind_to_thread()
+            .map_err(|e| vortex_err!("Failed to bind CUDA context: {}", e))?;
+
         // SAFETY: We pass a valid pointer to a buffer with sufficient capacity.
         // `cuMemcpyDtoHAsync_v2` fully initializes the memory.
         unsafe {
diff --git a/vortex-cuda/src/kernel/encodings/zstd_buffers.rs b/vortex-cuda/src/kernel/encodings/zstd_buffers.rs
@@ -106,7 +106,6 @@ async fn decode_zstd_buffers(
     let mut device_statuses: CudaSlice<nvcompStatus_t> = ctx.device_alloc(plan.num_frames())?;
     let mut nvcomp_temp_buffer: CudaSlice<u8> = ctx.device_alloc(nvcomp_temp_buffer_size)?;
     let stream = ctx.stream();
-
     let frame_ptrs_view = frame_ptrs_handle.cuda_view::<u64>()?;
     let frame_sizes_view = frame_sizes_handle.cuda_view::<usize>()?;
     let output_sizes_view = output_sizes_handle.cuda_view::<usize>()?;
@@ -123,21 +122,25 @@ async fn decode_zstd_buffers(
     let (device_actual_sizes_ptr, record_actual_sizes) = device_actual_sizes.device_ptr_mut(stream);
     let (nvcomp_temp_buffer_ptr, record_temp) = nvcomp_temp_buffer.device_ptr_mut(stream);
     let (device_statuses_ptr, record_statuses) = device_statuses.device_ptr_mut(stream);
-    unsafe {
-        nvcomp_zstd::decompress_async(
-            frame_ptrs_ptr as _,
-            frame_sizes_ptr as _,
-            output_sizes_ptr as _,
-            device_actual_sizes_ptr as _,
-            plan.num_frames(),
-            nvcomp_temp_buffer_ptr as _,
-            nvcomp_temp_buffer_size,
-            output_ptrs_ptr as _,
-            device_statuses_ptr as _,
-            stream.cu_stream().cast(),
-        )
-        .map_err(|e| vortex_err!("nvcomp decompress_async failed: {}", e))?;
-    }
+
+    ctx.launch_external(plan.output_size_total(), || {
+        // SAFETY: Pointer and size parameters are derived from validated decode plan inputs.
+        unsafe {
+            nvcomp_zstd::decompress_async(
+                frame_ptrs_ptr as _,
+                frame_sizes_ptr as _,
+                output_sizes_ptr as _,
+                device_actual_sizes_ptr as _,
+                plan.num_frames(),
+                nvcomp_temp_buffer_ptr as _,
+                nvcomp_temp_buffer_size,
+                output_ptrs_ptr as _,
+                device_statuses_ptr as _,
+                stream.cu_stream().cast(),
+            )
+            .map_err(|e| vortex_err!("nvcomp decompress_async failed: {}", e))
+        }
+    })?;
     drop(frame_ptr_records);
     drop(frame_views);
     drop((
diff --git a/vortex-cuda/src/stream.rs b/vortex-cuda/src/stream.rs
@@ -68,6 +68,16 @@ impl VortexCudaStream {
         let mut cuda_slice: CudaSlice<T> = self.device_alloc(host_slice.len())?;
         let (device_ptr, record_write) = cuda_slice.device_ptr_mut(&self.0);
 
+        // The unsafe `memcpy_htod_async` expects the thread-local CUDA context
+        // to be set. Bind it explicitly here to avoid an invalid-context error
+        // from the CUDA call.
+        // TODO(os): wrap unsafe cudarc calls in a helper that always binds the
+        //           context so we don't forget.
+        self.0
+            .context()
+            .bind_to_thread()
+            .map_err(|e| vortex_err!("Failed to bind CUDA context: {}", e))?;
+
         unsafe {
             memcpy_htod_async(device_ptr, host_slice, self.0.cu_stream())
                 .map_err(|e| vortex_err!("Failed to schedule async copy to device: {}", e))?;
@@ -127,6 +137,11 @@ fn register_stream_callback(stream: &CudaStream) -> VortexResult<kanal::AsyncRec
 
     let tx_ptr = Box::into_raw(Box::new(tx));
 
+    stream
+        .context()
+        .bind_to_thread()
+        .map_err(|e| vortex_err!("Failed to bind CUDA context: {}", e))?;
+
     /// Called from CUDA driver thread when all preceding work on the stream completes.
     unsafe extern "C" fn callback(user_data: *mut std::ffi::c_void) {
         // SAFETY: The memory of `tx` is manually managed has not been freed