keep SyncOnDrop alive until after dispatching for cuda slices (#6673)

onursatici · web-flow · commit 6c7cd94312e4 · 2026-02-26T11:02:01.000Z
## Summary

CUDA device pointers come with a `SyncOnDrop` guard that is used to
synchronise reads and writes to the underlying CUDA buffer. We should
keep these guards alive until after dispatching the read or write work
on the buffer. Dropping them early is harmless in our single-stream
case, but it would be a problem if multiple streams were to access the
same buffer.

---------

Signed-off-by: Onur Satici <onur@spiraldb.com>
diff --git a/vortex-cuda/benches/dynamic_dispatch_cuda.rs b/vortex-cuda/benches/dynamic_dispatch_cuda.rs
@@ -51,14 +51,16 @@ const BENCH_ARGS: &[(usize, &str)] = &[
 /// Launch the dynamic_dispatch kernel and return GPU-timed duration.
 fn run_timed(
     cuda_ctx: &mut CudaExecutionCtx,
-    output_ptr: u64,
     array_len: usize,
+    output_buf: &CudaDeviceBuffer,
     device_plan: &Arc<cudarc::driver::CudaSlice<DynamicDispatchPlan>>,
     shared_mem_bytes: u32,
 ) -> VortexResult<Duration> {
     let cuda_function = cuda_ctx.load_function("dynamic_dispatch", &["u32"])?;
     let array_len_u64 = array_len as u64;
-    let plan_ptr = device_plan.device_ptr(cuda_ctx.stream()).0;
+    let output_view = output_buf.as_view::<u32>();
+    let (output_ptr, record_output) = output_view.device_ptr(cuda_ctx.stream());
+    let (plan_ptr, record_plan) = device_plan.device_ptr(cuda_ctx.stream());
 
     let stream = cuda_ctx.stream();
     let ctx = stream.context();
@@ -86,6 +88,7 @@ fn run_timed(
             .launch(config)
             .map_err(|e| vortex_err!("kernel launch failed: {e}"))?;
     }
+    drop((record_output, record_plan));
 
     let stream = cuda_ctx.stream();
     let ctx = stream.context();
@@ -105,11 +108,10 @@ fn run_timed(
 struct BenchRunner {
     _plan: DynamicDispatchPlan,
     smem_bytes: u32,
-    output_ptr: u64,
     len: usize,
     // Keep alive
-    _device_plan: Arc<cudarc::driver::CudaSlice<DynamicDispatchPlan>>,
-    _output_buf: CudaDeviceBuffer,
+    device_plan: Arc<cudarc::driver::CudaSlice<DynamicDispatchPlan>>,
+    output_buf: CudaDeviceBuffer,
     _plan_buffers: Vec<vortex::array::buffer::BufferHandle>,
 }
 
@@ -130,15 +132,13 @@ impl BenchRunner {
             .device_alloc::<u32>(len.next_multiple_of(1024))
             .expect("alloc output");
         let output_buf = CudaDeviceBuffer::new(output_slice);
-        let output_ptr = output_buf.as_view::<u32>().device_ptr(cuda_ctx.stream()).0;
 
         Self {
             _plan: plan,
             smem_bytes,
-            output_ptr,
             len,
-            _device_plan: device_plan,
-            _output_buf: output_buf,
+            device_plan,
+            output_buf,
             _plan_buffers: plan_buffers,
         }
     }
@@ -147,9 +147,9 @@ impl BenchRunner {
         cuda_ctx.stream().synchronize().unwrap();
         run_timed(
             cuda_ctx,
-            self.output_ptr,
             self.len,
-            &self._device_plan,
+            &self.output_buf,
+            &self.device_plan,
             self.smem_bytes,
         )
         .unwrap()
diff --git a/vortex-cuda/benches/filter_cuda.rs b/vortex-cuda/benches/filter_cuda.rs
@@ -86,26 +86,33 @@ async fn run_filter_timed<T: CubFilterable + cudarc::driver::DeviceRepr>(
 
     // Get raw pointers
     let stream_ptr = stream.cu_stream() as cudaStream_t;
-    let d_input_ptr = d_input.device_ptr(stream).0 as *const T;
-    let d_bitmask_ptr = d_bitmask.device_ptr(stream).0 as *const u8;
-    let d_output_ptr = d_output.device_ptr_mut(stream).0 as *mut T;
-    let d_temp_ptr = d_temp.device_ptr_mut(stream).0 as *mut c_void;
-    let d_num_selected_ptr = d_num_selected.device_ptr_mut(stream).0 as *mut i64;
+    let (d_input_ptr, record_d_input) = d_input.device_ptr(stream);
+    let (d_bitmask_ptr, record_d_bitmask) = d_bitmask.device_ptr(stream);
+    let (d_output_ptr, record_d_output) = d_output.device_ptr_mut(stream);
+    let (d_temp_ptr, record_d_temp) = d_temp.device_ptr_mut(stream);
+    let (d_num_selected_ptr, record_d_num_selected) = d_num_selected.device_ptr_mut(stream);
 
     unsafe {
         T::filter_bitmask(
-            d_temp_ptr,
+            d_temp_ptr as *mut c_void,
             temp_bytes,
-            d_input_ptr,
-            d_bitmask_ptr,
+            d_input_ptr as *const T,
+            d_bitmask_ptr as *const u8,
             0, // bit_offset
-            d_output_ptr,
-            d_num_selected_ptr,
+            d_output_ptr as *mut T,
+            d_num_selected_ptr as *mut i64,
             num_items,
             stream_ptr,
         )
         .map_err(|e| vortex_err!("Filter kernel execution failed: {}", e))?;
     }
+    drop((
+        record_d_input,
+        record_d_bitmask,
+        record_d_output,
+        record_d_temp,
+        record_d_num_selected,
+    ));
 
     let end_event = ctx
         .new_event(Some(CUevent_flags::CU_EVENT_BLOCKING_SYNC))
diff --git a/vortex-cuda/benches/throughput_cuda.rs b/vortex-cuda/benches/throughput_cuda.rs
@@ -51,9 +51,9 @@ fn transfer_mix_timed(
         .device_alloc::<u32>((output_bytes / size_of::<u32>()).max(1))
         .unwrap();
 
-    let src_ptr = dtod_src.device_ptr(&in_stream).0;
-    let dst_ptr = dtod_dst.device_ptr_mut(&in_stream).0;
-    let memset_ptr = memset_dst.device_ptr_mut(&out_stream).0;
+    let (src_ptr, record_src) = dtod_src.device_ptr(&in_stream);
+    let (dst_ptr, record_dst) = dtod_dst.device_ptr_mut(&in_stream);
+    let (memset_ptr, record_memset) = memset_dst.device_ptr_mut(&out_stream);
 
     in_stream.synchronize().unwrap();
     out_stream.synchronize().unwrap();
@@ -76,6 +76,7 @@ fn transfer_mix_timed(
                 .unwrap();
         }
     }
+    drop((record_src, record_dst, record_memset));
 
     let end_in = in_stream
         .record_event(Some(CUevent_flags::CU_EVENT_BLOCKING_SYNC))
diff --git a/vortex-cuda/benches/zstd_cuda.rs b/vortex-cuda/benches/zstd_cuda.rs
@@ -78,22 +78,28 @@ async fn execute_zstd_kernel(
         .record(stream)
         .map_err(|e| vortex_err!("Failed to record start event: {:?}", e))?;
 
+    let (device_actual_sizes_ptr, record_actual_sizes) =
+        exec.device_actual_sizes.device_ptr_mut(stream);
+    let (nvcomp_temp_buffer_ptr, record_temp) = exec.nvcomp_temp_buffer.device_ptr_mut(stream);
+    let (device_statuses_ptr, record_statuses) = exec.device_statuses.device_ptr_mut(stream);
+
     // Launch the kernel
     unsafe {
         nvcomp_zstd::decompress_async(
             exec.frame_ptrs_ptr as _,
             exec.frame_sizes_ptr as _,
             exec.output_sizes_ptr as _,
-            exec.device_actual_sizes.device_ptr_mut(stream).0 as _,
+            device_actual_sizes_ptr as _,
             exec.num_frames,
-            exec.nvcomp_temp_buffer.device_ptr_mut(stream).0 as _,
+            nvcomp_temp_buffer_ptr as _,
             exec.nvcomp_temp_buffer_size,
             exec.output_ptrs_ptr as _,
-            exec.device_statuses.device_ptr_mut(stream).0 as _,
+            device_statuses_ptr as _,
             stream.cu_stream().cast(),
         )
         .map_err(|e| vortex_err!("nvcomp decompress_async failed: {}", e))?;
     }
+    drop((record_actual_sizes, record_temp, record_statuses));
 
     let end_event = ctx
         .new_event(Some(CUevent_flags::CU_EVENT_BLOCKING_SYNC))
diff --git a/vortex-cuda/src/dynamic_dispatch/mod.rs b/vortex-cuda/src/dynamic_dispatch/mod.rs
@@ -316,7 +316,7 @@ mod tests {
         data: &[u32],
     ) -> VortexResult<(u64, Arc<cudarc::driver::CudaSlice<u32>>)> {
         let device_buf = Arc::new(cuda_ctx.stream().clone_htod(data).expect("htod"));
-        let ptr = device_buf.device_ptr(cuda_ctx.stream()).0;
+        let (ptr, _) = device_buf.device_ptr(cuda_ctx.stream());
         Ok((ptr, device_buf))
     }
 
@@ -372,15 +372,16 @@ mod tests {
             .device_alloc::<u32>(output_len)
             .vortex_expect("alloc output");
         let output_buf = CudaDeviceBuffer::new(output_slice);
-        let output_ptr = output_buf.as_view::<u32>().device_ptr(cuda_ctx.stream()).0;
+        let output_view = output_buf.as_view::<u32>();
+        let (output_ptr, record_output) = output_view.device_ptr(cuda_ctx.stream());
 
         let device_plan = Arc::new(
             cuda_ctx
                 .stream()
                 .clone_htod(std::slice::from_ref(plan))
                 .expect("copy plan to device"),
         );
-        let plan_ptr = device_plan.device_ptr(cuda_ctx.stream()).0;
+        let (plan_ptr, record_plan) = device_plan.device_ptr(cuda_ctx.stream());
         let array_len_u64 = output_len as u64;
 
         cuda_ctx.stream().synchronize().expect("sync");
@@ -402,6 +403,7 @@ mod tests {
         unsafe {
             launch_builder.launch(config).expect("kernel launch");
         }
+        drop((record_output, record_plan));
 
         Ok(cuda_ctx
             .stream()
diff --git a/vortex-cuda/src/kernel/encodings/zstd.rs b/vortex-cuda/src/kernel/encodings/zstd.rs
@@ -122,16 +122,15 @@ pub async fn zstd_kernel_prepare(
     // Device pointers for all compressed frames.
     let frame_ptrs = device_frame_handles
         .iter()
-        .map(|handle| {
-            handle
-                .cuda_view::<u8>()
-                .map(|view| view.device_ptr(ctx.stream()).0)
-        })
+        .map(|handle| handle.cuda_device_ptr())
         .collect::<VortexResult<Vec<_>>>()?;
 
     // Build output_ptrs from output base pointer + offsets.
     let output_ptrs = {
-        let base_ptr = device_output.device_ptr(ctx.stream()).0;
+        // We only need the allocation address here to build pointer metadata.
+        // The actual device write is tracked by `record_device_output` around
+        // `decompress_async`, so this guard can be dropped immediately.
+        let (base_ptr, _) = device_output.device_ptr(ctx.stream());
         output_sizes
             .iter()
             .scan(0u64, |offset, &size| {
@@ -155,16 +154,10 @@ pub async fn zstd_kernel_prepare(
     let device_statuses: CudaSlice<nvcompStatus_t> = ctx.device_alloc(num_frames)?;
     let nvcomp_temp_buffer: CudaSlice<u8> = ctx.device_alloc(nvcomp_temp_buffer_size)?;
 
-    macro_rules! device_ptr {
-        ($handle:expr, $type:ty) => {
-            $handle.cuda_view::<$type>()?.device_ptr(ctx.stream()).0
-        };
-    }
-
-    let frame_ptrs_ptr = device_ptr!(frame_ptrs_handle, u64);
-    let frame_sizes_ptr = device_ptr!(frame_sizes_handle, usize);
-    let output_sizes_ptr = device_ptr!(output_sizes_handle, usize);
-    let output_ptrs_ptr = device_ptr!(output_ptrs_handle, u64);
+    let frame_ptrs_ptr = frame_ptrs_handle.cuda_device_ptr()?;
+    let frame_sizes_ptr = frame_sizes_handle.cuda_device_ptr()?;
+    let output_sizes_ptr = output_sizes_handle.cuda_device_ptr()?;
+    let output_ptrs_ptr = output_ptrs_handle.cuda_device_ptr()?;
 
     // Return device pointers and handles to keep device memory alive
     Ok(ZstdKernelPrep {
@@ -252,25 +245,65 @@ async fn decode_zstd(array: ZstdArray, ctx: &mut CudaExecutionCtx) -> VortexResu
     let mut exec = zstd_kernel_prepare(frames, &metadata, ctx).await?;
 
     let stream = ctx.stream();
+    let frame_views = exec
+        .device_frame_handles
+        .iter()
+        .map(|handle| handle.cuda_view::<u8>())
+        .collect::<VortexResult<Vec<_>>>()?;
+    let mut frame_ptr_records = Vec::with_capacity(frame_views.len());
+    for view in &frame_views {
+        let (_frame_ptr, record_frame_ptr) = view.device_ptr(stream);
+        frame_ptr_records.push(record_frame_ptr);
+    }
+
+    let frame_ptrs_view = exec.frame_ptrs_handle.cuda_view::<u64>()?;
+    let frame_sizes_view = exec.frame_sizes_handle.cuda_view::<usize>()?;
+    let output_sizes_view = exec.output_sizes_handle.cuda_view::<usize>()?;
+    let output_ptrs_view = exec.output_ptrs_handle.cuda_view::<u64>()?;
+
+    let (frame_ptrs_ptr, record_frame_ptrs) = frame_ptrs_view.device_ptr(stream);
+    let (frame_sizes_ptr, record_frame_sizes) = frame_sizes_view.device_ptr(stream);
+    let (output_sizes_ptr, record_output_sizes) = output_sizes_view.device_ptr(stream);
+    let (output_ptrs_ptr, record_output_ptrs) = output_ptrs_view.device_ptr(stream);
+
+    // Track writes to the output allocation at the actual enqueue point.
+    // This guard intentionally outlives the pointer-metadata construction above.
+    let (_device_output_ptr, record_device_output) = exec.device_output.device_ptr_mut(stream);
+    let (device_actual_sizes_ptr, record_actual_sizes) =
+        exec.device_actual_sizes.device_ptr_mut(stream);
+    let (nvcomp_temp_buffer_ptr, record_temp) = exec.nvcomp_temp_buffer.device_ptr_mut(stream);
+    let (device_statuses_ptr, record_statuses) = exec.device_statuses.device_ptr_mut(stream);
 
     ctx.launch_external(n_rows, || {
         // SAFETY: zstd_kernel_prepare makes sure to return valid kernel params.
         unsafe {
             nvcomp_zstd::decompress_async(
-                exec.frame_ptrs_ptr as _,
-                exec.frame_sizes_ptr as _,
-                exec.output_sizes_ptr as _,
-                exec.device_actual_sizes.device_ptr_mut(stream).0 as _,
+                frame_ptrs_ptr as _,
+                frame_sizes_ptr as _,
+                output_sizes_ptr as _,
+                device_actual_sizes_ptr as _,
                 exec.num_frames,
-                exec.nvcomp_temp_buffer.device_ptr_mut(stream).0 as _,
+                nvcomp_temp_buffer_ptr as _,
                 exec.nvcomp_temp_buffer_size,
-                exec.output_ptrs_ptr as _,
-                exec.device_statuses.device_ptr_mut(stream).0 as _,
+                output_ptrs_ptr as _,
+                device_statuses_ptr as _,
                 stream.cu_stream().cast(),
             )
             .map_err(|e| vortex_err!("nvcomp decompress_async failed: {}", e))
         }
     })?;
+    drop(frame_ptr_records);
+    drop(frame_views);
+    drop((
+        record_frame_ptrs,
+        record_frame_sizes,
+        record_output_sizes,
+        record_output_ptrs,
+        record_device_output,
+        record_actual_sizes,
+        record_temp,
+        record_statuses,
+    ));
 
     // Unconditionally copy back to the host as Zstd arrays are fully
     // self-contained. They neither have any parent or child encodings.
diff --git a/vortex-cuda/src/kernel/encodings/zstd_buffers.rs b/vortex-cuda/src/kernel/encodings/zstd_buffers.rs
diff --git a/vortex-cuda/src/kernel/filter/mod.rs b/vortex-cuda/src/kernel/filter/mod.rs
diff --git a/vortex-cuda/src/stream.rs b/vortex-cuda/src/stream.rs