@@ -122,16 +122,15 @@ pub async fn zstd_kernel_prepare(
122122 // Device pointers for all compressed frames.
123123 let frame_ptrs = device_frame_handles
124124 . iter ( )
125- . map ( |handle| {
126- handle
127- . cuda_view :: < u8 > ( )
128- . map ( |view| view. device_ptr ( ctx. stream ( ) ) . 0 )
129- } )
125+ . map ( |handle| handle. cuda_device_ptr ( ) )
130126 . collect :: < VortexResult < Vec < _ > > > ( ) ?;
131127
132128 // Build output_ptrs from output base pointer + offsets.
133129 let output_ptrs = {
134- let base_ptr = device_output. device_ptr ( ctx. stream ( ) ) . 0 ;
130+ // We only need the allocation address here to build pointer metadata.
131+ // The actual device write is tracked by `record_device_output` around
132+ // `decompress_async`, so this guard can be dropped immediately.
133+ let ( base_ptr, _) = device_output. device_ptr ( ctx. stream ( ) ) ;
135134 output_sizes
136135 . iter ( )
137136 . scan ( 0u64 , |offset, & size| {
@@ -155,16 +154,10 @@ pub async fn zstd_kernel_prepare(
155154 let device_statuses: CudaSlice < nvcompStatus_t > = ctx. device_alloc ( num_frames) ?;
156155 let nvcomp_temp_buffer: CudaSlice < u8 > = ctx. device_alloc ( nvcomp_temp_buffer_size) ?;
157156
158- macro_rules! device_ptr {
159- ( $handle: expr, $type: ty) => {
160- $handle. cuda_view:: <$type>( ) ?. device_ptr( ctx. stream( ) ) . 0
161- } ;
162- }
163-
164- let frame_ptrs_ptr = device_ptr ! ( frame_ptrs_handle, u64 ) ;
165- let frame_sizes_ptr = device_ptr ! ( frame_sizes_handle, usize ) ;
166- let output_sizes_ptr = device_ptr ! ( output_sizes_handle, usize ) ;
167- let output_ptrs_ptr = device_ptr ! ( output_ptrs_handle, u64 ) ;
157+ let frame_ptrs_ptr = frame_ptrs_handle. cuda_device_ptr ( ) ?;
158+ let frame_sizes_ptr = frame_sizes_handle. cuda_device_ptr ( ) ?;
159+ let output_sizes_ptr = output_sizes_handle. cuda_device_ptr ( ) ?;
160+ let output_ptrs_ptr = output_ptrs_handle. cuda_device_ptr ( ) ?;
168161
169162 // Return device pointers and handles to keep device memory alive
170163 Ok ( ZstdKernelPrep {
@@ -252,25 +245,65 @@ async fn decode_zstd(array: ZstdArray, ctx: &mut CudaExecutionCtx) -> VortexResu
252245 let mut exec = zstd_kernel_prepare ( frames, & metadata, ctx) . await ?;
253246
254247 let stream = ctx. stream ( ) ;
248+ let frame_views = exec
249+ . device_frame_handles
250+ . iter ( )
251+ . map ( |handle| handle. cuda_view :: < u8 > ( ) )
252+ . collect :: < VortexResult < Vec < _ > > > ( ) ?;
253+ let mut frame_ptr_records = Vec :: with_capacity ( frame_views. len ( ) ) ;
254+ for view in & frame_views {
255+ let ( _frame_ptr, record_frame_ptr) = view. device_ptr ( stream) ;
256+ frame_ptr_records. push ( record_frame_ptr) ;
257+ }
258+
259+ let frame_ptrs_view = exec. frame_ptrs_handle . cuda_view :: < u64 > ( ) ?;
260+ let frame_sizes_view = exec. frame_sizes_handle . cuda_view :: < usize > ( ) ?;
261+ let output_sizes_view = exec. output_sizes_handle . cuda_view :: < usize > ( ) ?;
262+ let output_ptrs_view = exec. output_ptrs_handle . cuda_view :: < u64 > ( ) ?;
263+
264+ let ( frame_ptrs_ptr, record_frame_ptrs) = frame_ptrs_view. device_ptr ( stream) ;
265+ let ( frame_sizes_ptr, record_frame_sizes) = frame_sizes_view. device_ptr ( stream) ;
266+ let ( output_sizes_ptr, record_output_sizes) = output_sizes_view. device_ptr ( stream) ;
267+ let ( output_ptrs_ptr, record_output_ptrs) = output_ptrs_view. device_ptr ( stream) ;
268+
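269+ // Track writes to the output allocation at the actual enqueue point.
270+ // Unlike the short-lived guard taken in `zstd_kernel_prepare` (used only to compute pointer
270+ // metadata), this guard is deliberately held until after `launch_external` returns.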
271+ let ( _device_output_ptr, record_device_output) = exec. device_output . device_ptr_mut ( stream) ;
272+ let ( device_actual_sizes_ptr, record_actual_sizes) =
273+ exec. device_actual_sizes . device_ptr_mut ( stream) ;
274+ let ( nvcomp_temp_buffer_ptr, record_temp) = exec. nvcomp_temp_buffer . device_ptr_mut ( stream) ;
275+ let ( device_statuses_ptr, record_statuses) = exec. device_statuses . device_ptr_mut ( stream) ;
255276
256277 ctx. launch_external ( n_rows, || {
257278 // SAFETY: zstd_kernel_prepare makes sure to return valid kernel params.
258279 unsafe {
259280 nvcomp_zstd:: decompress_async (
260- exec . frame_ptrs_ptr as _ ,
261- exec . frame_sizes_ptr as _ ,
262- exec . output_sizes_ptr as _ ,
263- exec . device_actual_sizes . device_ptr_mut ( stream ) . 0 as _ ,
281+ frame_ptrs_ptr as _ ,
282+ frame_sizes_ptr as _ ,
283+ output_sizes_ptr as _ ,
284+ device_actual_sizes_ptr as _ ,
264285 exec. num_frames ,
265- exec . nvcomp_temp_buffer . device_ptr_mut ( stream ) . 0 as _ ,
286+ nvcomp_temp_buffer_ptr as _ ,
266287 exec. nvcomp_temp_buffer_size ,
267- exec . output_ptrs_ptr as _ ,
268- exec . device_statuses . device_ptr_mut ( stream ) . 0 as _ ,
288+ output_ptrs_ptr as _ ,
289+ device_statuses_ptr as _ ,
269290 stream. cu_stream ( ) . cast ( ) ,
270291 )
271292 . map_err ( |e| vortex_err ! ( "nvcomp decompress_async failed: {}" , e) )
272293 }
273294 } ) ?;
295+ drop ( frame_ptr_records) ;
296+ drop ( frame_views) ;
297+ drop ( (
298+ record_frame_ptrs,
299+ record_frame_sizes,
300+ record_output_sizes,
301+ record_output_ptrs,
302+ record_device_output,
303+ record_actual_sizes,
304+ record_temp,
305+ record_statuses,
306+ ) ) ;
274307
275308 // Unconditionally copy back to the host as Zstd arrays are fully
276309 // self-contained. They neither have any parent or child encodings.
0 commit comments