fix: remove `block_on` call in cuda_execute

0ax1 · 0ax1 · commit d9dd093e064d · 2026-03-24T16:57:07.000Z
Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
diff --git a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs
@@ -9,7 +9,6 @@
 
 use std::sync::Arc;
 
-use futures::executor::block_on;
 use vortex::array::ArrayRef;
 use vortex::array::DynArray;
 use vortex::array::ExecutionCtx;
@@ -324,7 +323,18 @@ impl PlanBuilderState<'_> {
     fn walk_primitive(&mut self, array: ArrayRef) -> VortexResult<Pipeline> {
         let prim = array.to_canonical()?.into_primitive();
         let PrimitiveArrayParts { buffer, .. } = prim.into_parts();
-        let device_buf = block_on(self.ctx.ensure_on_device(buffer))?;
+
+        // TODO(0ax1): Optimize device buffer allocation and copying.
+        //
+        // Ideally, there would be a buffer pool of preallocated device memory
+        // such that retrieving a device pointer is O(1) when building the
+        // dynamic dispatch plan. In the current setup, we need to allocate the
+        // buffer before we can get the device pointer. As the memory is
+        // allocated via the global allocator, which does not pin the host
+        // memory to physical addresses unlike `cudaHostAlloc`, the subsequent
+        // memory copy from host to device is synchronous and cannot be pushed to the
+        // CUDA stream as an async operation.
+        let device_buf = self.ctx.ensure_on_device_sync(buffer)?;
         let ptr = device_buf.cuda_device_ptr()?;
         self.device_buffers.push(device_buf);
         Ok(Pipeline {
@@ -354,7 +364,7 @@ impl PlanBuilderState<'_> {
             vortex_bail!("Dynamic dispatch does not support BitPackedArray with patches");
         }
 
-        let device_buf = block_on(self.ctx.ensure_on_device(packed))?;
+        let device_buf = self.ctx.ensure_on_device_sync(packed)?;
         let ptr = device_buf.cuda_device_ptr()?;
         self.device_buffers.push(device_buf);
         Ok(Pipeline {
@@ -490,14 +500,26 @@ impl PlanBuilderState<'_> {
 }
 
 /// Extract a FoR reference scalar as u64 bits.
+///
+/// `TryFrom<&Scalar>` for primitive types requires an exact ptype match,
+/// so we must try each width individually rather than relying on widening.
 fn extract_for_reference(for_arr: &FoRArray) -> VortexResult<u64> {
-    if let Ok(v) = u32::try_from(for_arr.reference_scalar()) {
+    let s = for_arr.reference_scalar();
+    if let Ok(v) = u8::try_from(s) {
+        Ok(v as u64)
+    } else if let Ok(v) = i8::try_from(s) {
+        Ok(v as u8 as u64)
+    } else if let Ok(v) = u16::try_from(s) {
+        Ok(v as u64)
+    } else if let Ok(v) = i16::try_from(s) {
+        Ok(v as u16 as u64)
+    } else if let Ok(v) = u32::try_from(s) {
         Ok(v as u64)
-    } else if let Ok(v) = i32::try_from(for_arr.reference_scalar()) {
+    } else if let Ok(v) = i32::try_from(s) {
         Ok(v as u32 as u64)
-    } else if let Ok(v) = u64::try_from(for_arr.reference_scalar()) {
+    } else if let Ok(v) = u64::try_from(s) {
         Ok(v)
-    } else if let Ok(v) = i64::try_from(for_arr.reference_scalar()) {
+    } else if let Ok(v) = i64::try_from(s) {
         Ok(v as u64)
     } else {
         vortex_bail!("Cannot extract FoR reference as an integer type")
diff --git a/vortex-cuda/src/executor.rs b/vortex-cuda/src/executor.rs
@@ -252,6 +252,22 @@ impl CudaExecutionCtx {
         self.stream.copy_to_device(host_buffer)?.await
     }
 
+    /// Synchronous variant of [`ensure_on_device`](Self::ensure_on_device).
+    ///
+    /// Safe to call from within an async executor (no nested `block_on`).
+    /// The copy is enqueued on the stream and completes before any subsequent
+    /// work on the same stream.
+    pub fn ensure_on_device_sync(&self, handle: BufferHandle) -> VortexResult<BufferHandle> {
+        if handle.is_on_device() {
+            return Ok(handle);
+        }
+        let host_buffer = handle
+            .as_host_opt()
+            .ok_or_else(|| vortex_err!("Buffer is not on host"))?
+            .clone();
+        self.stream.copy_to_device_sync(host_buffer.as_ref())
+    }
+
     /// Returns a reference to the underlying [`VortexCudaStream`].
     ///
     /// Through [`Deref`][std::ops::Deref], this also provides access to the
diff --git a/vortex-cuda/src/stream.rs b/vortex-cuda/src/stream.rs
@@ -89,6 +89,28 @@ impl VortexCudaStream {
             Ok(BufferHandle::new_device(Arc::new(cuda_buf)))
         }))
     }
+
+    /// Synchronous variant of [`copy_to_device`](Self::copy_to_device).
+    ///
+    /// Allocates device memory, enqueues the H2D copy on the stream, and
+    /// returns immediately. The device pointer is valid as soon as this call
+    /// returns; the copy completes before any later work on the same stream.
+    ///
+    /// For **pageable** host memory (the common case), `memcpy_htod` stages
+    /// the source into a driver-managed pinned buffer before returning, so
+    /// the source data is safe to drop after this call.
+    pub(crate) fn copy_to_device_sync<T>(&self, data: &[T]) -> VortexResult<BufferHandle>
+    where
+        T: DeviceRepr + Debug + Send + Sync + 'static,
+    {
+        let mut cuda_slice: CudaSlice<T> = self.device_alloc(data.len())?;
+
+        self.memcpy_htod(data, &mut cuda_slice)
+            .map_err(|e| vortex_err!("Failed to schedule H2D copy: {}", e))?;
+
+        let cuda_buf = CudaDeviceBuffer::new(cuda_slice);
+        Ok(BufferHandle::new_device(Arc::new(cuda_buf)))
+    }
 }
 
 /// Registers a callback and asynchronously waits for its completion.