Skip to content

Commit 7f541db

Browse files
authored
feat(cuda): hybrid GPU dispatch - fuse dyn + standalone kernels (#7127)
Add a hybrid_dispatch module that integrates subtrees requiring standalone kernel dispatch with dynamic-dispatch kernels. Subtrees with unsupported encodings (e.g. Zstd) are executed as separate kernels, and their resulting device buffers are fed back into the fused plan as `LOAD` ops. Note that this implicitly enables filtering via the CUDA CUB filter implementation. Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 1c8667c commit 7f541db

6 files changed

Lines changed: 596 additions & 1 deletion

File tree

vortex-cuda/benches/dynamic_dispatch_cuda.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ const BENCH_ARGS: &[(usize, &str)] = &[
5050
];
5151

5252
/// Launch the dynamic_dispatch kernel and return GPU-timed duration.
53+
///
54+
/// This deliberately does not use `DynamicDispatchPlan::execute` because the
55+
/// benchmark pre-allocates the output buffer and device plan once, then reuses
56+
/// them across iterations.
5357
fn run_timed(
5458
cuda_ctx: &mut CudaExecutionCtx,
5559
array_len: usize,

vortex-cuda/src/dynamic_dispatch/mod.rs

Lines changed: 100 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,27 @@
1919
#![allow(non_snake_case)]
2020
#![allow(clippy::cast_possible_truncation)]
2121

22-
mod plan_builder;
22+
use std::sync::Arc;
23+
24+
use cudarc::driver::DevicePtr;
25+
use cudarc::driver::LaunchConfig;
26+
use cudarc::driver::PushKernelArg;
27+
use vortex::array::Canonical;
28+
use vortex::array::arrays::PrimitiveArray;
29+
use vortex::array::buffer::BufferHandle;
30+
use vortex::array::buffer::DeviceBufferExt;
31+
use vortex::array::match_each_unsigned_integer_ptype;
32+
use vortex::array::validity::Validity;
33+
use vortex::dtype::Nullability;
34+
use vortex::dtype::PType;
35+
use vortex::error::VortexResult;
36+
use vortex::error::vortex_bail;
37+
use vortex::error::vortex_err;
38+
39+
use crate::CudaDeviceBuffer;
40+
use crate::executor::CudaExecutionCtx;
41+
42+
pub(crate) mod plan_builder;
2343
pub use plan_builder::build_plan;
2444

2545
include!(concat!(env!("OUT_DIR"), "/dynamic_dispatch.rs"));
@@ -201,6 +221,85 @@ impl DynamicDispatchPlan {
201221
}
202222
max_end * elem_size
203223
}
224+
225+
/// Allocate output, upload the plan to the device, and launch the
226+
/// `dynamic_dispatch` kernel.
227+
///
228+
/// The CUDA kernels are instantiated for unsigned types only.
229+
/// Encoding transforms (FoR, ZigZag, ALP) are bit-identical
230+
/// regardless of signedness.
231+
///
232+
/// `CudaSlice::drop` enqueues `free` on the stream after kernel execution.
233+
pub fn execute(
234+
self,
235+
output_ptype: PType,
236+
len: usize,
237+
device_buffers: Vec<BufferHandle>,
238+
ctx: &mut CudaExecutionCtx,
239+
) -> VortexResult<Canonical> {
240+
let unsigned_ptype = match output_ptype {
241+
PType::U8 | PType::I8 => PType::U8,
242+
PType::U16 | PType::I16 => PType::U16,
243+
PType::U32 | PType::I32 | PType::F32 => PType::U32,
244+
PType::U64 | PType::I64 => PType::U64,
245+
other => vortex_bail!("dynamic dispatch does not support PType {:?}", other),
246+
};
247+
match_each_unsigned_integer_ptype!(unsigned_ptype, |T| {
248+
self.execute_typed::<T>(output_ptype, len, device_buffers, ctx)
249+
})
250+
}
251+
252+
fn execute_typed<T>(
253+
self,
254+
output_ptype: PType,
255+
len: usize,
256+
device_buffers: Vec<BufferHandle>,
257+
ctx: &mut CudaExecutionCtx,
258+
) -> VortexResult<Canonical>
259+
where
260+
T: cudarc::driver::DeviceRepr + vortex::dtype::NativePType,
261+
{
262+
if len == 0 {
263+
return Ok(Canonical::Primitive(PrimitiveArray::empty::<T>(
264+
Nullability::NonNullable,
265+
)));
266+
}
267+
268+
let output_buf = CudaDeviceBuffer::new(ctx.device_alloc::<T>(len.next_multiple_of(1024))?);
269+
let device_plan = Arc::new(
270+
ctx.stream()
271+
.clone_htod(std::slice::from_ref(&self))
272+
.map_err(|e| vortex_err!("copy plan to device: {e}"))?,
273+
);
274+
275+
let shared_mem_bytes = self.shared_mem_bytes::<T>();
276+
let cuda_function = ctx.load_function("dynamic_dispatch", &[T::PTYPE])?;
277+
let num_blocks = u32::try_from(len.div_ceil(2048))?;
278+
let config = LaunchConfig {
279+
grid_dim: (num_blocks, 1, 1),
280+
block_dim: (64, 1, 1),
281+
shared_mem_bytes,
282+
};
283+
284+
let output_ptr = output_buf.offset_ptr();
285+
let plan_ptr = device_plan.device_ptr(ctx.stream()).0;
286+
let array_len_u64 = len as u64;
287+
288+
ctx.launch_kernel_config(&cuda_function, config, len, |args| {
289+
args.arg(&output_ptr);
290+
args.arg(&array_len_u64);
291+
args.arg(&plan_ptr);
292+
})?;
293+
294+
drop(device_buffers);
295+
drop(device_plan);
296+
297+
Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle(
298+
BufferHandle::new_device(output_buf.slice_typed::<T>(0..len)),
299+
output_ptype,
300+
Validity::NonNullable,
301+
)))
302+
}
204303
}
205304

206305
#[cfg(test)]

vortex-cuda/src/dynamic_dispatch/plan_builder.rs

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
//! to the device, computes shared memory offsets, and produces a plan that the
88
//! dynamic dispatch kernel can execute in a single launch.
99
10+
use std::sync::Arc;
11+
1012
use futures::executor::block_on;
1113
use vortex::array::ArrayRef;
1214
use vortex::array::DynArray;
@@ -102,11 +104,30 @@ pub fn build_plan(
102104
array: &ArrayRef,
103105
ctx: &CudaExecutionCtx,
104106
) -> VortexResult<(DynamicDispatchPlan, Vec<BufferHandle>)> {
107+
build_plan_with_subtrees(array, ctx, &[])
108+
}
109+
110+
/// Build a [`DynamicDispatchPlan`] with subtrees run as separate
111+
/// kernels, whose output device buffers are integrated into the plan as `LOAD` inputs.
112+
pub fn build_plan_with_subtrees(
113+
array: &ArrayRef,
114+
ctx: &CudaExecutionCtx,
115+
subtree_inputs: &[(ArrayRef, BufferHandle)],
116+
) -> VortexResult<(DynamicDispatchPlan, Vec<BufferHandle>)> {
117+
let sub_map = subtree_inputs
118+
.iter()
119+
.map(|(arr, handle)| {
120+
let ptr = handle.cuda_device_ptr()?;
121+
Ok((Arc::as_ptr(arr) as *const () as usize, ptr))
122+
})
123+
.collect::<VortexResult<Vec<_>>>()?;
124+
105125
let mut state = PlanBuilderState {
106126
ctx,
107127
stages: Vec::new(),
108128
smem_cursor: 0,
109129
device_buffers: Vec::new(),
130+
subtree_inputs: sub_map,
110131
};
111132

112133
let pipeline = state.walk(array.clone())?;
@@ -129,6 +150,88 @@ pub fn build_plan(
129150
Ok((DynamicDispatchPlan::new(state.stages), state.device_buffers))
130151
}
131152

153+
/// Walk the encoding tree and find subtrees that cannot be fused into a
154+
/// dynamic-dispatch plan. The root of each returned subtree is a node that cannot
155+
/// be fused.
156+
///
157+
/// Returns an empty vec if the root itself cannot be fused.
158+
pub fn find_subtrees(array: &ArrayRef) -> Vec<ArrayRef> {
159+
if !is_dyn_dispatch_compatible(array) {
160+
return Vec::new();
161+
}
162+
let mut out = Vec::new();
163+
collect_subtrees(array, &mut out);
164+
out
165+
}
166+
167+
/// Checks whether the encoding of an array can be fused into a dynamic-dispatch plan.
168+
fn is_dyn_dispatch_compatible(array: &ArrayRef) -> bool {
169+
let id = array.encoding_id();
170+
if id == ALP::ID {
171+
if let Ok(a) = array.clone().try_into::<ALP>() {
172+
return a.patches().is_none() && a.dtype().as_ptype() == PType::F32;
173+
}
174+
return false;
175+
}
176+
if id == BitPacked::ID {
177+
if let Ok(a) = array.clone().try_into::<BitPacked>() {
178+
return a.patches().is_none();
179+
}
180+
return false;
181+
}
182+
id == FoR::ID
183+
|| id == ZigZag::ID
184+
|| id == Dict::ID
185+
|| id == RunEnd::ID
186+
|| id == Primitive::ID
187+
|| id == Slice::ID
188+
|| id == Sequence::ID
189+
}
190+
191+
/// Walk the children of a dynamic dispatch compatible root node. Any child
192+
/// that is not dyn dispatch compatible is recorded as a subtree that must be
193+
/// executed separately.
194+
fn collect_subtrees(array: &ArrayRef, out: &mut Vec<ArrayRef>) {
195+
let id = array.encoding_id();
196+
197+
fn visit_child(child: &ArrayRef, out: &mut Vec<ArrayRef>) {
198+
if is_dyn_dispatch_compatible(child) {
199+
collect_subtrees(child, out);
200+
} else {
201+
out.push(child.clone());
202+
}
203+
}
204+
205+
if id == FoR::ID {
206+
if let Ok(a) = array.clone().try_into::<FoR>() {
207+
visit_child(a.encoded(), out);
208+
}
209+
} else if id == ZigZag::ID {
210+
if let Ok(a) = array.clone().try_into::<ZigZag>() {
211+
visit_child(a.encoded(), out);
212+
}
213+
} else if id == ALP::ID {
214+
if let Ok(a) = array.clone().try_into::<ALP>() {
215+
visit_child(a.encoded(), out);
216+
}
217+
} else if id == Slice::ID {
218+
if let Some(a) = array.as_opt::<Slice>() {
219+
visit_child(a.child(), out);
220+
}
221+
} else if id == Dict::ID
222+
&& let Ok(a) = array.clone().try_into::<Dict>()
223+
{
224+
visit_child(a.values(), out);
225+
visit_child(a.codes(), out);
226+
} else if id == RunEnd::ID
227+
&& let Ok(a) = array.clone().try_into::<RunEnd>()
228+
{
229+
visit_child(a.ends(), out);
230+
visit_child(a.values(), out);
231+
}
232+
// BitPacked, Primitive, Sequence — leaves, no children.
233+
}
234+
132235
/// Internal mutable state for the recursive tree walk.
133236
struct PlanBuilderState<'a> {
134237
ctx: &'a CudaExecutionCtx,
@@ -138,11 +241,30 @@ struct PlanBuilderState<'a> {
138241
smem_cursor: u32,
139242
/// Device buffers to keep alive.
140243
device_buffers: Vec<BufferHandle>,
244+
/// Pre-executed subtree outputs injected as `LOAD` sources: `(identity, device_ptr)`.
245+
subtree_inputs: Vec<(usize, u64)>,
141246
}
142247

143248
impl PlanBuilderState<'_> {
249+
/// If `array` matches a pre-executed subtree input, return a `LOAD` pipeline pointing at its device buffer.
250+
fn find_subtree(&self, array: &ArrayRef) -> Option<Pipeline> {
251+
let subtree_id = Arc::as_ptr(array) as *const () as usize;
252+
self.subtree_inputs
253+
.iter()
254+
.find(|(id, _)| *id == subtree_id)
255+
.map(|(_, ptr)| Pipeline {
256+
source: SourceOp::load(),
257+
scalar_ops: vec![],
258+
input_ptr: *ptr,
259+
})
260+
}
261+
144262
/// Recursively walk the encoding tree.
145263
fn walk(&mut self, array: ArrayRef) -> VortexResult<Pipeline> {
264+
if let Some(pipeline) = self.find_subtree(&array) {
265+
return Ok(pipeline);
266+
}
267+
146268
let id = array.encoding_id();
147269

148270
if id == BitPacked::ID {

vortex-cuda/src/executor.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ use vortex::error::vortex_err;
3030

3131
use crate::CudaSession;
3232
use crate::ExportDeviceArray;
33+
use crate::hybrid_dispatch;
3334
use crate::kernel::DefaultLaunchStrategy;
3435
use crate::kernel::LaunchStrategy;
3536
use crate::kernel::LaunchStrategyExt;
@@ -265,6 +266,11 @@ impl CudaExecutionCtx {
265266
self.ctx.session()
266267
}
267268

269+
/// Returns a reference to the CUDA session.
270+
pub(crate) fn cuda_session(&self) -> &CudaSession {
271+
&self.cuda_session
272+
}
273+
268274
/// Get a handle to the exporter that can convert arrays into `ArrowDeviceArray`.
269275
pub fn exporter(&self) -> &Arc<dyn ExportDeviceArray> {
270276
self.cuda_session.export_device_array()
@@ -364,6 +370,19 @@ impl CudaArrayExt for ArrayRef {
364370
return self.execute(&mut ctx.ctx);
365371
}
366372

373+
// Try to fuse the encoding tree (or parts of it) into dynamic-dispatch
374+
// kernel launches. See hybrid_dispatch module docs for details.
375+
match hybrid_dispatch::try_dyn_dispatch(&self, ctx).await {
376+
Ok(canonical) => return Ok(canonical),
377+
Err(e) => {
378+
trace!(
379+
encoding = %self.encoding_id(),
380+
error = %e,
381+
"Hybrid dispatch not applicable, trying registered single kernel"
382+
);
383+
}
384+
}
385+
367386
let Some(support) = ctx.cuda_session.kernel(&self.encoding_id()) else {
368387
debug!(
369388
encoding = %self.encoding_id(),

0 commit comments

Comments
 (0)