1717//!
1818//! A "subtree" is a branch with a root node that is not dyn dispatch compatible
1919//! (below a compatible parent). It is executed via `execute_cuda`, which
20- //! re-enters `try_dyn_dispatch ` so compatible descendants still get fused.
20+ //! re-enters `try_gpu_dispatch` so compatible descendants still get fused.
2121//!
2222//! Strategies tried in order:
2323//!
2424//! 1. Fully fused — no subtrees, whole tree is one `DynamicDispatchPlan`.
2525//!
2626//! 2. Partial fusion — subtrees are executed first (sequentially, same
2727//! stream), their device buffers become `LOAD` ops in a fused plan.
28- //! Each subtree re-enters `try_dyn_dispatch ` and may itself fuse.
28+ //! Each subtree re-enters `try_gpu_dispatch` and may itself fuse.
2929//!
3030//! 3. Fallback — root is not compatible. Delegate to its registered
31- //! `CudaExecute` kernel; its children re-enter `try_dyn_dispatch `.
31+ //! `CudaExecute` kernel; its children re-enter `try_gpu_dispatch`.
3232//!
3333//! All three compose recursively to arbitrary depth.
3434//!
3838//! ```text
3939//! ZonedReader (zone-map pruning, skips whole chunks)
4040//! └── CudaFlatReader (per chunk)
41- //! └── try_dyn_dispatch (fused decompression)
41+ //! └── try_gpu_dispatch
4242//! └── FilterExecutor (CUB DeviceSelect on full output)
4343//! ```
4444
@@ -58,13 +58,20 @@ use crate::dynamic_dispatch::plan_builder::find_subtrees;
5858use crate :: executor:: CudaArrayExt ;
5959use crate :: executor:: CudaExecutionCtx ;
6060
61- /// Try to execute `array` via dynamic dispatch, fusing as much of the
62- /// encoding tree as possible into single kernel launches and falling back
63- /// to individual kernels for nodes not compatible with dynamic dispatch.
61+ /// Try to execute `array` on the GPU, attempting three strategies in order:
62+ ///
63+ /// 1. Fully fused — the entire encoding tree compiles into one
64+ /// `DynamicDispatchPlan` kernel launch.
65+ /// 2. Partially fused — incompatible subtrees are executed first
66+ /// (via recursive `execute_cuda`), then the remaining compatible tree
67+ /// is fused into a single plan with their outputs as `LOAD` sources.
68+ /// 3. Single-kernel fallback — the root encoding's registered
69+ /// `CudaExecute` kernel handles one layer; its children re-enter
70+ /// this function recursively.
6471///
6572/// Returns `Ok(Canonical)` on success. Returns `Err` when the array
6673/// cannot be handled (non-primitive output dtype, no registered kernel).
67- pub async fn try_dyn_dispatch (
74+ pub async fn try_gpu_dispatch (
6875 array : & ArrayRef ,
6976 ctx : & mut CudaExecutionCtx ,
7077) -> VortexResult < Canonical > {
@@ -85,11 +92,15 @@ pub async fn try_dyn_dispatch(
8592 debug ! ( encoding = %array. encoding_id( ) , num_stages = plan. num_stages, "fully-fused dyn dispatch" ) ;
8693 return plan. execute ( output_ptype, array. len ( ) , bufs, ctx) ;
8794 }
88- } else if let Some ( result) = try_partial_fuse ( array, & subtrees, output_ptype, ctx) . await ? {
95+ } else if let Some ( result) =
96+ // Incompatible subtrees are executed first (re-entering try_gpu_dispatch),
97+ // then their device buffers are injected as LOAD sources into a fused plan.
98+ try_partial_fuse ( array, & subtrees, output_ptype, ctx) . await ?
99+ {
89100 return Ok ( result) ;
90101 }
91102
92- // Pure fallback — single kernel, children re-enter try_dyn_dispatch .
103+ // Single-kernel fallback; children re-enter `try_gpu_dispatch`.
93104 ctx. cuda_session ( )
94105 . kernel ( & array. encoding_id ( ) )
95106 . ok_or_else ( || vortex_err ! ( "No CUDA kernel for encoding {:?}" , array. encoding_id( ) ) ) ?
0 commit comments