Skip to content

Commit caf4566

Browse files
committed
docs: clarify GPU dispatch flow
Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 0b75700 commit caf4566

2 files changed

Lines changed: 28 additions & 29 deletions

File tree

vortex-cuda/src/executor.rs

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -370,32 +370,20 @@ impl CudaArrayExt for ArrayRef {
370370
return self.execute(&mut ctx.ctx);
371371
}
372372

373-
// Try to fuse the encoding tree (or parts of it) into dynamic-dispatch
374-
// kernel launches. See hybrid_dispatch module docs for details.
375-
match hybrid_dispatch::try_dyn_dispatch(&self, ctx).await {
373+
// Try all GPU execution strategies: fused dynamic dispatch, partial
374+
// fusion with subtree fallbacks, and single-kernel fallback.
375+
// If none succeed, fall back to CPU execution.
376+
match hybrid_dispatch::try_gpu_dispatch(&self, ctx).await {
376377
Ok(canonical) => return Ok(canonical),
377378
Err(e) => {
378-
trace!(
379+
debug!(
379380
encoding = %self.encoding_id(),
380381
error = %e,
381-
"Hybrid dispatch not applicable, trying registered single kernel"
382+
"No GPU execution path available, falling back to CPU"
382383
);
383384
}
384385
}
385386

386-
let Some(support) = ctx.cuda_session.kernel(&self.encoding_id()) else {
387-
debug!(
388-
encoding = %self.encoding_id(),
389-
"No CUDA support registered for encoding, falling back to CPU execution"
390-
);
391-
return self.execute(&mut ctx.ctx);
392-
};
393-
394-
debug!(
395-
encoding = %self.encoding_id(),
396-
"Executing array on CUDA device"
397-
);
398-
399-
support.execute(self, ctx).await
387+
self.execute(&mut ctx.ctx)
400388
}
401389
}

vortex-cuda/src/hybrid_dispatch/mod.rs

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,18 @@
1717
//!
1818
//! A "subtree" is a branch whose root node is not dyn-dispatch compatible
1919
//! (below a compatible parent). It is executed via `execute_cuda`, which
20-
//! re-enters `try_dyn_dispatch` so compatible descendants still get fused.
20+
//! re-enters `try_gpu_dispatch` so compatible descendants still get fused.
2121
//!
2222
//! Strategies tried in order:
2323
//!
2424
//! 1. Fully fused — no subtrees, whole tree is one `DynamicDispatchPlan`.
2525
//!
2626
//! 2. Partial fusion — subtrees are executed first (sequentially, same
2727
//! stream), their device buffers become `LOAD` ops in a fused plan.
28-
//! Each subtree re-enters `try_dyn_dispatch` and may itself fuse.
28+
//! Each subtree re-enters `try_gpu_dispatch` and may itself fuse.
2929
//!
3030
//! 3. Fallback — root is not compatible. Delegate to its registered
31-
//! `CudaExecute` kernel; its children re-enter `try_dyn_dispatch`.
31+
//! `CudaExecute` kernel; its children re-enter `try_gpu_dispatch`.
3232
//!
3333
//! All three compose recursively to arbitrary depth.
3434
//!
@@ -38,7 +38,7 @@
3838
//! ```text
3939
//! ZonedReader (zone-map pruning, skips whole chunks)
4040
//! └── CudaFlatReader (per chunk)
41-
//! └── try_dyn_dispatch (fused decompression)
41+
//! └── try_gpu_dispatch
4242
//! └── FilterExecutor (CUB DeviceSelect on full output)
4343
//! ```
4444
@@ -58,13 +58,20 @@ use crate::dynamic_dispatch::plan_builder::find_subtrees;
5858
use crate::executor::CudaArrayExt;
5959
use crate::executor::CudaExecutionCtx;
6060

61-
/// Try to execute `array` via dynamic dispatch, fusing as much of the
62-
/// encoding tree as possible into single kernel launches and falling back
63-
/// to individual kernels for nodes not compatible with dynamic dispatch.
61+
/// Try to execute `array` on the GPU, attempting three strategies in order:
62+
///
63+
/// 1. Fully fused — the entire encoding tree compiles into one
64+
/// `DynamicDispatchPlan` kernel launch.
65+
/// 2. Partially fused — incompatible subtrees are executed first
66+
/// (via recursive `execute_cuda`), then the remaining compatible tree
67+
/// is fused into a single plan with their outputs as `LOAD` sources.
68+
/// 3. Single-kernel fallback — the root encoding's registered
69+
/// `CudaExecute` kernel handles one layer; its children re-enter
70+
/// this function recursively.
6471
///
6572
/// Returns `Ok(Canonical)` on success. Returns `Err` when the array
6673
/// cannot be handled (non-primitive output dtype or no registered kernel).
67-
pub async fn try_dyn_dispatch(
74+
pub async fn try_gpu_dispatch(
6875
array: &ArrayRef,
6976
ctx: &mut CudaExecutionCtx,
7077
) -> VortexResult<Canonical> {
@@ -85,11 +92,15 @@ pub async fn try_dyn_dispatch(
8592
debug!(encoding = %array.encoding_id(), num_stages = plan.num_stages, "fully-fused dyn dispatch");
8693
return plan.execute(output_ptype, array.len(), bufs, ctx);
8794
}
88-
} else if let Some(result) = try_partial_fuse(array, &subtrees, output_ptype, ctx).await? {
95+
} else if let Some(result) =
96+
// Incompatible subtrees are executed first (re-entering try_gpu_dispatch),
97+
// then their device buffers are injected as LOAD sources into a fused plan.
98+
try_partial_fuse(array, &subtrees, output_ptype, ctx).await?
99+
{
89100
return Ok(result);
90101
}
91102

92-
// Pure fallback — single kernel, children re-enter try_dyn_dispatch.
103+
// Single-kernel fallback: children will re-enter `try_gpu_dispatch`.
93104
ctx.cuda_session()
94105
.kernel(&array.encoding_id())
95106
.ok_or_else(|| vortex_err!("No CUDA kernel for encoding {:?}", array.encoding_id()))?

0 commit comments

Comments
 (0)