refactor: merge upload_patches, uniform match pattern for source/scalar patches

0ax1 · 0ax1 · commit 8c7987fdd506 · 2026-04-21T15:13:50.000Z
Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.cu b/vortex-cuda/kernels/src/dynamic_dispatch.cu
@@ -444,8 +444,6 @@ __device__ void execute_input_stage(const Stage &stage, char *__restrict smem) {
         // decoded elements below.
         __syncthreads();
 
-        // BP patches are already applied inside bitunpack().
-
         smem_out += src.params.bitunpack.element_offset % SMEM_TILE_SIZE;
 
         if (stage.num_scalar_ops > 0) {
diff --git a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs
@@ -48,9 +48,7 @@ use super::ptype_to_tag;
 use super::tag_to_ptype;
 use crate::CudaBufferExt;
 use crate::CudaExecutionCtx;
-use crate::kernel::DevicePatches;
-use crate::kernel::load_patches_sync;
-use crate::kernel::upload_gpu_patches;
+use crate::kernel::upload_patches;
 
 /// A plan whose source buffers have been copied to the device, ready for kernel launch.
 pub struct MaterializedPlan {
@@ -386,37 +384,26 @@ impl FusedPlan {
         for (stage, smem_byte_offset, len) in &self.stages {
             let mut source = stage.source;
 
-            // Upload BitPacked patches as a GPUPatches struct if present.
+            // Upload source patches (e.g. BitPacked exceptions).
             if let Some(patches) = &stage.source_patches {
-                let device_patches = load_patches_sync(patches, ctx)?;
-                let (gpu_buf, ptr) = upload_gpu_patches(&device_patches, ctx)?;
-                source.params.bitunpack.patches_ptr = ptr;
-                // Keep the underlying data buffers and the GPUPatches struct alive.
-                let DevicePatches {
-                    chunk_offsets,
-                    indices,
-                    values,
-                    ..
-                } = device_patches;
-                device_buffers.extend([chunk_offsets, indices, values, gpu_buf]);
+                let (ptr, bufs) = upload_patches(patches, ctx)?;
+                match source.op_code {
+                    SourceOp_SourceOpCode_BITUNPACK => source.params.bitunpack.patches_ptr = ptr,
+                    _ => unreachable!("patches on unsupported source op"),
+                }
+                device_buffers.extend(bufs);
             }
 
             // Upload patches for each scalar op that carries them.
             let mut scalar_ops: Vec<ScalarOp> = Vec::with_capacity(stage.scalar_ops.len());
             for (mut op, patches) in stage.scalar_ops.clone() {
                 if let Some(patches) = &patches {
-                    let device_patches = load_patches_sync(patches, ctx)?;
-                    let (gpu_buf, ptr) = upload_gpu_patches(&device_patches, ctx)?;
-                    if op.op_code == ScalarOp_ScalarOpCode_ALP {
-                        op.params.alp.patches_ptr = ptr;
+                    let (ptr, bufs) = upload_patches(&patches, ctx)?;
+                    match op.op_code {
+                        ScalarOp_ScalarOpCode_ALP => op.params.alp.patches_ptr = ptr,
+                        _ => unreachable!("patches on unsupported scalar op"),
                     }
-                    let DevicePatches {
-                        chunk_offsets,
-                        indices,
-                        values,
-                        ..
-                    } = device_patches;
-                    device_buffers.extend([chunk_offsets, indices, values, gpu_buf]);
+                    device_buffers.extend(bufs);
                 }
                 scalar_ops.push(op);
             }
diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs
@@ -34,9 +34,7 @@ pub use encodings::ZstdKernelPrep;
 pub use encodings::zstd_kernel_prepare;
 pub(crate) use encodings::*;
 pub(crate) use filter::FilterExecutor;
-pub(crate) use patches::types::DevicePatches;
-pub(crate) use patches::types::load_patches_sync;
-pub(crate) use patches::types::upload_gpu_patches;
+pub(crate) use patches::types::upload_patches;
 pub(crate) use slice::SliceExecutor;
 
 use crate::CudaKernelEvents;
diff --git a/vortex-cuda/src/kernel/patches/types.rs b/vortex-cuda/src/kernel/patches/types.rs
@@ -137,15 +137,19 @@ pub(crate) fn ptype_to_chunk_offset_type(ptype: PType) -> VortexResult<ChunkOffs
 ///
 /// Canonicalization is done on the CPU via [`LEGACY_SESSION`], then the
 /// resulting host buffers are uploaded to the device.
+/// The data buffers and a [`GPUPatches`] struct pointing at them are uploaded
+/// in one step. Returns the device pointer to the `GPUPatches` struct and a
+/// vec of buffer handles that must be kept alive for the duration of the
+/// kernel launch.
 ///
 /// # Errors
 ///
 /// If the patches do not have `chunk_offsets`.
 #[allow(deprecated)]
-pub(crate) fn load_patches_sync(
+pub(crate) fn upload_patches(
     patches: &Patches,
     ctx: &CudaExecutionCtx,
-) -> VortexResult<DevicePatches> {
+) -> VortexResult<(u64, Vec<BufferHandle>)> {
     let offset = patches.offset();
     let offset_within_chunk = patches.offset_within_chunk().unwrap_or_default();
 
@@ -159,6 +163,7 @@ pub(crate) fn load_patches_sync(
     // Canonicalize chunk_offsets on the CPU
     let co_canonical = co.clone().execute::<PrimitiveArray>(&mut exec_ctx)?;
     let chunk_offset_ptype = co_canonical.ptype();
+    let n_chunks = co_canonical.len();
     let chunk_offsets = co_canonical.buffer_handle().clone();
 
     // Canonicalize indices and convert to u32
@@ -187,52 +192,29 @@ pub(crate) fn load_patches_sync(
         .execute::<PrimitiveArray>(&mut exec_ctx)?;
     let values = values_prim.buffer_handle().clone();
 
-    // Upload all buffers to the device
+    // Upload data buffers to the device
     let chunk_offsets = ctx.ensure_on_device_sync(chunk_offsets)?;
     let indices = ctx.ensure_on_device_sync(indices)?;
     let values = ctx.ensure_on_device_sync(values)?;
 
-    let num_patches = patches.num_patches();
-    // n_chunks must match the chunk_offsets array length, not array_len / 1024.
-    // When patches are sliced, chunk_offsets is sliced to only include chunks
-    // overlapping the slice range — matching the CPU's patch_chunk which uses
-    // chunk_offsets_slice.len().
-    let n_chunks = co_canonical.len();
-
-    Ok(DevicePatches {
-        chunk_offsets,
-        chunk_offset_ptype,
-        indices,
-        values,
-        offset,
-        offset_within_chunk,
-        num_patches,
-        n_chunks,
-    })
-}
-
-/// Upload a [`GPUPatches`] struct to the device, returning the buffer handle
-/// (which must be kept alive) and the device pointer to the struct.
-///
-/// The caller must also keep the [`DevicePatches`] alive for the duration of
-/// the kernel launch, since the `GPUPatches` struct contains device pointers
-/// into the individual buffers owned by `DevicePatches`.
-pub(crate) fn upload_gpu_patches(
-    device_patches: &DevicePatches,
-    ctx: &CudaExecutionCtx,
-) -> VortexResult<(BufferHandle, u64)> {
+    // Build the GPUPatches C struct from device pointers.
     // Zero-initialize to avoid uninitialized padding bytes.
     let mut gpu_patches: GPUPatches = unsafe { std::mem::zeroed() };
-    gpu_patches.chunk_offsets = device_patches.chunk_offsets.cuda_device_ptr()? as _;
-    gpu_patches.chunk_offset_type = ptype_to_chunk_offset_type(device_patches.chunk_offset_ptype)?;
-    gpu_patches.indices = device_patches.indices.cuda_device_ptr()? as _;
-    gpu_patches.values = device_patches.values.cuda_device_ptr()? as _;
+    gpu_patches.chunk_offsets = chunk_offsets.cuda_device_ptr()? as _;
+    gpu_patches.chunk_offset_type = ptype_to_chunk_offset_type(chunk_offset_ptype)?;
+    gpu_patches.indices = indices.cuda_device_ptr()? as _;
+    gpu_patches.values = values.cuda_device_ptr()? as _;
+    let num_patches = patches.num_patches();
     #[expect(clippy::cast_possible_truncation)]
     {
-        gpu_patches.offset = device_patches.offset as u32;
-        gpu_patches.offset_within_chunk = device_patches.offset_within_chunk as u32;
-        gpu_patches.num_patches = device_patches.num_patches as u32;
-        gpu_patches.n_chunks = device_patches.n_chunks as u32;
+        gpu_patches.offset = offset as u32;
+        gpu_patches.offset_within_chunk = offset_within_chunk as u32;
+        gpu_patches.num_patches = num_patches as u32;
+        // n_chunks must match the chunk_offsets array length, not array_len / 1024.
+        // When patches are sliced, chunk_offsets is sliced to only include chunks
+        // overlapping the slice range — matching the CPU's patch_chunk which uses
+        // chunk_offsets_slice.len().
+        gpu_patches.n_chunks = n_chunks as u32;
     }
 
     // Serialize the repr(C) struct to bytes and upload to the device.
@@ -242,14 +224,13 @@ pub(crate) fn upload_gpu_patches(
             size_of::<GPUPatches>(),
         )
     };
-
     let mut buf =
         ByteBufferMut::with_capacity_aligned(size_of::<GPUPatches>(), Alignment::of::<u64>());
     buf.extend_from_slice(bytes);
-    let host_buf = BufferHandle::new_host(buf.freeze());
-    let device_buf = ctx.ensure_on_device_sync(host_buf)?;
-    let ptr = device_buf.cuda_device_ptr()?;
-    Ok((device_buf, ptr))
+    let gpu_buf = ctx.ensure_on_device_sync(BufferHandle::new_host(buf.freeze()))?;
+    let ptr = gpu_buf.cuda_device_ptr()?;
+
+    Ok((ptr, vec![chunk_offsets, indices, values, gpu_buf]))
 }
 
 #[cfg(test)]