Skip to content

Commit 91a6b57

Browse files
authored
feat(cuda): fuse narrower-than-output Dict codes and RunEnd ends (#7617)
Dict codes and RunEnd ends that are narrower than the output type (e.g. u8 BitPacked codes in a u32 Dict) previously required a separate kernel launch. They are now fused by decoding at the source's native width and widening to T in shared memory. --- Fixes the race from #7603 by applying the type widening with one warp within a block. Further this PR adds benchmarks exercising the widening logic: ``` Benchmarking dict_widen_u8_to_u32/dynamic_dispatch_u32/100M: Warming up for 1.0000 ns Warning: Unable to complete 10 samples in 1.0ns. You may wish to increase target time to 48.4ms. dict_widen_u8_to_u32/dynamic_dispatch_u32/100M time: [203.97 µs 204.37 µs 204.78 µs] thrpt: [1819.2 GiB/s 1822.8 GiB/s 1826.4 GiB/s] Benchmarking dict_widen_u16_to_u32/dynamic_dispatch_u32/100M: Warming up for 1.0000 ns Warning: Unable to complete 10 samples in 1.0ns. You may wish to increase target time to 50.4ms. dict_widen_u16_to_u32/dynamic_dispatch_u32/100M time: [203.74 µs 204.92 µs 205.15 µs] thrpt: [1815.9 GiB/s 1817.9 GiB/s 1828.5 GiB/s] Benchmarking dict_nowiden_u32_to_u32/dynamic_dispatch_u32/100M: Warming up for 1.0000 ns Warning: Unable to complete 10 samples in 1.0ns. You may wish to increase target time to 49.6ms. dict_nowiden_u32_to_u32/dynamic_dispatch_u32/100M time: [170.86 µs 171.18 µs 171.59 µs] thrpt: [2171.0 GiB/s 2176.2 GiB/s 2180.3 GiB/s] ``` Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 543dbe7 commit 91a6b57

5 files changed

Lines changed: 529 additions & 133 deletions

File tree

vortex-cuda/benches/dynamic_dispatch_cuda.rs

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,12 +425,148 @@ fn bench_alp_for_bitpacked(c: &mut Criterion) {
425425
group.finish();
426426
}
427427

428+
// ---------------------------------------------------------------------------
429+
// Benchmark: Dict with narrower BitPacked codes (exercises widen_inplace)
430+
// ---------------------------------------------------------------------------
431+
432+
/// Dict(codes=BitPacked<u8>, values=Prim<u32>) — widens u8 → u32 in smem.
433+
fn bench_dict_bp_u8_codes_u32_values(c: &mut Criterion) {
434+
let mut group = c.benchmark_group("dict_widen_u8_to_u32");
435+
436+
let dict_size: usize = 4; // 2-bit codes
437+
let bit_width: u8 = 2;
438+
let dict_values: Vec<u32> = (0..dict_size as u32).map(|i| i * 1000 + 42).collect();
439+
440+
for (len, len_str) in BENCH_ARGS {
441+
group.throughput(Throughput::Bytes((len * size_of::<u32>()) as u64));
442+
443+
let codes: Vec<u8> = (0..*len).map(|i| (i % dict_size) as u8).collect();
444+
let codes_prim = PrimitiveArray::new(Buffer::from(codes), NonNullable);
445+
let mut ctx = LEGACY_SESSION.create_execution_ctx();
446+
let codes_bp = BitPackedData::encode(&codes_prim.into_array(), bit_width, &mut ctx)
447+
.vortex_expect("bitpack u8 codes");
448+
let values_prim = PrimitiveArray::new(Buffer::from(dict_values.clone()), NonNullable);
449+
let dict = DictArray::new(codes_bp.into_array(), values_prim.into_array());
450+
let array = dict.into_array();
451+
452+
group.bench_with_input(
453+
BenchmarkId::new("dynamic_dispatch_u32", len_str),
454+
len,
455+
|b, &n| {
456+
let mut cuda_ctx =
457+
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
458+
459+
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
460+
461+
b.iter_custom(|iters| {
462+
let mut total_time = Duration::ZERO;
463+
for _ in 0..iters {
464+
total_time += bench_runner.run(&mut cuda_ctx);
465+
}
466+
total_time
467+
});
468+
},
469+
);
470+
}
471+
472+
group.finish();
473+
}
474+
475+
/// Dict(codes=BitPacked<u16>, values=Prim<u32>) — widens u16 → u32 in smem.
476+
fn bench_dict_bp_u16_codes_u32_values(c: &mut Criterion) {
477+
let mut group = c.benchmark_group("dict_widen_u16_to_u32");
478+
479+
let dict_size: usize = 8; // 3-bit codes
480+
let bit_width: u8 = 3;
481+
let dict_values: Vec<u32> = (0..dict_size as u32).map(|i| i * 5000 + 100).collect();
482+
483+
for (len, len_str) in BENCH_ARGS {
484+
group.throughput(Throughput::Bytes((len * size_of::<u32>()) as u64));
485+
486+
let codes: Vec<u16> = (0..*len).map(|i| (i % dict_size) as u16).collect();
487+
let codes_prim = PrimitiveArray::new(Buffer::from(codes), NonNullable);
488+
let mut ctx = LEGACY_SESSION.create_execution_ctx();
489+
let codes_bp = BitPackedData::encode(&codes_prim.into_array(), bit_width, &mut ctx)
490+
.vortex_expect("bitpack u16 codes");
491+
let values_prim = PrimitiveArray::new(Buffer::from(dict_values.clone()), NonNullable);
492+
let dict = DictArray::new(codes_bp.into_array(), values_prim.into_array());
493+
let array = dict.into_array();
494+
495+
group.bench_with_input(
496+
BenchmarkId::new("dynamic_dispatch_u32", len_str),
497+
len,
498+
|b, &n| {
499+
let mut cuda_ctx =
500+
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
501+
502+
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
503+
504+
b.iter_custom(|iters| {
505+
let mut total_time = Duration::ZERO;
506+
for _ in 0..iters {
507+
total_time += bench_runner.run(&mut cuda_ctx);
508+
}
509+
total_time
510+
});
511+
},
512+
);
513+
}
514+
515+
group.finish();
516+
}
517+
518+
/// Dict(codes=BitPacked<u32>, values=Prim<u32>) — same-width baseline, no widen.
519+
fn bench_dict_bp_u32_codes_u32_values(c: &mut Criterion) {
520+
let mut group = c.benchmark_group("dict_nowiden_u32_to_u32");
521+
522+
let dict_size: usize = 8; // 3-bit codes
523+
let bit_width: u8 = 3;
524+
let dict_values: Vec<u32> = (0..dict_size as u32).map(|i| i * 5000 + 100).collect();
525+
526+
for (len, len_str) in BENCH_ARGS {
527+
group.throughput(Throughput::Bytes((len * size_of::<u32>()) as u64));
528+
529+
let codes: Vec<u32> = (0..*len).map(|i| (i % dict_size) as u32).collect();
530+
let codes_prim = PrimitiveArray::new(Buffer::from(codes), NonNullable);
531+
let mut ctx = LEGACY_SESSION.create_execution_ctx();
532+
let codes_bp = BitPackedData::encode(&codes_prim.into_array(), bit_width, &mut ctx)
533+
.vortex_expect("bitpack u32 codes");
534+
let values_prim = PrimitiveArray::new(Buffer::from(dict_values.clone()), NonNullable);
535+
let dict = DictArray::new(codes_bp.into_array(), values_prim.into_array());
536+
let array = dict.into_array();
537+
538+
group.bench_with_input(
539+
BenchmarkId::new("dynamic_dispatch_u32", len_str),
540+
len,
541+
|b, &n| {
542+
let mut cuda_ctx =
543+
CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
544+
545+
let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
546+
547+
b.iter_custom(|iters| {
548+
let mut total_time = Duration::ZERO;
549+
for _ in 0..iters {
550+
total_time += bench_runner.run(&mut cuda_ctx);
551+
}
552+
total_time
553+
});
554+
},
555+
);
556+
}
557+
558+
group.finish();
559+
}
560+
428561
/// Entry point for the dynamic-dispatch benchmark group: runs every
/// individual benchmark in sequence under one Criterion session.
fn benchmark_dynamic_dispatch(c: &mut Criterion) {
    bench_for_bitpacked(c);
    bench_dict_bp_codes(c);
    bench_runend(c);
    bench_dict_bp_codes_bp_for_values(c);
    bench_alp_for_bitpacked(c);
    // Mixed-width Dict benchmarks: the first two exercise the in-smem
    // widening path (u8→u32, u16→u32); the third is the same-width
    // baseline for comparison.
    bench_dict_bp_u8_codes_u32_values(c);
    bench_dict_bp_u16_codes_u32_values(c);
    bench_dict_bp_u32_codes_u32_values(c);
}
435571

436572
criterion::criterion_group! {

vortex-cuda/kernels/src/dynamic_dispatch.cu

Lines changed: 106 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,16 @@
3939
//
4040
// ## Mixed-width support
4141
//
42-
// LOAD sources from pending subtrees may have a narrower type than the
43-
// output (e.g. u8 dict codes in a u32 plan). load_element() widens
44-
// to T via static_cast — no separate widen kernel or smem intermediate.
42+
// Dict codes, RunEnd ends, and other child arrays may have a narrower
43+
// element type than the output T. Two mechanisms handle this:
44+
//
45+
// LOAD load_element() dispatches on the per-stage PTypeTag to
46+
// read at the source's native width and static_cast to T.
47+
// BITUNPACK bitunpack_typed() unpacks at the source's native width,
48+
// then widens to T in-place via a backward scan
49+
// (widen_inplace). The smem region is pre-allocated at
50+
// max(source_width, T) bytes per element by the Rust plan
51+
// builder, so the widen never overflows.
4552

4653
#include <assert.h>
4754
#include <cuda.h>
@@ -203,6 +210,28 @@ scatter_patches_chunk(const GPUPatches &patches, T *__restrict out, uint32_t chu
203210
// Source ops
204211
// ═══════════════════════════════════════════════════════════════════════════
205212

213+
/// Widen SOURCE-sized elements in shared memory to DESTINATION-sized in-place.
214+
///
215+
/// A single warp performs the backward scan so that lockstep execution
216+
/// guarantees every load at index i retires before the store at i, and
217+
/// higher indices are already consumed. Using multiple warps would introduce
218+
/// a cross-warp race: a fast warp writing dst[low] can clobber source
219+
/// bytes that a slow warp has not yet read.
220+
template <typename DESTINATION, typename SOURCE>
221+
__device__ inline void widen_inplace(DESTINATION *dst, uint32_t len) {
222+
if constexpr (sizeof(DESTINATION) <= sizeof(SOURCE)) {
223+
return;
224+
}
225+
const SOURCE *src = reinterpret_cast<const SOURCE *>(dst);
226+
if (threadIdx.x < warpSize) {
227+
for (int32_t i = static_cast<int32_t>(len) - 1 - static_cast<int32_t>(threadIdx.x); i >= 0;
228+
i -= warpSize) {
229+
dst[i] = static_cast<DESTINATION>(src[i]);
230+
}
231+
}
232+
__syncthreads();
233+
}
234+
206235
/// FastLanes cooperative unpack — all threads in the block scatter-write
207236
/// decoded elements into `dst`. Caller must issue __syncthreads() before
208237
/// any thread reads from `dst`.
@@ -236,6 +265,68 @@ __device__ inline void bitunpack(const T *__restrict packed,
236265
}
237266
}
238267

268+
/// Dispatch bitunpack at the source's native element width, then widen
269+
/// to T in-place so all downstream scalar ops and smem consumers see
270+
/// T-sized elements. Falls back to the direct `bitunpack<T>` path when
271+
/// the source ptype already matches T. Issues __syncthreads() before
272+
/// returning on all paths.
273+
///
274+
/// Accepts explicit chunk_start / chunk_len so it works for both input
275+
/// stages (full decode with chunk_start=0, chunk_len=stage.len) and
276+
/// the output stage (tiled with varying chunk_start / chunk_len).
277+
template <typename T>
278+
__device__ inline void bitunpack_typed(T *__restrict dst,
279+
const void *__restrict packed,
280+
uint64_t chunk_start,
281+
uint32_t chunk_len,
282+
const struct SourceOp &src,
283+
PTypeTag source_ptype) {
284+
// Fast path: source width matches T — no widening needed.
285+
if (ptype_byte_width(source_ptype) == sizeof(T)) {
286+
bitunpack<T>(reinterpret_cast<const T *>(packed), dst, chunk_start, chunk_len, src);
287+
__syncthreads();
288+
return;
289+
}
290+
291+
// Compute total elements written by bitunpack (including alignment
292+
// padding) so widen_inplace covers the full scratch region.
293+
const uint32_t elem_off = src.params.bitunpack.element_offset;
294+
const uint32_t dst_off = (chunk_start + elem_off) % FL_CHUNK;
295+
const uint32_t n_chunks = (chunk_len + dst_off + FL_CHUNK - 1) / FL_CHUNK;
296+
const uint32_t total_elems = n_chunks * FL_CHUNK;
297+
298+
// Narrow source: unpack at native width, then widen to T.
299+
switch (source_ptype) {
300+
case PTYPE_U8:
301+
case PTYPE_I8: {
302+
auto *narrow = reinterpret_cast<uint8_t *>(dst);
303+
bitunpack<uint8_t>(reinterpret_cast<const uint8_t *>(packed), narrow, chunk_start, chunk_len, src);
304+
__syncthreads();
305+
widen_inplace<T, uint8_t>(dst, total_elems);
306+
break;
307+
}
308+
case PTYPE_U16:
309+
case PTYPE_I16: {
310+
auto *narrow = reinterpret_cast<uint16_t *>(dst);
311+
bitunpack<uint16_t>(reinterpret_cast<const uint16_t *>(packed), narrow, chunk_start, chunk_len, src);
312+
__syncthreads();
313+
widen_inplace<T, uint16_t>(dst, total_elems);
314+
break;
315+
}
316+
case PTYPE_U32:
317+
case PTYPE_I32:
318+
case PTYPE_F32: {
319+
auto *narrow = reinterpret_cast<uint32_t *>(dst);
320+
bitunpack<uint32_t>(reinterpret_cast<const uint32_t *>(packed), narrow, chunk_start, chunk_len, src);
321+
__syncthreads();
322+
widen_inplace<T, uint32_t>(dst, total_elems);
323+
break;
324+
}
325+
default:
326+
__builtin_unreachable();
327+
}
328+
}
329+
239330
/// Read N values from a source op into `out`.
240331
///
241332
/// Dispatches on `src.op_code` to handle each encoding:
@@ -354,16 +445,14 @@ __device__ void execute_output_stage(T *__restrict output,
354445
if (src.op_code == SourceOp::BITUNPACK) {
355446
chunk_len = bitunpack_tile_len(stage, block_len, elem_idx);
356447
T *scratch = reinterpret_cast<T *>(smem + stage.smem_byte_offset);
357-
bitunpack<T>(reinterpret_cast<const T *>(stage.input_ptr),
358-
scratch,
359-
block_start + elem_idx,
360-
chunk_len,
361-
src);
448+
bitunpack_typed<T>(scratch,
449+
reinterpret_cast<const void *>(stage.input_ptr),
450+
block_start + elem_idx,
451+
chunk_len,
452+
src,
453+
ptype);
362454
const uint32_t align = (block_start + elem_idx + src.params.bitunpack.element_offset) % FL_CHUNK;
363455
smem_src = scratch + align;
364-
// Write barrier: all threads finished bitunpack (and any
365-
// patches), safe to read from scratch.
366-
__syncthreads();
367456
} else {
368457
chunk_len = block_len;
369458
}
@@ -438,11 +527,12 @@ __device__ void execute_input_stage(const Stage &stage, char *__restrict smem) {
438527
const auto &src = stage.source;
439528

440529
if (src.op_code == SourceOp::BITUNPACK) {
441-
T *raw_smem = smem_out;
442-
bitunpack<T>(reinterpret_cast<const T *>(stage.input_ptr), smem_out, 0, stage.len, src);
443-
// Write barrier: cooperative bitunpack finished, safe to read
444-
// decoded elements below.
445-
__syncthreads();
530+
bitunpack_typed<T>(smem_out,
531+
reinterpret_cast<const void *>(stage.input_ptr),
532+
0,
533+
stage.len,
534+
src,
535+
stage.source_ptype);
446536

447537
smem_out += src.params.bitunpack.element_offset % SMEM_TILE_SIZE;
448538

vortex-cuda/kernels/src/dynamic_dispatch.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,27 @@ PTYPE_HOST_DEVICE constexpr PTypeTag ptype_to_unsigned(PTypeTag tag) {
7878
return tag;
7979
}
8080
}
81+
82+
/// Byte width of the primitive element type identified by `tag`
/// (1, 2, 4, or 8), or 0 for any tag not listed below.
PTYPE_HOST_DEVICE constexpr uint8_t ptype_byte_width(PTypeTag tag) {
  if (tag == PTYPE_U8 || tag == PTYPE_I8) {
    return 1;
  }
  if (tag == PTYPE_U16 || tag == PTYPE_I16) {
    return 2;
  }
  if (tag == PTYPE_U32 || tag == PTYPE_I32 || tag == PTYPE_F32) {
    return 4;
  }
  if (tag == PTYPE_U64 || tag == PTYPE_I64 || tag == PTYPE_F64) {
    return 8;
  }
  // Unknown / non-fixed-width tag.
  return 0;
}
81102
#endif
82103

83104
/// Number of threads per CUDA block.

0 commit comments

Comments
 (0)