Skip to content

Commit 56b0731

Browse files
authored
Revert #7603 (#7613)
This reverts commit 09ae129. ## Summary Reverts #7603: restores the mixed-width bitunpack path (`bitunpack_typed` / `widen_inplace` / `ptype_byte_width`) removal, replacing it with direct `bitunpack<T>` calls where LOAD widens narrow sources via `static_cast` in `load_element()`.
1 parent 09ae129 commit 56b0731

4 files changed

Lines changed: 133 additions & 387 deletions

File tree

vortex-cuda/kernels/src/dynamic_dispatch.cu

Lines changed: 16 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,9 @@
3939
//
4040
// ## Mixed-width support
4141
//
42-
// Dict codes, RunEnd ends, and other child arrays may have a narrower
43-
// element type than the output T. Two mechanisms handle this:
44-
//
45-
// LOAD load_element() dispatches on the per-stage PTypeTag to
46-
// read at the source's native width and static_cast to T.
47-
// BITUNPACK bitunpack_typed() unpacks at the source's native width,
48-
// then widens to T in-place via a backward scan
49-
// (widen_inplace). The smem region is pre-allocated at
50-
// max(source_width, T) bytes per element by the Rust plan
51-
// builder, so the widen never overflows.
42+
// LOAD sources from pending subtrees may have a narrower type than the
43+
// output (e.g. u8 dict codes in a u32 plan). load_element() widens
44+
// to T via static_cast — no separate widen kernel or smem intermediate.
5245

5346
#include <assert.h>
5447
#include <cuda.h>
@@ -210,22 +203,6 @@ scatter_patches_chunk(const GPUPatches &patches, T *__restrict out, uint32_t chu
210203
// Source ops
211204
// ═══════════════════════════════════════════════════════════════════════════
212205

213-
/// Widen U-sized elements to T-sized elements in-place within the same
/// shared-memory region. A no-op instantiation (no loop, no barrier) when
/// T is not strictly wider than U.
///
/// The scan runs from the highest index downward so that the wider write
/// at index i lands only on bytes at or beyond those of src[i]
/// (sizeof(T) >= sizeof(U) implies i*sizeof(T) >= i*sizeof(U)).
/// NOTE(review): the no-overwrite argument is per-index; whether it also
/// holds across concurrently-progressing threads of the block (no barrier
/// between per-thread strides) should be confirmed against the callers'
/// element layout.
template <typename T, typename U>
__device__ inline void widen_inplace(T *dst, uint32_t len) {
  if constexpr (sizeof(T) > sizeof(U)) {
    const U *narrow = reinterpret_cast<const U *>(dst);
    // Each thread walks its own stride of indices, highest first.
    for (int32_t idx = static_cast<int32_t>(len) - 1 - static_cast<int32_t>(threadIdx.x);
         idx >= 0;
         idx -= static_cast<int32_t>(blockDim.x)) {
      dst[idx] = static_cast<T>(narrow[idx]);
    }
    // All widened elements visible to the block before anyone reads them.
    __syncthreads();
  }
}
228-
229206
/// FastLanes cooperative unpack — all threads in the block scatter-write
230207
/// decoded elements into `dst`. Caller must issue __syncthreads() before
231208
/// any thread reads from `dst`.
@@ -259,68 +236,6 @@ __device__ inline void bitunpack(const T *__restrict packed,
259236
}
260237
}
261238

262-
/// Dispatch bitunpack at the source's native element width, then widen
/// to T in-place so all downstream scalar ops and smem consumers see
/// T-sized elements. Falls back to the direct `bitunpack<T>` path when
/// the source ptype already matches T. Issues __syncthreads() before
/// returning on all paths, so callers may read `dst` immediately.
///
/// Accepts explicit chunk_start / chunk_len so it works for both input
/// stages (full decode with chunk_start=0, chunk_len=stage.len) and
/// the output stage (tiled with varying chunk_start / chunk_len).
///
/// Precondition: `dst` must hold max(source width, sizeof(T)) bytes per
/// element — presumably guaranteed by the plan builder's smem sizing;
/// TODO confirm against the Rust side.
template <typename T>
__device__ inline void bitunpack_typed(T *__restrict dst,
                                       const void *__restrict packed,
                                       uint64_t chunk_start,
                                       uint32_t chunk_len,
                                       const struct SourceOp &src,
                                       PTypeTag source_ptype) {
  // Fast path: source width matches T — no widening needed.
  if (ptype_byte_width(source_ptype) == sizeof(T)) {
    bitunpack<T>(reinterpret_cast<const T *>(packed), dst, chunk_start, chunk_len, src);
    __syncthreads();
    return;
  }

  // Compute total elements written by bitunpack (including alignment
  // padding) so widen_inplace covers the full scratch region: the decode
  // is rounded out to whole FL_CHUNK-sized chunks around the requested
  // [chunk_start, chunk_start + chunk_len) window.
  const uint32_t elem_off = src.params.bitunpack.element_offset;
  const uint32_t dst_off = (chunk_start + elem_off) % FL_CHUNK;
  const uint32_t n_chunks = (chunk_len + dst_off + FL_CHUNK - 1) / FL_CHUNK;
  const uint32_t total_elems = n_chunks * FL_CHUNK;

  // Narrow source: unpack at native width, then widen to T. Each case
  // syncs between the cooperative unpack (write) and the widen (read of
  // the same smem bytes); widen_inplace issues its own trailing barrier.
  switch (source_ptype) {
  case PTYPE_U8:
  case PTYPE_I8: {
    auto *narrow = reinterpret_cast<uint8_t *>(dst);
    bitunpack<uint8_t>(reinterpret_cast<const uint8_t *>(packed), narrow, chunk_start, chunk_len, src);
    __syncthreads();
    widen_inplace<T, uint8_t>(dst, total_elems);
    break;
  }
  case PTYPE_U16:
  case PTYPE_I16: {
    auto *narrow = reinterpret_cast<uint16_t *>(dst);
    bitunpack<uint16_t>(reinterpret_cast<const uint16_t *>(packed), narrow, chunk_start, chunk_len, src);
    __syncthreads();
    widen_inplace<T, uint16_t>(dst, total_elems);
    break;
  }
  case PTYPE_U32:
  case PTYPE_I32:
  case PTYPE_F32: {
    auto *narrow = reinterpret_cast<uint32_t *>(dst);
    bitunpack<uint32_t>(reinterpret_cast<const uint32_t *>(packed), narrow, chunk_start, chunk_len, src);
    __syncthreads();
    widen_inplace<T, uint32_t>(dst, total_elems);
    break;
  }
  default:
    // Width mismatch with a 64-bit (or invalid) source cannot occur:
    // the fast path above handles equal widths, and T is at least as
    // wide as the source by construction of the plan.
    __builtin_unreachable();
  }
}
323-
324239
/// Read N values from a source op into `out`.
325240
///
326241
/// Dispatches on `src.op_code` to handle each encoding:
@@ -439,14 +354,16 @@ __device__ void execute_output_stage(T *__restrict output,
439354
if (src.op_code == SourceOp::BITUNPACK) {
440355
chunk_len = bitunpack_tile_len(stage, block_len, elem_idx);
441356
T *scratch = reinterpret_cast<T *>(smem + stage.smem_byte_offset);
442-
bitunpack_typed<T>(scratch,
443-
reinterpret_cast<const void *>(stage.input_ptr),
444-
block_start + elem_idx,
445-
chunk_len,
446-
src,
447-
ptype);
357+
bitunpack<T>(reinterpret_cast<const T *>(stage.input_ptr),
358+
scratch,
359+
block_start + elem_idx,
360+
chunk_len,
361+
src);
448362
const uint32_t align = (block_start + elem_idx + src.params.bitunpack.element_offset) % FL_CHUNK;
449363
smem_src = scratch + align;
364+
// Write barrier: all threads finished bitunpack (and any
365+
// patches), safe to read from scratch.
366+
__syncthreads();
450367
} else {
451368
chunk_len = block_len;
452369
}
@@ -521,12 +438,11 @@ __device__ void execute_input_stage(const Stage &stage, char *__restrict smem) {
521438
const auto &src = stage.source;
522439

523440
if (src.op_code == SourceOp::BITUNPACK) {
524-
bitunpack_typed<T>(smem_out,
525-
reinterpret_cast<const void *>(stage.input_ptr),
526-
0,
527-
stage.len,
528-
src,
529-
stage.source_ptype);
441+
T *raw_smem = smem_out;
442+
bitunpack<T>(reinterpret_cast<const T *>(stage.input_ptr), smem_out, 0, stage.len, src);
443+
// Write barrier: cooperative bitunpack finished, safe to read
444+
// decoded elements below.
445+
__syncthreads();
530446

531447
smem_out += src.params.bitunpack.element_offset % SMEM_TILE_SIZE;
532448

vortex-cuda/kernels/src/dynamic_dispatch.h

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -78,27 +78,6 @@ PTYPE_HOST_DEVICE constexpr PTypeTag ptype_to_unsigned(PTypeTag tag) {
7878
return tag;
7979
}
8080
}
81-
82-
/// Byte width of the physical element type identified by `tag`.
/// Returns 0 for tags outside the known integer/float set.
PTYPE_HOST_DEVICE constexpr uint8_t ptype_byte_width(PTypeTag tag) {
  if (tag == PTYPE_U8 || tag == PTYPE_I8) {
    return 1;
  }
  if (tag == PTYPE_U16 || tag == PTYPE_I16) {
    return 2;
  }
  if (tag == PTYPE_U32 || tag == PTYPE_I32 || tag == PTYPE_F32) {
    return 4;
  }
  if (tag == PTYPE_U64 || tag == PTYPE_I64 || tag == PTYPE_F64) {
    return 8;
  }
  return 0;
}
10281
#endif
10382

10483
/// Number of threads per CUDA block.

0 commit comments

Comments
 (0)