Skip to content

Commit 830491d

Browse files
TimDettmers and claude
committed
Add scatter/gather CUDA kernels for MoE batched pipeline
Scatter copies packed FP4 from concatenated token layout to padded per-expert batched layout with zero-filling. Gather copies BF16 results back. Both use one threadblock per expert with vectorized uint4 loads/stores. Includes C entry points, op definitions, registered kernels, and convenience functions. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 381f611 commit 830491d

File tree

5 files changed

+346
-0
lines changed

5 files changed

+346
-0
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,7 @@ if(BUILD_CUDA)
343343
add_library(nvfp4_common OBJECT
344344
csrc/qutlass/scale_reorder.cu
345345
csrc/qutlass/fused_quantize_nv.cu
346+
csrc/qutlass/moe_scatter_gather.cu
346347
)
347348
set_target_properties(nvfp4_common PROPERTIES
348349
CUDA_ARCHITECTURES "${_NVFP4_COMMON_ARCHS}"

bitsandbytes/_ops.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,44 @@ def _(blocked_scales: torch.Tensor, H: int, W: int) -> torch.Tensor:
540540
return torch.empty(H * W, dtype=torch.uint8, device=blocked_scales.device)
541541

542542

543+
# MoE scatter: concatenated FP4 → padded per-expert batched FP4
torch.library.define(
    "bitsandbytes::moe_scatter_nvfp4",
    "(Tensor packed_concat, Tensor expert_offsets, int max_M, int K, int num_experts) -> Tensor",
)


@register_fake("bitsandbytes::moe_scatter_nvfp4")
def _(
    packed_concat: torch.Tensor,
    expert_offsets: torch.Tensor,
    max_M: int,
    K: int,
    num_experts: int,
) -> torch.Tensor:
    # Packed FP4 holds two values per byte, so each row occupies K // 2 bytes.
    out_numel = num_experts * max_M * (K // 2)
    return torch.empty(out_numel, dtype=torch.uint8, device=packed_concat.device)
560+
561+
562+
# MoE gather: padded per-expert BF16 → concatenated BF16
torch.library.define(
    "bitsandbytes::moe_gather_bf16",
    "(Tensor D_batched, Tensor expert_offsets, int max_M, int N, int num_experts, int total_tokens) -> Tensor",
)


@register_fake("bitsandbytes::moe_gather_bf16")
def _(
    D_batched: torch.Tensor,
    expert_offsets: torch.Tensor,
    max_M: int,
    N: int,
    num_experts: int,
    total_tokens: int,
) -> torch.Tensor:
    # One BF16 row of length N per token, concatenated across all experts.
    out_numel = total_tokens * N
    return torch.empty(out_numel, dtype=torch.bfloat16, device=D_batched.device)
579+
580+
543581
# NVFP4 GEMM (A @ B^T with block-scaled FP4 inputs)
544582
torch.library.define(
545583
"bitsandbytes::gemm_nvfp4",

bitsandbytes/backends/cuda/ops.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1055,6 +1055,58 @@ def _(
10551055
return out
10561056

10571057

1058+
@register_kernel("bitsandbytes::moe_scatter_nvfp4", "cuda")
def _(
    packed_concat: torch.Tensor,
    expert_offsets: torch.Tensor,
    max_M: int,
    K: int,
    num_experts: int,
) -> torch.Tensor:
    """Scatter concatenated FP4 data to padded per-expert batched layout.

    The output is allocated uninitialized; the CUDA kernel zero-fills the
    padding rows itself.
    """
    # Packed FP4: two values per byte, so each row is K // 2 bytes.
    out = torch.empty(
        num_experts * max_M * (K // 2),
        dtype=torch.uint8,
        device=packed_concat.device,
    )
    with _cuda_device_of(packed_concat):
        args = (
            get_ptr(packed_concat),
            get_ptr(out),
            get_ptr(expert_offsets),
            ct.c_int(max_M),
            ct.c_int(K),
            ct.c_int(num_experts),
            _get_tensor_stream(packed_concat),
        )
        lib.cmoe_scatter_nvfp4(*args)
    return out
1082+
1083+
1084+
@register_kernel("bitsandbytes::moe_gather_bf16", "cuda")
def _(
    D_batched: torch.Tensor,
    expert_offsets: torch.Tensor,
    max_M: int,
    N: int,
    num_experts: int,
    total_tokens: int,
) -> torch.Tensor:
    """Gather BF16 results from padded per-expert layout to concatenated output."""
    # One row of N bf16 values per token across all experts.
    out = torch.empty(
        total_tokens * N,
        dtype=torch.bfloat16,
        device=D_batched.device,
    )
    with _cuda_device_of(D_batched):
        args = (
            get_ptr(D_batched),
            get_ptr(out),
            get_ptr(expert_offsets),
            ct.c_int(max_M),
            ct.c_int(N),
            ct.c_int(num_experts),
            _get_tensor_stream(D_batched),
        )
        lib.cmoe_gather_bf16(*args)
    return out
1108+
1109+
10581110
# Hand-written NVFP4 GEMM (SM_120+)
10591111
#
10601112
# Uses mma.sync.aligned.block_scale instructions for small-M decode.

bitsandbytes/functional.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,6 +1280,56 @@ def scale_to_blocked_batched(
12801280
)
12811281

12821282

1283+
def moe_scatter_nvfp4(
    packed_concat: torch.Tensor,
    expert_offsets: torch.Tensor,
    max_M: int,
    K: int,
    num_experts: int,
) -> torch.Tensor:
    """Scatter concatenated FP4 data into a padded per-expert batched layout.

    Thin wrapper over the ``bitsandbytes::moe_scatter_nvfp4`` custom op.

    Args:
        packed_concat: Packed FP4 data [total_tokens * K/2] (uint8).
        expert_offsets: Cumulative token offsets [num_experts + 1] (int32, device).
        max_M: Padded max tokens per expert (128-aligned).
        K: Hidden dimension.
        num_experts: Number of experts.

    Returns:
        Padded batched FP4 data [num_experts * max_M * K/2] (uint8, zero-padded).
    """
    return torch.ops.bitsandbytes.moe_scatter_nvfp4(
        packed_concat,
        expert_offsets,
        max_M,
        K,
        num_experts,
    )
1305+
1306+
1307+
def moe_gather_bf16(
    D_batched: torch.Tensor,
    expert_offsets: torch.Tensor,
    max_M: int,
    N: int,
    num_experts: int,
    total_tokens: int,
) -> torch.Tensor:
    """Gather BF16 results from a padded per-expert layout into a concatenated tensor.

    Thin wrapper over the ``bitsandbytes::moe_gather_bf16`` custom op.

    Args:
        D_batched: Batched BF16 output [num_experts * max_M * N] (bf16).
        expert_offsets: Cumulative token offsets [num_experts + 1] (int32, device).
        max_M: Padded max tokens per expert.
        N: Output dimension.
        num_experts: Number of experts.
        total_tokens: Total tokens across all experts.

    Returns:
        Concatenated BF16 output [total_tokens * N].
    """
    return torch.ops.bitsandbytes.moe_gather_bf16(
        D_batched,
        expert_offsets,
        max_M,
        N,
        num_experts,
        total_tokens,
    )
1331+
1332+
12831333
def dequantize_nvfp4(
12841334
packed_data: torch.Tensor,
12851335
quant_state: NVFP4QuantState,

csrc/qutlass/moe_scatter_gather.cu

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
/*
2+
* Scatter and gather kernels for MoE batched NVFP4 GEMM pipeline.
3+
*
4+
* Scatter: copies packed FP4 data from concatenated token layout to
5+
* padded per-expert batched layout. Zero-fills padding rows.
6+
*
7+
* Gather: copies BF16 results from padded per-expert batched layout
8+
* back to concatenated token layout.
9+
*
10+
* Both kernels use one threadblock per expert with vectorized 128-bit
11+
* (uint4) loads/stores for bandwidth efficiency.
12+
*/
13+
14+
#include <cuda_runtime.h>
15+
#include <cstdint>
16+
17+
// =========================================================================
18+
// Scatter: concatenated FP4 → padded per-expert batched FP4
19+
// =========================================================================
20+
// Each threadblock handles one expert. Threads cooperatively copy
21+
// n_tokens * row_bytes from the concatenated source to the padded
22+
// destination, then zero-fill padding rows.
23+
//
24+
// Data layout:
25+
// Input: packed_concat [total_tokens * row_bytes] contiguous
26+
// Output: packed_batched [num_experts * max_M * row_bytes] with zero padding
27+
//
28+
// row_bytes = K / 2 (packed FP4: 2 values per byte)
29+
__global__ void kMoeScatterNVFP4(
    const uint8_t* __restrict__ input,       // [total_tokens * row_bytes]
    uint8_t* __restrict__ output,            // [num_experts * max_M * row_bytes]
    const int* __restrict__ expert_offsets,  // [num_experts + 1] cumulative token offsets
    int max_M,                               // padded max tokens per expert
    int row_bytes                            // K / 2 (packed FP4: 2 values per byte)
) {
    // One threadblock per expert: copy that expert's token rows from the
    // concatenated source, then zero-fill the padding rows.
    const int expert = blockIdx.x;
    const int start = expert_offsets[expert];
    const int n_tokens = expert_offsets[expert + 1] - start;

    // Source: contiguous slice of the concatenated buffer.
    const uint8_t* src = input + (long long)start * row_bytes;
    // Destination: this expert's padded slot.
    uint8_t* dst = output + (long long)expert * max_M * row_bytes;

    const long long total_bytes = (long long)max_M * row_bytes;
    const long long data_bytes = (long long)n_tokens * row_bytes;

    const int tid = threadIdx.x;
    const int stride = blockDim.x;

    // uint4 (128-bit) accesses require 16-byte alignment. `src` starts at an
    // arbitrary token offset, so vectorize only when both pointers are
    // aligned (fix: the original dereferenced uint4 unconditionally, which is
    // a misaligned-address fault whenever start * row_bytes % 16 != 0).
    const bool src_dst_vec_ok =
        ((reinterpret_cast<uintptr_t>(src) | reinterpret_cast<uintptr_t>(dst)) & 15) == 0;
    const bool dst_vec_ok = (reinterpret_cast<uintptr_t>(dst) & 15) == 0;

    // ---- Copy the data region -------------------------------------------
    const long long vec_data_bytes = src_dst_vec_ok ? (data_bytes / 16) * 16 : 0;
    if (vec_data_bytes > 0) {
        const uint4* src4 = reinterpret_cast<const uint4*>(src);
        uint4* dst4 = reinterpret_cast<uint4*>(dst);
        const long long n_vec = vec_data_bytes / 16;
        for (long long i = tid; i < n_vec; i += stride) {
            dst4[i] = src4[i];
        }
    }
    // Scalar tail (and full copy when vectorization is unavailable).
    for (long long i = vec_data_bytes + tid; i < data_bytes; i += stride) {
        dst[i] = src[i];
    }

    // ---- Zero-fill the padding region -----------------------------------
    const long long pad_start = data_bytes;
    if (pad_start >= total_bytes) return;

    if (dst_vec_ok) {
        const long long aligned_pad_start = ((pad_start + 15) / 16) * 16;
        const long long vec_pad_end = (total_bytes / 16) * 16;

        // Scalar head: bytes before the first 16-byte boundary.
        for (long long i = pad_start + tid; i < aligned_pad_start && i < total_bytes; i += stride) {
            dst[i] = 0;
        }

        // Vectorized middle.
        if (vec_pad_end > aligned_pad_start) {
            const uint4 zero4 = make_uint4(0, 0, 0, 0);
            uint4* dst4 = reinterpret_cast<uint4*>(dst);
            const long long vec_end = vec_pad_end / 16;
            for (long long i = aligned_pad_start / 16 + tid; i < vec_end; i += stride) {
                dst4[i] = zero4;
            }
        }

        // Scalar tail. Clamp the start to pad_start: the original began at
        // vec_pad_end unconditionally, which would clobber data bytes when
        // total_bytes is not a multiple of 16.
        const long long tail_start = vec_pad_end > pad_start ? vec_pad_end : pad_start;
        for (long long i = tail_start + tid; i < total_bytes; i += stride) {
            dst[i] = 0;
        }
    } else {
        // Misaligned destination: plain byte-wise zero-fill.
        for (long long i = pad_start + tid; i < total_bytes; i += stride) {
            dst[i] = 0;
        }
    }
}
100+
101+
102+
// =========================================================================
103+
// Gather: padded per-expert BF16 → concatenated BF16
104+
// =========================================================================
105+
// Each threadblock handles one expert. Threads cooperatively copy
106+
// n_tokens * row_elems BF16 values from the padded batched output
107+
// to the concatenated result.
108+
//
109+
// Data layout:
110+
// Input: D_batched [num_experts * max_M * N] bf16
111+
// Output: D_concat [total_tokens * N] bf16
112+
//
113+
// row_bytes = N * 2 (bf16 = 2 bytes per element)
114+
__global__ void kMoeGatherBF16(
    const uint8_t* __restrict__ input,       // [num_experts * max_M * row_bytes]
    uint8_t* __restrict__ output,            // [total_tokens * row_bytes]
    const int* __restrict__ expert_offsets,  // [num_experts + 1] cumulative token offsets
    int max_M,                               // padded max tokens per expert
    int row_bytes                            // N * 2 (bf16 = 2 bytes per element)
) {
    // One threadblock per expert: copy that expert's rows from its padded
    // slot back into the concatenated result buffer.
    const int expert = blockIdx.x;
    const int start = expert_offsets[expert];
    const int n_tokens = expert_offsets[expert + 1] - start;

    if (n_tokens <= 0) return;  // empty expert: nothing to gather

    // Source: this expert's padded slot; destination: contiguous slice.
    const uint8_t* src = input + (long long)expert * max_M * row_bytes;
    uint8_t* dst = output + (long long)start * row_bytes;

    const long long data_bytes = (long long)n_tokens * row_bytes;
    const int tid = threadIdx.x;
    const int stride = blockDim.x;

    // uint4 (128-bit) accesses require 16-byte alignment. `dst` starts at an
    // arbitrary token offset, so vectorize only when both pointers are
    // aligned (fix: the original dereferenced uint4 unconditionally, which is
    // a misaligned-address fault whenever start * row_bytes % 16 != 0).
    const bool vec_ok =
        ((reinterpret_cast<uintptr_t>(src) | reinterpret_cast<uintptr_t>(dst)) & 15) == 0;
    const long long vec_bytes = vec_ok ? (data_bytes / 16) * 16 : 0;

    if (vec_bytes > 0) {
        const uint4* src4 = reinterpret_cast<const uint4*>(src);
        uint4* dst4 = reinterpret_cast<uint4*>(dst);
        const long long n_vec = vec_bytes / 16;
        for (long long i = tid; i < n_vec; i += stride) {
            dst4[i] = src4[i];
        }
    }

    // Scalar tail (and full copy when vectorization is unavailable).
    for (long long i = vec_bytes + tid; i < data_bytes; i += stride) {
        dst[i] = src[i];
    }
}
154+
155+
156+
// =========================================================================
157+
// extern "C" launchers
158+
// =========================================================================
159+
160+
// C entry point: scatter concatenated packed-FP4 rows into the padded
// per-expert batched layout. All pointers are device pointers; the launch is
// asynchronous on `stream`.
//
// Preconditions: K is even (two FP4 values per byte); expert_offsets has
// num_experts + 1 int32 entries resident on the device.
extern "C" void cmoe_scatter_nvfp4(
    const void* input,
    void* output,
    const int* expert_offsets,
    int max_M,
    int K,
    int num_experts,
    cudaStream_t stream
) {
    // A zero-dimension grid is an invalid launch configuration; with no
    // experts there is nothing to scatter anyway.
    if (num_experts <= 0) return;

    const int row_bytes = K / 2;  // packed FP4: 2 values per byte

    // One threadblock per expert; 256 threads cooperatively copy the
    // expert's rows and zero-fill its padding.
    const dim3 grid(num_experts);
    const dim3 block(256);

    kMoeScatterNVFP4<<<grid, block, 0, stream>>>(
        static_cast<const uint8_t*>(input),
        static_cast<uint8_t*>(output),
        expert_offsets,
        max_M,
        row_bytes
    );
}
183+
184+
// C entry point: gather BF16 rows from the padded per-expert batched layout
// back into a concatenated buffer. All pointers are device pointers; the
// launch is asynchronous on `stream`.
extern "C" void cmoe_gather_bf16(
    const void* input,
    void* output,
    const int* expert_offsets,
    int max_M,
    int N,
    int num_experts,
    cudaStream_t stream
) {
    // A zero-dimension grid is an invalid launch configuration; with no
    // experts there is nothing to gather anyway.
    if (num_experts <= 0) return;

    const int row_bytes = N * 2;  // bf16: 2 bytes per element

    // One threadblock per expert; 256 threads cooperatively copy rows.
    const dim3 grid(num_experts);
    const dim3 block(256);

    kMoeGatherBF16<<<grid, block, 0, stream>>>(
        static_cast<const uint8_t*>(input),
        static_cast<uint8_t*>(output),
        expert_offsets,
        max_M,
        row_bytes
    );
}

0 commit comments

Comments
 (0)