Skip to content

Commit 95ec556

Browse files
TimDettmers and claude committed
Add weighted gather kernel to moe_scatter_gather.cu
Port cmoe_weighted_gather_bf16 from SM_120 branch. This adds a fused gather + weight multiply + FP32 accumulate + BF16 convert kernel that replaces separate gather + scale + sum operations. The kernel uses atomicAdd for cross-expert accumulation with minimal contention. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 7d04203 commit 95ec556

File tree

1 file changed

+115
-2
lines changed

1 file changed

+115
-2
lines changed

csrc/qutlass/moe_scatter_gather.cu

Lines changed: 115 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,23 @@
11
/*
22
* Scatter and gather kernels for MoE batched NVFP4 GEMM pipeline.
33
*
4-
* Scatter: copies packed FP4 data from concatenated token layout to
4+
* Scatter: copies packed FP4/uint8 data from concatenated token layout to
55
* padded per-expert batched layout. Zero-fills padding rows.
6+
* Works for both packed FP4 activations (row_bytes = K/2) and
7+
* scale factors (same kernel, different row_bytes).
68
*
79
* Gather: copies BF16 results from padded per-expert batched layout
810
* back to concatenated token layout.
911
*
10-
* Both kernels use one threadblock per expert with vectorized 128-bit
12+
* Weighted gather: fused gather + multiply by expert gating weight +
13+
* atomicAdd into output. Single kernel replaces gather + scale + sum.
14+
*
15+
* All kernels use one threadblock per expert with vectorized 128-bit
1116
* (uint4) loads/stores for bandwidth efficiency.
1217
*/
1318

1419
#include <cuda_runtime.h>
20+
#include <cuda_bf16.h>
1521
#include <cstdint>
1622

1723
// =========================================================================
@@ -153,6 +159,62 @@ __global__ void kMoeGatherBF16(
153159
}
154160

155161

162+
// =========================================================================
163+
// Weighted gather: padded per-expert BF16 → FP32 accumulate → BF16 output
164+
// =========================================================================
165+
// Two-phase operation (both launched from one extern "C" call):
166+
// Phase 1: kMoeWeightedGatherAccum — read BF16 expert output, multiply by
167+
// gating weight, atomicAdd into FP32 workspace.
168+
// Phase 2: kConvertFP32ToBF16 — convert FP32 workspace to BF16 output.
169+
//
170+
// Uses a token-parallel layout: grid = (total_assignments,) where each
171+
// assignment is a (token_id, expert_id, weight) triple. Atomic contention
172+
// is minimal — at most top_k experts write to the same token row, and with
173+
// N=4096 elements spread across 256 threads, collisions are rare.
174+
//
175+
// FP32 accumulation avoids BF16 rounding error across top_k additions.
176+
// The final conversion to BF16 rounds once at the end.
177+
178+
// One threadblock per (token, expert, slot, weight) assignment: scale the
// expert's BF16 output row by its gating weight and atomically accumulate
// it into the token's FP32 workspace row.
//
// Grid layout: grid.x = total_assignments, block = 1D (any size).
// Contention on atomicAdd is low: at most top_k assignments target the
// same token row, and lanes within a block touch disjoint elements.
__global__ void kMoeWeightedGatherAccum(
    const __nv_bfloat16* __restrict__ D_batched, // [num_experts * max_M * N]
    float* __restrict__ workspace,               // [num_tokens * N] fp32, zero-initialized
    const int* __restrict__ token_ids,           // [total_assignments]
    const int* __restrict__ expert_ids,          // [total_assignments]
    const int* __restrict__ slot_ids,            // [total_assignments]
    const float* __restrict__ weights,           // [total_assignments]
    int max_M,
    int N
) {
    const int a = blockIdx.x;
    const float gate = weights[a];

    // Source row inside this expert's padded output (64-bit arithmetic to
    // avoid overflow on large expert counts / hidden dims).
    const long long src_row = (long long)expert_ids[a] * max_M + slot_ids[a];
    const __nv_bfloat16* row_in = D_batched + src_row * N;
    // Destination row in the per-token FP32 accumulator.
    float* row_out = workspace + (long long)token_ids[a] * N;

    // Block-stride sweep over the hidden dimension.
    for (int j = threadIdx.x; j < N; j += blockDim.x) {
        atomicAdd(&row_out[j], __bfloat162float(row_in[j]) * gate);
    }
}
205+
206+
// Element-wise FP32 -> BF16 conversion: the single, final rounding step of
// the weighted gather pipeline.
//
// Fix: the original computed `blockIdx.x * blockDim.x + threadIdx.x` in
// 32-bit int, which overflows for element counts approaching 2^31, and it
// required the grid to exactly cover the input. This version widens
// `n_elements` to long long (int arguments from existing callers convert
// implicitly, so call sites remain valid) and uses a 64-bit grid-stride
// loop, so any grid size covers all elements.
__global__ void kConvertFP32ToBF16(
    const float* __restrict__ input,     // [n_elements]
    __nv_bfloat16* __restrict__ output,  // [n_elements]
    long long n_elements
) {
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         idx < n_elements;
         idx += stride) {
        output[idx] = __float2bfloat16(input[idx]);
    }
}
216+
217+
156218
// =========================================================================
157219
// extern "C" launchers
158220
// =========================================================================
@@ -203,3 +265,54 @@ extern "C" void cmoe_gather_bf16(
203265
row_bytes
204266
);
205267
}
268+
269+
// Fused weighted gather: for each (token, expert, slot, weight) assignment,
// accumulate weight * D_batched[expert][slot][:] into the token's output row.
// FP32 accumulation in a caller-provided workspace avoids BF16 rounding
// error across top_k additions; a second kernel rounds to BF16 once at the
// end. All pointers are device pointers; both kernels and the memset are
// enqueued on `stream` — no host synchronization happens here.
//
// Fix: the original computed `int n_elements = num_tokens * N`, which
// overflows 32-bit int for large batches (e.g. N = 4096 and > ~512K tokens)
// before the cast to size_t in the memset. The element count is now 64-bit
// throughout, and degenerate sizes (num_tokens/N <= 0) return early.
extern "C" void cmoe_weighted_gather_bf16(
    const void* D_batched,     // [num_experts * max_M * N] bf16
    void* output_bf16,         // [num_tokens * N] bf16, final output
    float* workspace_fp32,     // [num_tokens * N] fp32, caller-managed scratch
    const int* token_ids,      // [total_assignments]
    const int* expert_ids,     // [total_assignments]
    const int* slot_ids,       // [total_assignments]
    const float* weights,      // [total_assignments]
    int total_assignments,
    int num_tokens,
    int max_M,
    int N,
    cudaStream_t stream
) {
    if (total_assignments <= 0 || num_tokens <= 0 || N <= 0) return;

    // 64-bit element count — `num_tokens * N` must not be formed in int.
    const long long n_elements = (long long)num_tokens * N;

    // Zero the FP32 accumulator before phase 1.
    cudaMemsetAsync(workspace_fp32, 0, (size_t)n_elements * sizeof(float), stream);

    // Phase 1: one block per assignment; weighted atomicAdd into workspace.
    {
        dim3 grid(total_assignments);
        dim3 block(256);

        kMoeWeightedGatherAccum<<<grid, block, 0, stream>>>(
            static_cast<const __nv_bfloat16*>(D_batched),
            workspace_fp32,
            token_ids,
            expert_ids,
            slot_ids,
            weights,
            max_M,
            N
        );
    }

    // Phase 2: round the FP32 accumulator to BF16 exactly once.
    {
        const int threads = 256;
        const long long blocks = (n_elements + threads - 1) / threads;

        kConvertFP32ToBF16<<<(unsigned int)blocks, threads, 0, stream>>>(
            workspace_fp32,
            static_cast<__nv_bfloat16*>(output_bf16),
            n_elements
        );
    }
}

0 commit comments

Comments
 (0)