4242
4343
# Autotune configs for GEMM1 (_fused_moe_kernel).
# Qwen3.5 MoE dimensions (M=1, N=1024, K=2048, 8 experts, group_size=32).
# NOTE(review): all BLOCK_SIZE_K values are <= 32; presumably this keeps
# BLOCK_K <= group_size so the kernel takes its one-scale-per-tile fast
# path — confirm against the BLOCK_SIZE_K <= group_size branch in the kernel.
_GEMM1_CONFIGS = [
    triton.Config({"BLOCK_SIZE_N": 8, "BLOCK_SIZE_K": 32}, num_warps=2, num_stages=2),
    triton.Config({"BLOCK_SIZE_N": 8, "BLOCK_SIZE_K": 32}, num_warps=2, num_stages=4),
    triton.Config({"BLOCK_SIZE_N": 8, "BLOCK_SIZE_K": 32}, num_warps=2, num_stages=5),
    triton.Config({"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 32}, num_warps=2, num_stages=3),
    triton.Config({"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 32}, num_warps=2, num_stages=4),
    triton.Config({"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=3),
    triton.Config({"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32}, num_warps=2, num_stages=3),
    triton.Config({"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32}, num_warps=2, num_stages=4),
    triton.Config({"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=2),
    triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=3),
    triton.Config({"BLOCK_SIZE_N": 8, "BLOCK_SIZE_K": 16}, num_warps=2, num_stages=4),
    triton.Config({"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 16}, num_warps=2, num_stages=5),
]
5761
# Autotune configs for GEMM2 (_fused_moe_silu_kernel).
# Qwen3.5 MoE dimensions (M=1, N=2048, K=512, 8 experts, group_size=32).
# NOTE(review): all BLOCK_SIZE_K values are <= 32; presumably this keeps
# BLOCK_K <= group_size so the kernel takes its one-scale-per-tile fast
# path — confirm against the BLOCK_SIZE_K <= group_size branch in the kernel.
_GEMM2_CONFIGS = [
    triton.Config({"BLOCK_SIZE_N": 8, "BLOCK_SIZE_K": 32}, num_warps=2, num_stages=2),
    triton.Config({"BLOCK_SIZE_N": 8, "BLOCK_SIZE_K": 32}, num_warps=2, num_stages=4),
    triton.Config({"BLOCK_SIZE_N": 8, "BLOCK_SIZE_K": 32}, num_warps=2, num_stages=5),
    triton.Config({"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 32}, num_warps=2, num_stages=3),
    triton.Config({"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 32}, num_warps=2, num_stages=4),
    triton.Config({"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=3),
    triton.Config({"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32}, num_warps=2, num_stages=3),
    triton.Config({"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32}, num_warps=2, num_stages=4),
    triton.Config({"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=2),
    triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=3),
    triton.Config({"BLOCK_SIZE_N": 8, "BLOCK_SIZE_K": 16}, num_warps=2, num_stages=4),
    triton.Config({"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 16}, num_warps=2, num_stages=5),
]
7179
7280
@@ -76,7 +84,7 @@ def _fused_moe_kernel(
7684 # Pointers
7785 A , # [M, K] bf16 activations
7886 B , # [E, N, K//2] int8 packed INT4 weights
79- C , # [M * top_k , N] bf16 output
87+ C , # [M, N] fp32 output (atomic accumulation across experts)
8088 B_scale , # [E, N, K//group_size] bf16 scales
8189 topk_ids , # [M * top_k] int64 expert indices
8290 topk_weights , # [M * top_k] float32 router weights
@@ -144,11 +152,16 @@ def _fused_moe_kernel(
144152 k_remaining = K - k_step * BLOCK_SIZE_K
145153 k_mask = offs_k < k_remaining
146154
147- # Load A tile [BLOCK_SIZE_K]
148- a = tl .load (a_ptrs , mask = k_mask , other = 0.0 )
155+ # Load A tile [BLOCK_SIZE_K] — reused across N-blocks, keep in L2
156+ a = tl .load (a_ptrs , mask = k_mask , other = 0.0 , eviction_policy = "evict_last" )
149157
150- # Load B tile [BLOCK_SIZE_K, BLOCK_SIZE_N] and unpack INT4
151- b = tl .load (b_ptrs , mask = k_mask [:, None ] & n_mask [None , :], other = 0 )
158+ # Load B tile [BLOCK_SIZE_K, BLOCK_SIZE_N] and unpack INT4 — streaming
159+ b = tl .load (
160+ b_ptrs ,
161+ mask = k_mask [:, None ] & n_mask [None , :],
162+ other = 0 ,
163+ eviction_policy = "evict_first" ,
164+ )
152165 b = (b >> b_shifter ) & 0xF
153166
154167 # Load per-group scales and dequantize
@@ -161,9 +174,12 @@ def _fused_moe_kernel(
161174 + offs_n [None , :] * stride_bsn
162175 + group_idx * stride_bsk
163176 )
164- b_scale = tl .load (scale_ptrs , mask = n_mask [None , :], other = 0.0 ).to (
165- tl .float32
166- )
177+ b_scale = tl .load (
178+ scale_ptrs ,
179+ mask = n_mask [None , :],
180+ other = 0.0 ,
181+ eviction_policy = "evict_first" ,
182+ ).to (tl .float32 )
167183 else :
168184 scale_ptrs = (
169185 B_scale
@@ -172,7 +188,10 @@ def _fused_moe_kernel(
172188 + ((offs_k [:, None ] + BLOCK_SIZE_K * k_step ) // group_size ) * stride_bsk
173189 )
174190 b_scale = tl .load (
175- scale_ptrs , mask = k_mask [:, None ] & n_mask [None , :], other = 0.0
191+ scale_ptrs ,
192+ mask = k_mask [:, None ] & n_mask [None , :],
193+ other = 0.0 ,
194+ eviction_policy = "evict_first" ,
176195 ).to (tl .float32 )
177196
178197 # Dequantize and accumulate in float32: vector-matrix multiply
@@ -222,6 +241,7 @@ def _fused_moe_silu_kernel(
222241 group_size : tl .constexpr ,
223242 BLOCK_SIZE_N : tl .constexpr ,
224243 BLOCK_SIZE_K : tl .constexpr ,
244+ top_k : tl .constexpr ,
225245 compute_type : tl .constexpr ,
226246):
227247 """GEMM2 with fused SiLU activation.
@@ -263,13 +283,22 @@ def _fused_moe_silu_kernel(
263283 k_remaining = K - k_step * BLOCK_SIZE_K
264284 k_mask = offs_k < k_remaining
265285
266- # Load gate and up in float32, apply SiLU(gate) * up
267- gate = tl .load (a_gate_ptrs , mask = k_mask , other = 0.0 ).to (tl .float32 )
268- up = tl .load (a_up_ptrs , mask = k_mask , other = 0.0 ).to (tl .float32 )
286+ # Load gate and up in float32, apply SiLU(gate) * up — reused across N-blocks
287+ gate = tl .load (
288+ a_gate_ptrs , mask = k_mask , other = 0.0 , eviction_policy = "evict_last"
289+ ).to (tl .float32 )
290+ up = tl .load (
291+ a_up_ptrs , mask = k_mask , other = 0.0 , eviction_policy = "evict_last"
292+ ).to (tl .float32 )
269293 a = gate * tl .sigmoid (gate ) * up
270294
271- # Load and dequantize INT4 weights
272- b = tl .load (b_ptrs , mask = k_mask [:, None ] & n_mask [None , :], other = 0 )
295+ # Load and dequantize INT4 weights — streaming
296+ b = tl .load (
297+ b_ptrs ,
298+ mask = k_mask [:, None ] & n_mask [None , :],
299+ other = 0 ,
300+ eviction_policy = "evict_first" ,
301+ )
273302 b = (b >> b_shifter ) & 0xF
274303
275304 if BLOCK_SIZE_K <= group_size :
@@ -280,9 +309,12 @@ def _fused_moe_silu_kernel(
280309 + offs_n [None , :] * stride_bsn
281310 + group_idx * stride_bsk
282311 )
283- b_scale = tl .load (scale_ptrs , mask = n_mask [None , :], other = 0.0 ).to (
284- tl .float32
285- )
312+ b_scale = tl .load (
313+ scale_ptrs ,
314+ mask = n_mask [None , :],
315+ other = 0.0 ,
316+ eviction_policy = "evict_first" ,
317+ ).to (tl .float32 )
286318 else :
287319 scale_ptrs = (
288320 B_scale
@@ -291,7 +323,10 @@ def _fused_moe_silu_kernel(
291323 + ((offs_k [:, None ] + BLOCK_SIZE_K * k_step ) // group_size ) * stride_bsk
292324 )
293325 b_scale = tl .load (
294- scale_ptrs , mask = k_mask [:, None ] & n_mask [None , :], other = 0.0
326+ scale_ptrs ,
327+ mask = k_mask [:, None ] & n_mask [None , :],
328+ other = 0.0 ,
329+ eviction_policy = "evict_first" ,
295330 ).to (tl .float32 )
296331
297332 b_dequant = (b .to (tl .float32 ) - 8.0 ) * b_scale
@@ -301,12 +336,13 @@ def _fused_moe_silu_kernel(
301336 a_up_ptrs += BLOCK_SIZE_K * stride_ak
302337 b_ptrs += (BLOCK_SIZE_K // 2 ) * stride_bk
303338
304- # Multiply by router weight
339+ # Multiply by router weight and atomically accumulate into token row
305340 weight = tl .load (topk_weights + pair_idx )
306341 acc = acc * weight
307342
308- c_ptrs = C + pair_idx * stride_cm + offs_n * stride_cn
309- tl .store (c_ptrs , acc .to (compute_type ), mask = n_mask )
343+ token_idx = pair_idx // top_k
344+ c_ptrs = C + token_idx * stride_cm + offs_n * stride_cn
345+ tl .atomic_add (c_ptrs , acc , mask = n_mask )
310346
311347
312348# ---------------------------------------------------------------------------
@@ -394,17 +430,16 @@ def grid1(meta):
394430 )
395431
396432 # ---- GEMM2 with fused SiLU: reads gate+up from cache1, no intermediate buffer ----
397- cache3 = torch .empty (
398- num_pairs , N2 , dtype = hidden_states .dtype , device = hidden_states .device
399- )
433+ # Zero-init FP32 buffer — atomic_add in the kernel accumulates across top_k experts
434+ output = torch .zeros (M , N2 , dtype = torch .float32 , device = hidden_states .device )
400435
401436 def grid2 (meta ):
402437 return (num_pairs * triton .cdiv (N2 , meta ["BLOCK_SIZE_N" ]),)
403438
404439 wrap_triton (_fused_moe_silu_kernel )[grid2 ](
405440 cache1 ,
406441 w2 ,
407- cache3 ,
442+ output ,
408443 w2_scale ,
409444 topk_ids_flat ,
410445 topk_weights_flat ,
@@ -416,17 +451,17 @@ def grid2(meta):
416451 stride_be = w2 .stride (0 ),
417452 stride_bk = w2 .stride (2 ),
418453 stride_bn = w2 .stride (1 ),
419- stride_cm = cache3 .stride (0 ),
420- stride_cn = cache3 .stride (1 ),
454+ stride_cm = output .stride (0 ),
455+ stride_cn = output .stride (1 ),
421456 stride_bse = w2_scale .stride (0 ),
422457 stride_bsk = w2_scale .stride (2 ),
423458 stride_bsn = w2_scale .stride (1 ),
424459 group_size = group_size ,
460+ top_k = top_k ,
425461 compute_type = tl .bfloat16 ,
426462 )
427463
428- # ---- Sum across top-k experts ----
429- return cache3 .view (M , top_k , N2 ).sum (dim = 1 )
464+ return output .to (hidden_states .dtype )
430465
431466
432467@fused_moe .register_fake
0 commit comments