Commit a6aba5e

Update on "Add W4A8 INT8 activation kernels for batched MoE prefill"
INT8 tensor-core variants of the batched MoE GEMM kernels: bf16 activations are dynamically quantized to INT8 per row, per K-tile, and INT4 weights are dequantized directly to INT8 (skipping the bf16 conversion). Accumulation uses tl.dot(int8, int8) → int32 with a per-tile float32 rescale. 1.7× MoE speedup on A100 at M=1024, with 0.9998 cosine similarity vs. the bf16 baseline.

Co-authored-by: Claude <noreply@anthropic.com>

[ghstack-poisoned]
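For readers skimming the message, the quantization scheme it describes reduces to the following math. This is a NumPy reference sketch only, with made-up shapes and names (a, w_int4, w_scale); it is not the Triton kernel added in this commit.

import numpy as np

# Reference-only sketch of the W4A8 math in the commit message: per-row INT8
# activation quantization, INT4 -> INT8 weight dequantization, int32
# accumulation, float32 rescale. Illustrative names, not the kernel code.
rng = np.random.default_rng(0)
M, K, N = 64, 128, 96

a = rng.standard_normal((M, K)).astype(np.float32)        # activations (stand-in for bf16)
w_int4 = rng.integers(-8, 8, size=(K, N), dtype=np.int8)  # INT4 weight values
w_scale = np.float32(0.05)                                 # assumed per-tile weight scale

# Dynamic per-row activation quantization to INT8.
a_scale = np.abs(a).max(axis=1, keepdims=True) / 127.0     # per-row scale
a_scale = np.maximum(a_scale, 1e-8)
a_int8 = np.clip(np.rint(a / a_scale), -127, 127).astype(np.int8)

# INT4 weights "dequantize directly to INT8": the stored values already fit in int8.
b_int8 = w_int4.astype(np.int8)

# INT8 x INT8 matmul with int32 accumulation (what tl.dot(int8, int8) yields).
acc_i32 = a_int8.astype(np.int32) @ b_int8.astype(np.int32)

# Per-tile float32 rescale back to real values.
out_w4a8 = acc_i32.astype(np.float32) * a_scale * w_scale

# Compare against the unquantized-activation baseline.
out_ref = a @ (w_int4.astype(np.float32) * w_scale)
cos = (out_w4a8 * out_ref).sum() / (np.linalg.norm(out_w4a8) * np.linalg.norm(out_ref))
print(f"cosine similarity vs float baseline: {cos:.4f}")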
1 parent 0a61d6d commit a6aba5e

1 file changed: 14 additions & 8 deletions

backends/cuda/triton/kernels/fused_moe.py
@@ -707,11 +707,11 @@ def _fused_moe_batched_kernel(
 # Autotune configs for batched INT8 GEMM1 (gate+up projection, W4A8).
 _BATCHED_GEMM1_INT8_CONFIGS = [
     triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128}, num_warps=4, num_stages=3),
-    triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128}, num_warps=4, num_stages=2),
-    triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64}, num_warps=4, num_stages=3),
     triton.Config(
-        {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64}, num_warps=4, num_stages=3
+        {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128}, num_warps=4, num_stages=2
     ),
+    triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64}, num_warps=4, num_stages=3),
+    triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64}, num_warps=4, num_stages=3),
     triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=4),
     triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=4),
 ]
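Not part of this diff, but for context: a config list like _BATCHED_GEMM1_INT8_CONFIGS is normally attached to a kernel through triton.autotune, which benchmarks every config for each new autotune key and caches the winner. A minimal illustrative example with a made-up kernel and key:

import triton
import triton.language as tl

# Illustrative only: shows how a config list such as _BATCHED_GEMM1_INT8_CONFIGS
# is usually consumed. The kernel and key below are invented for the example.
_CONFIGS = [
    triton.Config({"BLOCK_SIZE": 256}, num_warps=4, num_stages=2),
    triton.Config({"BLOCK_SIZE": 512}, num_warps=8, num_stages=2),
]

@triton.autotune(configs=_CONFIGS, key=["n_elements"])  # re-tunes when n_elements changes
@triton.jit
def _scale_kernel(x_ptr, out_ptr, scale, n_elements, BLOCK_SIZE: tl.constexpr):
    offs = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offs < n_elements
    x = tl.load(x_ptr + offs, mask=mask)
    tl.store(out_ptr + offs, x * scale, mask=mask)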
@@ -833,7 +833,10 @@ def _fused_moe_batched_int8_kernel(
         else:
             # Multi-group tile: dequantize weights per group, use float matmul
             b_dequant = (b_int8.to(tl.float32) * b_scale).to(compute_type)
-            acc += tl.dot(a_int8.to(compute_type), b_dequant).to(tl.float32) * a_scale[:, None]
+            acc += (
+                tl.dot(a_int8.to(compute_type), b_dequant).to(tl.float32)
+                * a_scale[:, None]
+            )
 
         a_ptrs += BLOCK_SIZE_K * stride_ak
         b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk
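The multi-group fallback above applies a_scale[:, None] after the dot; that is valid because the per-row activation scale is constant along K within a tile, so it factors out of the matmul. A small NumPy check of that identity (illustrative shapes, not code from this file):

import numpy as np

# Why `* a_scale[:, None]` after the dot is valid: the per-row activation scale
# is constant along K within a tile, so it factors out of the matmul.
rng = np.random.default_rng(1)
M, K, N = 8, 32, 16
a_int8 = rng.integers(-127, 128, (M, K)).astype(np.float32)
b_dequant = rng.standard_normal((K, N)).astype(np.float32)  # already weight-scaled
a_scale = rng.uniform(0.01, 0.1, (M, 1)).astype(np.float32)

lhs = (a_int8 * a_scale) @ b_dequant   # scale activations before the matmul
rhs = (a_int8 @ b_dequant) * a_scale   # scale the accumulator per row afterwards
assert np.allclose(lhs, rhs, rtol=1e-5)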
@@ -977,11 +980,11 @@ def _fused_moe_silu_batched_kernel(
 # Autotune configs for batched INT8 GEMM2 (down projection + SiLU, W4A8).
 _BATCHED_GEMM2_INT8_CONFIGS = [
     triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128}, num_warps=4, num_stages=2),
-    triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128}, num_warps=4, num_stages=2),
-    triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64}, num_warps=4, num_stages=3),
     triton.Config(
-        {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64}, num_warps=4, num_stages=3
+        {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128}, num_warps=4, num_stages=2
     ),
+    triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64}, num_warps=4, num_stages=3),
+    triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64}, num_warps=4, num_stages=3),
     triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=4),
     triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=4),
 ]
@@ -1105,7 +1108,10 @@ def _fused_moe_silu_batched_int8_kernel(
         else:
             # Multi-group tile: dequantize weights per group, use float matmul
             b_dequant = (b_int8.to(tl.float32) * b_scale).to(compute_type)
-            acc += tl.dot(a_int8.to(compute_type), b_dequant).to(tl.float32) * a_scale[:, None]
+            acc += (
+                tl.dot(a_int8.to(compute_type), b_dequant).to(tl.float32)
+                * a_scale[:, None]
+            )
 
         a_gate_ptrs += BLOCK_SIZE_K * stride_ak
         a_up_ptrs += BLOCK_SIZE_K * stride_ak
