Commit dbcc10f

Update on "Add W4A8 INT8 activation kernels for batched MoE prefill"
INT8 tensor-core variants of the batched MoE GEMM kernels that dynamically quantize bf16 activations to INT8 per-row, per-tile and dequantize INT4 weights directly to INT8 (skipping the bf16 conversion). Uses tl.dot(int8, int8) → int32 accumulation with a per-tile float32 rescale. 1.7× MoE speedup on A100 at M=1024 with 0.9998 cosine similarity vs the bf16 baseline.

Co-authored-by: Claude <noreply@anthropic.com>

[ghstack-poisoned]
1 parent 594009d · commit dbcc10f
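For intuition, here is a hedged reference sketch of the W4A8 math the commit message describes, written in plain PyTorch rather than Triton. This is not the kernel from this commit: the function name, the unpacked [-8, 7] INT4 weight layout, and the per-group float32 weight scales (group_size=32, as set in model.py below) are assumptions for illustration; the real kernel quantizes per row per tile inside the GEMM and accumulates with tl.dot on tensor cores.

    import torch

    def w4a8_moe_gemm_reference(x_bf16, w_int4, w_scales, group_size=32):
        """Per-row dynamic INT8 activation quant + INT8xINT8->INT32 GEMM + f32 rescale."""
        # x_bf16:   [M, K]  bf16 activations
        # w_int4:   [N, K]  int8 tensor holding unpacked INT4 values in [-8, 7] (assumed layout)
        # w_scales: [N, K // group_size]  float32 per-group weight scales (assumed layout)
        M, K = x_bf16.shape
        N = w_int4.shape[0]

        # 1) Dynamic, symmetric per-row INT8 quantization of the activations.
        x_f32 = x_bf16.float()
        x_scale = x_f32.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / 127.0   # [M, 1]
        x_int8 = torch.clamp((x_f32 / x_scale).round(), -128, 127).to(torch.int8)

        # 2) INT8 x INT8 -> INT32 accumulation per K-group (stand-in for tl.dot over
        #    K-tiles). The INT4 weight values are used directly as INT8 operands; no
        #    intermediate bf16 dequantization.
        out = torch.zeros(M, N, dtype=torch.float32)
        for g in range(K // group_size):
            ks = slice(g * group_size, (g + 1) * group_size)
            xg = x_int8[:, ks].to(torch.int32)                                    # [M, Kg]
            wg = w_int4[:, ks].to(torch.int32)                                    # [N, Kg]
            acc = (xg.unsqueeze(1) * wg.unsqueeze(0)).sum(-1, dtype=torch.int32)  # [M, N]
            # 3) Per-tile float32 rescale with this group's weight scale.
            out += acc.float() * w_scales[:, g]                                   # broadcast [N]

        # 4) Fold in the per-row activation scale and return bf16.
        return (out * x_scale).to(torch.bfloat16)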

3 files changed

Lines changed: 13 additions & 8 deletions

backends/cuda/tests/test_fused_moe.py

Lines changed: 5 additions & 0 deletions
@@ -213,6 +213,11 @@ def _run_cpp_runner(runner_path, pte_path, ptd_path, input_files, output_base):
 
 
 class TestFusedMoE(unittest.TestCase):
+    # TODO: migrate from manual max_abs/max_ref relative checks to
+    # torch.allclose(atol=, rtol=). Current tests use per-tensor-max relative
+    # error which is looser than per-element allclose — need to calibrate atol
+    # for INT4 quantization noise floor across random weight magnitudes.
+
     def setUp(self):
         if not torch.cuda.is_available():
             self.skipTest("CUDA is not available")
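A note on the TODO added in this hunk: below is a minimal sketch of the two accuracy checks it contrasts, with illustrative names and tolerances (the real atol would need the calibration the TODO calls for).

    import torch

    def per_tensor_max_relative_ok(out, ref, threshold=2e-2):
        # Current style: worst absolute error normalized by the largest reference
        # magnitude, so small-magnitude elements are judged very leniently.
        max_abs = (out - ref).abs().max()
        max_ref = ref.abs().max().clamp(min=1e-12)
        return (max_abs / max_ref).item() < threshold

    def per_element_allclose_ok(out, ref, atol=1e-2, rtol=2e-2):
        # Target style: every element must satisfy |out - ref| <= atol + rtol * |ref|,
        # so atol has to sit above the INT4 quantization noise floor or the test
        # will flake on unlucky random weight draws.
        return torch.allclose(out.float(), ref.float(), atol=atol, rtol=rtol)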

examples/models/qwen3_5_moe/export.py

Lines changed: 6 additions & 6 deletions
@@ -535,12 +535,12 @@ def _apply_turboquant(model, config):
 # ---------------------------------------------------------------------------
 
 
-def _set_batched_moe(model, enabled, activation_dtype="bf16"):
+def _set_batched_moe(model, enabled, moe_activation_dtype="bf16"):
     """Toggle batched tensor-core MoE kernel for all MoE layers."""
     for layer in model.layers:
         if hasattr(layer, "mlp") and hasattr(layer.mlp, "experts"):
             layer.mlp.experts.use_batched_moe = enabled
-            layer.mlp.experts.activation_dtype = activation_dtype
+            layer.mlp.experts.moe_activation_dtype = moe_activation_dtype
 
 
 def export_and_lower(model, config, args):
@@ -783,8 +783,8 @@ def _export_cuda(model, config, args):
     # chunk_gated_delta_rule with CHUNK_SIZE=64) for the full range of sequence
     # lengths. Smaller examples cause AOTI to bake in intermediate buffer sizes
     # that reject longer prompts at runtime.
-    activation_dtype = getattr(args, "activation_dtype", "bf16")
-    _set_batched_moe(model, True, activation_dtype=activation_dtype)
+    moe_activation_dtype = getattr(args, "moe_activation_dtype", "bf16")
+    _set_batched_moe(model, True, moe_activation_dtype=moe_activation_dtype)
     print("Exporting prefill method...")
 
     example_prefill_len = config.max_seq_len - 1
@@ -949,10 +949,10 @@ def main(): # noqa: C901
         help="Disable split-K (flash-decoding) SDPA for decode; use tiled SDPA instead.",
     )
     parser.add_argument(
-        "--activation-dtype",
+        "--moe-activation-dtype",
         choices=["bf16", "int8"],
         default="bf16",
-        help="Activation dtype for batched MoE prefill kernels (bf16=W4A16, int8=W4A8).",
+        help="MoE activation dtype for prefill only. Decode always uses bf16. bf16 (default): W4A16 batched GEMM. int8: W4A8 with INT8 tensor cores (~1.5x faster prefill).",
     )
     args = parser.parse_args()
 
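To make the new help text concrete, here is a hedged sketch of how the flag plausibly plays out across the two exported methods. Only the prefill half appears in this diff; keeping decode on bf16 by leaving the batched kernel disabled is an assumption, and export_prefill/export_decode are placeholders rather than functions from this file.

    # Sketch only, based on the --moe-activation-dtype help text above.
    moe_activation_dtype = getattr(args, "moe_activation_dtype", "bf16")

    # Prefill: batched tensor-core MoE, W4A16 (bf16) or W4A8 (int8).
    _set_batched_moe(model, True, moe_activation_dtype=moe_activation_dtype)
    export_prefill(model)   # placeholder for the actual prefill export

    # Decode: always the bf16 per-token path, regardless of the flag
    # (assumption: the batched prefill kernel is simply left disabled here).
    _set_batched_moe(model, False)
    export_decode(model)    # placeholder for the actual decode export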
examples/models/qwen3_5_moe/model.py

Lines changed: 2 additions & 2 deletions
@@ -479,7 +479,7 @@ def __init__(self, config):
         self.hidden_size = config.hidden_size
         self.group_size = 32
         self.use_batched_moe = False
-        self.activation_dtype = "bf16"
+        self.moe_activation_dtype = "bf16"
 
         self.w1_weight = nn.Parameter(
             torch.empty(
@@ -498,7 +498,7 @@ def __init__(self, config):
 
     def forward(self, x, expert_weights, expert_indices, top_k):
         if self.use_batched_moe:
-            if self.activation_dtype == "int8":
+            if self.moe_activation_dtype == "int8":
                 return torch.ops.triton.fused_moe_batched_gemm_int8(
                     x,
                     self.w1,
