
Commit 21c2f4b

Update on "Add Triton INT4 dense kernels with dequant prefill path for Qwen3.5 MoE"
Add three new Triton kernels for dense W4A16 linear projections that replace tinygemm's tiled INT4 format with simple [N, K//2] packed weights (same format as MoE experts):

- int4_matmul: fused dequant + tl.dot GEMM for medium M (prefill crossover)
- int4_matvec: bandwidth-optimized vec-mat for M=1 decode
- dequant_w4_to_bf16: weight dequant for large-M prefill via Inductor mm

W4DequantLinear wraps these with dual decode/prefill dispatch:

- Decode (M=1): int4_matvec (73 tok/s, ~35% slower than tinygemm)
- Prefill (M>1): dequant + F.linear via Inductor (3400 tok/s at 3K tokens, +67% over the tinygemm baseline)

Single 18GB weight blob (no duplication). The decode perf regression is a known trade-off for the uniform weight format; to be revisited with a CUDA C++ matvec kernel.

Also adds INT8 dynamic-activation MoE tests and comprehensive correctness tests (48 tests, all passing at rtol=0.01).

Co-authored-by: Claude <noreply@anthropic.com>

[ghstack-poisoned]
2 parents 3e518f0 + eb03574 commit 21c2f4b
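For orientation, here is a minimal pure-PyTorch sketch of the decode/prefill dispatch described in the commit message. It is not the code added by this commit: the nibble order of the [N, K//2] packing, the scale/zero shapes, the (q - zero) * scale convention, and the reference helpers standing in for the int4_matvec / dequant_w4_to_bf16 Triton kernels are all assumptions.

import torch
import torch.nn.functional as F


def _dequant_w4_to_bf16_ref(packed, scales, zeros, group_size=32):
    """Pure-PyTorch stand-in for the dequant_w4_to_bf16 kernel.

    Assumes packed is [N, K//2] uint8 with two unsigned INT4 values per byte
    and per-group scales/zeros of shape [N, K // group_size].
    """
    lo = (packed & 0x0F).to(torch.int32)            # even K positions (assumed order)
    hi = (packed >> 4).to(torch.int32)              # odd K positions (assumed order)
    q = torch.stack((lo, hi), dim=-1).flatten(-2)   # interleave back to [N, K]
    n, k = q.shape
    q = q.view(n, k // group_size, group_size).to(torch.bfloat16)
    s = scales.unsqueeze(-1)
    z = zeros.unsqueeze(-1)
    return ((q - z) * s).reshape(n, k)


def _int4_matvec_ref(x, packed, scales, zeros, group_size=32):
    """Stand-in for the int4_matvec decode kernel; the real Triton kernel
    dequantizes on the fly and never materializes the BF16 weight."""
    w = _dequant_w4_to_bf16_ref(packed, scales, zeros, group_size)
    return F.linear(x, w)


class W4DequantLinearSketch(torch.nn.Module):
    """Decode/prefill dispatch as described in the commit summary (sketch only)."""

    def __init__(self, packed, scales, zeros, group_size=32):
        super().__init__()
        self.packed = packed          # [N, K//2] uint8, same layout as the MoE experts
        self.scales = scales          # [N, K // group_size] bf16
        self.zeros = zeros            # [N, K // group_size] bf16
        self.group_size = group_size

    def forward(self, x):
        m = x.numel() // x.shape[-1]  # token count
        if m == 1:
            # Decode (M=1): bandwidth-bound vec-mat path (int4_matvec).
            return _int4_matvec_ref(x, self.packed, self.scales, self.zeros,
                                    self.group_size)
        # Prefill (M>1): dequant W4 -> BF16 once, then a dense GEMM that
        # Inductor lowers to a plain mm.
        w = _dequant_w4_to_bf16_ref(self.packed, self.scales, self.zeros,
                                    self.group_size)
        return F.linear(x, w)

The point of the split is that decode is bandwidth-bound, so the weight should stay packed in INT4, while prefill is compute-bound, so a one-time dequant followed by a dense BF16 GEMM wins once M is large enough.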

3 files changed

Lines changed: 14 additions & 11 deletions


.ci/scripts/export_model_artifact.sh

Lines changed: 3 additions & 1 deletion
@@ -418,7 +418,9 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
   TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
     python -m executorch.examples.models.qwen3_5_moe.export \
       --prequantized "$LOCAL_MODEL_DIR" \
-      --output-dir "${OUTPUT_DIR}"
+      --output-dir "${OUTPUT_DIR}" \
+      --dense-prefill dequant \
+      --moe-activation-dtype int8
   echo "::endgroup::"
 
   test -f "${OUTPUT_DIR}/model.pte"

examples/models/qwen3_5_moe/export.py

Lines changed: 9 additions & 8 deletions
@@ -667,12 +667,12 @@ def _apply_turboquant(model, config):
 # ---------------------------------------------------------------------------
 
 
-def _set_batched_moe(model, enabled, moe_moe_moe_activation_dtype="bf16"):
+def _set_batched_moe(model, enabled, moe_activation_dtype="bf16"):
     """Toggle batched tensor-core MoE kernel for all MoE layers."""
     for layer in model.layers:
         if hasattr(layer, "mlp") and hasattr(layer.mlp, "experts"):
             layer.mlp.experts.use_batched_moe = enabled
-            layer.mlp.experts.moe_moe_moe_activation_dtype = moe_moe_moe_activation_dtype
+            layer.mlp.experts.moe_activation_dtype = moe_activation_dtype
 
 
 def export_and_lower(model, config, args):
@@ -916,8 +916,8 @@ def _export_cuda(model, config, args):
     # chunk_gated_delta_rule with CHUNK_SIZE=64) for the full range of sequence
     # lengths. Smaller examples cause AOTI to bake in intermediate buffer sizes
     # that reject longer prompts at runtime.
-    moe_moe_moe_activation_dtype = getattr(args, "moe_moe_moe_activation_dtype", "bf16")
-    _set_batched_moe(model, True, moe_moe_moe_activation_dtype=moe_moe_moe_activation_dtype)
+    moe_activation_dtype = getattr(args, "moe_activation_dtype", "bf16")
+    _set_batched_moe(model, True, moe_activation_dtype=moe_activation_dtype)
     dense_prefill = getattr(args, "dense_prefill", "tinygemm")
     _set_dequant_prefill(model, dense_prefill == "dequant")
     print("Exporting prefill method...")
@@ -1087,14 +1087,15 @@ def main(): # noqa: C901
         "--moe-activation-dtype",
         choices=["bf16", "int8"],
         default="bf16",
-        help="MoE activation dtype for prefill only. Decode always uses bf16. bf16 (default): W4A16 batched GEMM. int8: W4A8 with INT8 tensor cores (~1.5x faster prefill).",
+        help="MoE activation dtype for prefill only. Decode always uses bf16. bf16 (default): W4A16 batched GEMM. int8: W4A8 with INT8 tensor cores.",
     )
     parser.add_argument(
         "--dense-prefill",
         choices=["tinygemm", "dequant"],
         default="tinygemm",
-        help="Dense linear kernel: tinygemm (default W4A16 INT4 kernel) or "
-        "dequant (dequant W4→BF16 + Inductor mm for prefill, int4_matvec for decode).",
+        help="Dense linear prefill kernel. Decode always uses int4_matvec (Triton W4A16 vec-mat). "
+        "tinygemm (default): W4A16 _weight_int4pack_mm. "
+        "dequant: dequant W4→BF16 + cuBLAS GEMM.",
     )
     args = parser.parse_args()
 
@@ -1139,7 +1140,7 @@ def main(): # noqa: C901
             "(dense weights must be W4 quantized)"
         )
 
-    if args.moe_moe_activation_dtype != "bf16" and args.backend != "cuda":
+    if args.moe_activation_dtype != "bf16" and args.backend != "cuda":
         parser.error("--moe-activation-dtype int8 requires --backend cuda")
 
     model, config = load_and_quantize(args)

examples/models/qwen3_5_moe/model.py

Lines changed: 2 additions & 2 deletions
@@ -479,7 +479,7 @@ def __init__(self, config):
         self.hidden_size = config.hidden_size
         self.group_size = 32
         self.use_batched_moe = False
-        self.moe_moe_activation_dtype = "bf16"
+        self.moe_activation_dtype = "bf16"
 
         self.w1_weight = nn.Parameter(
             torch.empty(
@@ -498,7 +498,7 @@ def __init__(self, config):
 
     def forward(self, x, expert_weights, expert_indices, top_k):
         if self.use_batched_moe:
-            if self.moe_moe_activation_dtype == "int8":
+            if self.moe_activation_dtype == "int8":
                 return torch.ops.triton.fused_moe_batched_gemm_int8(
                     x,
                     self.w1,
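The group_size of 32 in the hunk above and the [N, K//2] layout from the commit message imply a simple group-wise packing for the INT4 weights. As a rough illustration only (asymmetric per-group quantization and nibble order are assumptions, not the repo's actual packing routine), packing a float weight into that layout could look like:

import torch


def pack_w4(weight, group_size=32):
    """Quantize an [N, K] float weight to unsigned INT4 and pack two values per
    byte into [N, K//2], with per-group scales/zeros of shape [N, K // group_size].
    Matches the (q - zero) * scale dequant convention used in the earlier sketch.
    """
    n, k = weight.shape
    g = weight.view(n, k // group_size, group_size).float()
    w_min = g.amin(dim=-1)                              # [N, K // group_size]
    w_max = g.amax(dim=-1)
    scales = (w_max - w_min).clamp(min=1e-6) / 15.0
    zeros = -w_min / scales                             # float zero-point
    q = torch.round(g / scales.unsqueeze(-1) + zeros.unsqueeze(-1))
    q = q.clamp(0, 15).to(torch.uint8).reshape(n, k)
    packed = q[:, 0::2] | (q[:, 1::2] << 4)             # low nibble = even K index
    return packed, scales.to(torch.bfloat16), zeros.to(torch.bfloat16)

Dequantizing with the reference helper from the sketch after the commit message should round-trip these weights to within one quantization step per group.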
