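Upgrade Triton MoE tile config and enable EP decode.

Replace the vLLM-derived BF16 heuristic in _get_default_config with per-GPU
tile tables taken from SGLang's tuned configs (E=64, N=1856; B200 and H100),
and route apply_ep_decode through _apply_ep_no_deepep instead of raising
NotImplementedError.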

xuanyuanminzheng · xuanyuanminzheng · commit fb123b0b3c7e · 2026-06-02T17:51:00.000+08:00
diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
@@ -1879,51 +1879,318 @@ def process_loaded_weights(self, layer: nn.Layer, state_dict):
 
     def _get_default_config(self, M: int, E: int) -> dict:
         """
-        Heuristic tile config for BF16 MoE, ported verbatim from vLLM's
-        `get_default_config` (bf16/fp16 non-block_shape branch).
-        See vllm/model_executor/layers/fused_moe/fused_moe.py:1273-1319.
+        GPU-aware heuristic tile config for BF16 MoE.
 
-        M: number of tokens (A.size(0) in vLLM), i.e. pre-expansion token count.
-        E: number of (local) experts.
-        """
-
-        # Tile sizes scale with batch: small batches are memory-bound
-        # (favor tall-K tiles), large batches are compute-bound (favor
-        # large M/N tiles with more warps).
-        if M <= 32:
-            block_m = 16
-        elif M <= 96:
-            block_m = 32
-        elif M <= 512:
-            block_m = 64
-        else:
-            block_m = 128
-
-        block_n = 64 if M <= 64 else 128
-
-        block_k = 64
+        Derived from SGLang's per-device tuned JSON configs for E=64, N=1856:
+          - SM100 (B200): triton_3_5_1/E=64,N=1856,device_name=NVIDIA_B200.json
+          - SM90  (H100): triton_3_5_1/E=64,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
 
-        # Grouping adjacent M-blocks lets them share weight tiles in L2.
-        # Only helps when there are enough M-blocks per expert to group;
-        # with many experts each one sees few tokens so grouping is useless.
-        tokens_per_expert = M // max(E, 1)
-        group_m = 16 if tokens_per_expert > 128 else 1
+        Config selection mirrors SGLang's try_get_optimal_moe_config:
+          pick the entry whose key is closest to M by absolute difference (ties go to the smaller key).
 
-        # Large batches have enough blocks to saturate the GPU, so we
-        # use more warps per block to increase arithmetic intensity.
-        num_warps = 4 if M <= 128 else 8
-
-        num_stages = 4 if M <= 32 else 3
-
-        return {
-            "BLOCK_SIZE_M": block_m,
-            "BLOCK_SIZE_N": block_n,
-            "BLOCK_SIZE_K": block_k,
-            "GROUP_SIZE_M": group_m,
-            "num_warps": num_warps,
-            "num_stages": num_stages,
+        M: number of tokens (pre-expansion token count).
+        E: number of (local) experts; currently unused, the tables below were tuned for E=64.
+        """
+        from fastdeploy.model_executor.utils import get_sm_version
+
+        # sm_version >= 100 -> SM100 (B200) table; any other GPU falls back to the SM90 (H100) table.
+        _SM100_CONFIGS = {
+            1: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 32,
+                "num_warps": 4,
+                "num_stages": 5,
+            },
+            2: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 32,
+                "num_warps": 4,
+                "num_stages": 3,
+            },
+            4: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 64,
+                "num_warps": 4,
+                "num_stages": 4,
+            },
+            8: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 32,
+                "num_warps": 4,
+                "num_stages": 3,
+            },
+            16: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 4,
+                "num_stages": 3,
+            },
+            24: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 128,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 16,
+                "num_warps": 4,
+                "num_stages": 4,
+            },
+            32: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 16,
+                "num_warps": 4,
+                "num_stages": 4,
+            },
+            48: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 4,
+                "num_stages": 4,
+            },
+            64: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 4,
+                "num_stages": 4,
+            },
+            96: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 4,
+                "num_stages": 3,
+            },
+            128: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 4,
+                "num_stages": 3,
+            },
+            256: {
+                "BLOCK_SIZE_M": 32,
+                "BLOCK_SIZE_N": 128,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 4,
+                "num_stages": 5,
+            },
+            512: {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": 256,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 8,
+                "num_stages": 5,
+            },
+            1024: {
+                "BLOCK_SIZE_M": 128,
+                "BLOCK_SIZE_N": 256,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 8,
+                "num_stages": 4,
+            },
+            1536: {
+                "BLOCK_SIZE_M": 256,
+                "BLOCK_SIZE_N": 256,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 8,
+                "num_stages": 3,
+            },
+            2048: {
+                "BLOCK_SIZE_M": 256,
+                "BLOCK_SIZE_N": 256,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 8,
+                "num_stages": 3,
+            },
+            3072: {
+                "BLOCK_SIZE_M": 128,
+                "BLOCK_SIZE_N": 256,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 8,
+                "num_stages": 4,
+            },
+            4096: {
+                "BLOCK_SIZE_M": 256,
+                "BLOCK_SIZE_N": 256,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 8,
+                "num_stages": 3,
+            },
+        }
+        _SM90_CONFIGS = {
+            1: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 32,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 32,
+                "num_warps": 4,
+                "num_stages": 3,
+            },
+            2: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 16,
+                "num_warps": 4,
+                "num_stages": 5,
+            },
+            4: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 8,
+                "num_stages": 2,
+            },
+            8: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 32,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 64,
+                "num_warps": 4,
+                "num_stages": 5,
+            },
+            16: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 256,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 4,
+                "num_stages": 5,
+            },
+            24: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 256,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 4,
+                "num_stages": 5,
+            },
+            32: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 32,
+                "BLOCK_SIZE_K": 256,
+                "GROUP_SIZE_M": 16,
+                "num_warps": 4,
+                "num_stages": 5,
+            },
+            48: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 32,
+                "BLOCK_SIZE_K": 256,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 4,
+                "num_stages": 5,
+            },
+            64: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 32,
+                "BLOCK_SIZE_K": 256,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 4,
+                "num_stages": 5,
+            },
+            96: {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 32,
+                "BLOCK_SIZE_K": 256,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 4,
+                "num_stages": 5,
+            },
+            128: {
+                "BLOCK_SIZE_M": 32,
+                "BLOCK_SIZE_N": 32,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 4,
+                "num_stages": 5,
+            },
+            256: {
+                "BLOCK_SIZE_M": 32,
+                "BLOCK_SIZE_N": 128,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 8,
+                "num_stages": 2,
+            },
+            512: {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": 128,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 8,
+                "num_stages": 4,
+            },
+            1024: {
+                "BLOCK_SIZE_M": 128,
+                "BLOCK_SIZE_N": 128,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 8,
+                "num_stages": 5,
+            },
+            1536: {
+                "BLOCK_SIZE_M": 128,
+                "BLOCK_SIZE_N": 256,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 8,
+                "num_stages": 4,
+            },
+            2048: {
+                "BLOCK_SIZE_M": 128,
+                "BLOCK_SIZE_N": 256,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 8,
+                "num_stages": 4,
+            },
+            3072: {
+                "BLOCK_SIZE_M": 128,
+                "BLOCK_SIZE_N": 256,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 8,
+                "num_stages": 4,
+            },
+            4096: {
+                "BLOCK_SIZE_M": 128,
+                "BLOCK_SIZE_N": 256,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 1,
+                "num_warps": 8,
+                "num_stages": 4,
+            },
         }
 
+        configs = _SM100_CONFIGS if get_sm_version() >= 100 else _SM90_CONFIGS
+        best_key = min(configs.keys(), key=lambda x: abs(x - M))
+        return configs[best_key]
+
     def apply_tp(
         self,
         layer: nn.Layer,
@@ -2094,4 +2361,4 @@ def apply_ep_prefill(
     def apply_ep_decode(
         self, layer, x, gate, topk_ids_hookfunc=None, shared_experts=None, fc1_latent_proj=None, fc2_latent_proj=None
     ):
-        raise NotImplementedError("TritonMoEMethod does not support EP decode yet.")
+        return self._apply_ep_no_deepep(layer, x, gate, topk_ids_hookfunc)