ModelTC
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=192,N=4096,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H100_80GB_HBM3.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=192,N=4096,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H100_80GB_HBM3.json
Lines changed: 1 addition & 1 deletion
diff --git a/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=4096,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H100_80GB_HBM3.json b/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=4096,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H100_80GB_HBM3.json
Lines changed: 1 addition & 1 deletion
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_tp.py
Lines changed: 14 additions & 3 deletions
diff --git a/lightllm/common/fused_moe/grouped_fused_moe.py b/lightllm/common/fused_moe/grouped_fused_moe.py
Lines changed: 19 additions & 12 deletions
diff --git a/lightllm/common/fused_moe/grouped_topk.py b/lightllm/common/fused_moe/grouped_topk.py
Lines changed: 2 additions & 3 deletions
diff --git a/lightllm/common/fused_moe/moe_silu_and_mul.py b/lightllm/common/fused_moe/moe_silu_and_mul.py
Lines changed: 46 additions & 106 deletions
diff --git a/lightllm/common/fused_moe/topk_select.py b/lightllm/common/fused_moe/topk_select.py
Lines changed: 0 additions & 2 deletions
@@ -1 +1 @@
-{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 2, "num_stages": 2}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 2, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 1}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 1}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 2, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}}
+{"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 2}, "64": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "16384": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "32768": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}}
@@ -1 +1 @@
-{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 2}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 2, "num_stages": 2}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 8, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}}
+{"1": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "64": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 2}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 2}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 8, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}}
@@ -67,11 +67,22 @@ def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_t
             topk_group=topk_group,
             num_expert_group=num_expert_group,
             scoring_func=self.scoring_func,
-            num_fused_shared_experts=self.num_fused_shared_experts,
         )
         if self.num_fused_shared_experts > 0:
-            topk_ids[:, -1] = self.n_routed_experts - 1
-            topk_weights[:, -1] = 1.0 / self.routed_scaling_factor
+            pad_topk_ids = torch.arange(
+                         start=self.n_routed_experts - self.num_fused_shared_experts, 
+                         end=self.n_routed_experts,
+                         step=1,
+                         dtype=topk_ids.dtype,
+                         device="cuda").view(1, self.num_fused_shared_experts).repeat(topk_ids.shape[0], 1)
+            pad_topk_weights = torch.full((topk_weights.shape[0], self.num_fused_shared_experts),
+                                          fill_value=1.0 / self.routed_scaling_factor,
+                                          device="cuda",
+                                          dtype=topk_weights.dtype)
+            
+            topk_ids = torch.cat([topk_ids, pad_topk_ids], dim=1)
+            topk_weights = torch.cat([topk_weights, pad_topk_weights], dim=1)
+        
         w1, w1_scale = self.w1
         w2, w2_scale = self.w2
         use_fp8_w8a8 = self.quant_method is not None
 
@@ -462,6 +462,7 @@ def grouped_matmul(
     out: torch.Tensor,
     mul_routed_weight: bool,
     use_fp8_w8a8: bool,
+    alloc_tensor_func=torch.empty,
     reused_mblock_infos=None,
     run_config: Optional[dict] = None,
 ):
@@ -525,8 +526,8 @@ def grouped_matmul(
         else:
             _m, _k = token_inputs.shape
             assert _k % block_size_k == 0
-            input_scale = torch.empty((_m, _k // block_size_k), dtype=torch.float32, device=token_inputs.device)
-            qinput_tensor = torch.empty((_m, _k), dtype=expert_weights.dtype, device=token_inputs.device)
+            input_scale = alloc_tensor_func((_m, _k // block_size_k), dtype=torch.float32, device=token_inputs.device)
+            qinput_tensor = alloc_tensor_func((_m, _k), dtype=expert_weights.dtype, device=token_inputs.device)
             per_token_group_quant_fp8(token_inputs, block_size_k, qinput_tensor, input_scale)
             token_inputs, token_input_scale = qinput_tensor, input_scale
 
@@ -611,6 +612,7 @@ def fused_experts_impl(
     w2_scale: Optional[torch.Tensor] = None,
     a1_scale: Optional[torch.Tensor] = None,
     a2_scale: Optional[torch.Tensor] = None,
+    alloc_tensor_func=torch.empty,
     run_config: Optional[dict] = None,
 ):
     # Check constraints.
@@ -625,26 +627,29 @@ def fused_experts_impl(
     CHUNK_SIZE = FFN_MOE_CHUNK_SIZE
     topk_num = topk_ids.shape[1]
     M = min(num_tokens, CHUNK_SIZE)
-
-    cache = torch.empty(M * topk_num * max(N, w2.shape[1]), device=hidden_states.device, dtype=hidden_states.dtype)
-    intermediate_cache1 = cache[: M * topk_num * N].view(M, topk_num, N)
-    intermediate_cache2 = torch.empty((M, topk_num, N // 2), device=hidden_states.device, dtype=hidden_states.dtype)
-    intermediate_cache3 = cache[: M * topk_num * w2.shape[1]].view(M, topk_num, w2.shape[1])
+    
+    intermediate_cache13_shared = alloc_tensor_func((M, topk_num, max(N, w2.shape[1])), device=hidden_states.device, dtype=hidden_states.dtype)
+    intermediate_cache1 = intermediate_cache13_shared.view(-1)[:(M * topk_num * N)].view(M, topk_num, N)
+    intermediate_cache2 = alloc_tensor_func(
+        (M, topk_num, N // 2), device=hidden_states.device, dtype=hidden_states.dtype
+    )
+    intermediate_cache3 = intermediate_cache13_shared.view(-1)[:(M * topk_num * w2.shape[1])].view(M, topk_num, w2.shape[1])
 
     if inplace:
         out_hidden_states = hidden_states
     else:
-        out_hidden_states = torch.empty(hidden_states.shape, device=hidden_states.device, dtype=hidden_states.dtype)
+        out_hidden_states = alloc_tensor_func(
+            hidden_states.shape, device=hidden_states.device, dtype=hidden_states.dtype
+        )
 
     for chunk in range(triton.cdiv(num_tokens, CHUNK_SIZE)):
         begin_chunk_idx, end_chunk_idx = (chunk * CHUNK_SIZE, min((chunk + 1) * CHUNK_SIZE, num_tokens))
         curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx]
         tokens_in_chunk, _ = curr_hidden_states.shape
 
-        if tokens_in_chunk < CHUNK_SIZE and chunk > 0:
-            intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
-            intermediate_cache2 = intermediate_cache2[:tokens_in_chunk]
-            intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
+        intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
+        intermediate_cache2 = intermediate_cache2[:tokens_in_chunk]
+        intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
 
         curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx]
         curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx]
@@ -668,6 +673,7 @@ def fused_experts_impl(
             out=intermediate_cache1.view(-1, N),
             mul_routed_weight=False,
             use_fp8_w8a8=use_fp8_w8a8,
+            alloc_tensor_func=alloc_tensor_func,
             run_config=run_config,
         )
 
@@ -686,6 +692,7 @@ def fused_experts_impl(
             out=intermediate_cache3.view(-1, w2.shape[1]),
             mul_routed_weight=True,
             use_fp8_w8a8=use_fp8_w8a8,
+            alloc_tensor_func=alloc_tensor_func,
             reused_mblock_infos=reused_mblock_infos,
             run_config=run_config,
         )
 
@@ -208,7 +208,6 @@ def triton_grouped_topk(
     topk_group: int = 0,
     scoring_func: str = "softmax",
     group_score_used_topk_num=2,
-    num_fused_shared_experts: int = 0,
 ):
 
     if correction_bias is not None:
@@ -223,8 +222,8 @@ def triton_grouped_topk(
         dtype = torch.float32
 
     scores_buffer = torch.empty((token_num, total_expert_num), dtype=dtype, device="cuda")
-    out_topk_weights = torch.empty((token_num, topk + num_fused_shared_experts), dtype=torch.float32, device="cuda")
-    out_topk_ids = torch.empty((token_num, topk + num_fused_shared_experts), dtype=torch.long, device="cuda")
+    out_topk_weights = torch.empty((token_num, topk), dtype=torch.float32, device="cuda")
+    out_topk_ids = torch.empty((token_num, topk), dtype=torch.long, device="cuda")
 
     assert total_expert_num % num_expert_group == 0
 
 
@@ -4,9 +4,8 @@
 import triton.language as tl
 from .moe_silu_and_mul_config import MoeSiluAndMulKernelConfig
 
-
 @triton.jit
-def _silu_and_mul_kernel(
+def _silu_and_mul_kernel_fast(
     input_ptr,
     output_ptr,
     stride_input_m,
@@ -17,89 +16,48 @@ def _silu_and_mul_kernel(
     size_n,
     BLOCK_M: tl.constexpr,
     BLOCK_N: tl.constexpr,
-):
-    stride_input_m = tl.cast(stride_input_m, dtype=tl.int64)
-    stride_output_m = tl.cast(stride_output_m, dtype=tl.int64)
-
-    tid = tl.program_id(0)
-    input_m_offsets = tid * BLOCK_M + tl.arange(0, BLOCK_M)
-    output_m_offsets = tid * BLOCK_M + tl.arange(0, BLOCK_M)
-
-    pid = tl.program_id(1)
-    input_n_offsets = pid * BLOCK_N + tl.arange(0, BLOCK_N)
-    output_n_offsets = pid * BLOCK_N + tl.arange(0, BLOCK_N)
-
-    up_offsets = input_m_offsets[:, None] * stride_input_m + (input_n_offsets[None, :] + size_n)
-    gate_offsets = input_m_offsets[:, None] * stride_input_m + input_n_offsets[None, :]
-    res_offsets = output_m_offsets[:, None] * stride_output_m + output_n_offsets[None, :]
-
-    up = tl.load(
-        input_ptr + up_offsets,
-        mask=(input_n_offsets < size_n)[None, :] * (input_m_offsets < size_m)[:, None],
-        other=0.0,
-    )
-    gate = tl.load(
-        input_ptr + gate_offsets,
-        mask=(input_n_offsets < size_n)[None, :] * (input_m_offsets < size_m)[:, None],
-        other=0.0,
-    ).to(tl.float32)
-
-    gate = gate / (1 + tl.exp(-gate))
-    gate = gate.to(input_ptr.dtype.element_ty)
-
-    tl.store(
-        output_ptr + res_offsets,
-        up * gate,
-        mask=(output_n_offsets < size_n)[None, :] * (output_m_offsets < size_m)[:, None],
-    )
-
-
-@triton.jit
-def _silu_and_mul_kernel_fast(
-    input_ptr,
-    output_ptr,
-    stride_input_m,
-    stride_input_n,
-    stride_output_m,
-    stride_output_n,
-    size_n,
-    BLOCK_N: tl.constexpr,
     NEED_MASK: tl.constexpr,
 ):
     stride_input_m = tl.cast(stride_input_m, dtype=tl.int64)
     stride_output_m = tl.cast(stride_output_m, dtype=tl.int64)
 
-    cur_batch = tl.program_id(0)
-    pid = tl.program_id(1)
-    n_offsets = pid * BLOCK_N + tl.arange(0, BLOCK_N)
-
-    up_offsets = cur_batch * stride_input_m + (n_offsets[None, :] + size_n)
-    gate_offsets = cur_batch * stride_input_m + n_offsets[None, :]
-    res_offsets = cur_batch * stride_output_m + n_offsets[None, :]
+    m_block_index = tl.program_id(0)
+    n_block_index = tl.program_id(1)
+    n_offsets = n_block_index * BLOCK_N + tl.arange(0, BLOCK_N)
+    m_start_index = m_block_index * BLOCK_M
+    m_end_index = (m_block_index + 1) * BLOCK_M
+    m_end_index = tl.where(m_end_index < size_m, m_end_index, size_m)
     if NEED_MASK:
         mask = n_offsets[None, :] < size_n
+        other = 0.0
     else:
-        mask = True
-
-    up = tl.load(
-        input_ptr + up_offsets,
-        mask=mask,
-        other=0.0,
-    )
-    gate = tl.load(
-        input_ptr + gate_offsets,
-        mask=mask,
-        other=0.0,
-    ).to(tl.float32)
-
-    gate = gate / (1 + tl.exp(-gate))
-    gate = gate.to(input_ptr.dtype.element_ty)
-
-    tl.store(
-        output_ptr + res_offsets,
-        up * gate,
-        mask=mask,
-    )
+        mask = None
+        other = None
+    
+    for m_index in range(m_start_index, m_end_index):
+        gate_offsets = m_index * stride_input_m + n_offsets[None, :]
+        up_offsets = m_index * stride_input_m + (n_offsets[None, :] + size_n)
+        out_offsets = m_index * stride_output_m + n_offsets[None, :]
+        
+        up = tl.load(
+            input_ptr + up_offsets,
+            mask=mask,
+            other=other,
+        )
+        gate = tl.load(
+            input_ptr + gate_offsets,
+            mask=mask,
+            other=other,
+        ).to(tl.float32)
+
+        gate = gate / (1 + tl.exp(-gate))
+        gate = gate.to(input_ptr.dtype.element_ty)
+
+        tl.store(
+            output_ptr + out_offsets,
+            up * gate,
+            mask=mask,
+        )
 
 
 def silu_and_mul_fwd(input: torch.Tensor, output: torch.Tensor, **run_config):
@@ -116,26 +74,6 @@ def silu_and_mul_fwd(input: torch.Tensor, output: torch.Tensor, **run_config):
     if not run_config:
         run_config = MoeSiluAndMulKernelConfig.try_to_get_best_config(M=size_m, N=size_n, out_dtype=str(output.dtype))
 
-    if size_m <= 4096:
-        BLOCK_N = run_config["BLOCK_N"]
-        grid = (
-            size_m,
-            triton.cdiv(size_n, BLOCK_N),
-        )
-        NEED_MASK = size_n % BLOCK_N != 0
-        _silu_and_mul_kernel_fast[grid](
-            input,
-            output,
-            stride_input_m,
-            stride_input_n,
-            stride_output_m,
-            stride_output_n,
-            size_n,
-            BLOCK_N=BLOCK_N,
-            NEED_MASK=NEED_MASK,
-        )
-        return
-
     BLOCK_M = run_config["BLOCK_M"]
     BLOCK_N = run_config["BLOCK_N"]
     num_warps = run_config["num_warps"]
@@ -144,17 +82,19 @@ def silu_and_mul_fwd(input: torch.Tensor, output: torch.Tensor, **run_config):
         triton.cdiv(size_m, BLOCK_M),
         triton.cdiv(size_n, BLOCK_N),
     )
-    _silu_and_mul_kernel[grid](
-        input,
-        output,
-        stride_input_m,
-        stride_input_n,
-        stride_output_m,
-        stride_output_n,
-        size_m,
-        size_n,
+    NEED_MASK = (size_n % BLOCK_N) != 0
+    _silu_and_mul_kernel_fast[grid](
+        input_ptr=input,
+        output_ptr=output,
+        stride_input_m=stride_input_m,
+        stride_input_n=stride_input_n,
+        stride_output_m=stride_output_m,
+        stride_output_n=stride_output_n,
+        size_m=size_m,
+        size_n=size_n,
         BLOCK_M=BLOCK_M,
         BLOCK_N=BLOCK_N,
+        NEED_MASK=NEED_MASK,
         num_warps=num_warps,
     )
     return
@@ -175,7 +175,6 @@ def select_experts(
     num_expert_group: Optional[int] = None,
     scoring_func: str = "softmax",
     custom_routing_function: Optional[Callable] = None,
-    num_fused_shared_experts: int = 0,
 ):
     from lightllm.common.fused_moe.topk_select import fused_topk
     from lightllm.common.fused_moe.grouped_topk import triton_grouped_topk
@@ -211,7 +210,6 @@ def select_experts(
                 topk_group=topk_group,
                 scoring_func=scoring_func,
                 group_score_used_topk_num=group_score_topk_num,
-                num_fused_shared_experts=num_fused_shared_experts,
             )
 
     elif custom_routing_function is None:
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		-{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 2, "num_stages": 2}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 2, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 1}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 1}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 2, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}}
	`1`	+{"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 2}, "64": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "16384": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "32768": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}}
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		-{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 2}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 2, "num_stages": 2}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 8, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}}
	`1`	+{"1": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "64": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 2}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 2}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 8, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}}