PaddlePaddle · From00 · May 21, 2026 · May 20, 2026
diff --git a/paddleformers/transformers/glm_moe_dsa/modeling.py b/paddleformers/transformers/glm_moe_dsa/modeling.py
@@ -122,19 +122,12 @@ def _qkv_sep(matrix_2d, ortho_fn, kv_head_num=None, num_key_value_groups=None):
             )
 
         def _ffn_gate_up(matrix, ortho_fn, intermediate_size=None):
+            """Slice FFN gate_up, orthogonalise gate and up independently."""
             import paddle
 
-            if matrix.ndim == 2:
-                gate, up = paddle.split(matrix, [intermediate_size, intermediate_size], axis=1)
-                return paddle.concat([ortho_fn(gate), ortho_fn(up)], axis=1)
-            elif matrix.ndim == 3:
-                expert_updates = []
-                for ei in range(matrix.shape[0]):
-                    gate, up = paddle.split(matrix[ei], [intermediate_size, intermediate_size], axis=1)
-                    expert_updates.append(paddle.concat([ortho_fn(gate), ortho_fn(up)], axis=1))
-                return paddle.stack(expert_updates, axis=0)
-            else:
-                raise ValueError(f"FFN gate_up split expects 2D or 3D tensor, got shape {matrix.shape}")
+            assert matrix.ndim == 2 or matrix.ndim == 3, "FFN gate_up split expects 2D or 3D tensor"
+            gate, up = paddle.split(matrix, [intermediate_size, intermediate_size], axis=-1)
+            return paddle.concat([ortho_fn(gate), ortho_fn(up)], axis=-1)
 
         def _mla_per_head(matrix_2d_global, ortho_fn, head_num=None, axis=None, head_split_sizes=None):
             import paddle
@@ -144,17 +137,6 @@ def _mla_per_head(matrix_2d_global, ortho_fn, head_num=None, axis=None, head_spl
             processed_groups = [ortho_fn(group) for group in groups]
             return paddle.concat(processed_groups, axis=axis)
 
-        def _moe_experts(matrix_3d_global, ortho_fn):
-            import paddle
-
-            if matrix_3d_global.ndim != 3:
-                raise ValueError(f"MoE expert split expects 3D tensor, got shape {matrix_3d_global.shape}")
-            n_experts = matrix_3d_global.shape[0]
-            return paddle.stack(
-                [ortho_fn(matrix_3d_global[ei]) for ei in range(n_experts)],
-                axis=0,
-            )
-
         slice_config = {}
 
         muon_configs = config.muon_configs
@@ -164,7 +146,6 @@ def _moe_experts(matrix_3d_global, ortho_fn):
         num_key_value_heads = config.num_key_value_heads
         num_key_value_groups = num_attention_head // num_key_value_heads
         use_mla = getattr(config, "q_lora_rank", None) and config.q_lora_rank > 0
-        moe_expert_fusion = getattr(config, "moe_expert_fusion", False)
         use_gated_attn = getattr(config, "use_gated_attn", False)
 
         muon_qkv_update_mode = muon_configs.get("muon_qkv_update_mode", "split_head")
@@ -181,8 +162,6 @@ def _moe_experts(matrix_3d_global, ortho_fn):
 
         ffn_slice_fn = _ffn_gate_up if muon_ffn_split else None
 
-        fused_moe_fn = _moe_experts if moe_expert_fusion else None
-
         mla_slice_fn = None
         if use_mla and muon_qkv_update_mode == "split_head":
             mla_slice_fn = _mla_per_head
@@ -216,10 +195,6 @@ def _add_layer_slice_config(prefix):
                             {"intermediate_size": moe_intermediate_size},
                         )
 
-            if moe_expert_fusion and fused_moe_fn is not None:
-                slice_config[f"{prefix}.mlp.experts.down_proj.weight"] = (fused_moe_fn, {})
-                slice_config[f"{prefix}.mlp.grouped_gemm_experts.weight2"] = (fused_moe_fn, {})
-
             if use_mla and mla_slice_fn is not None:
                 assert (
                     hasattr(config, "qk_nope_head_dim")

diff --git a/paddleformers/transformers/minimax_m2/modeling.py b/paddleformers/transformers/minimax_m2/modeling.py
@@ -93,17 +93,9 @@ def _ffn_gate_up(matrix, ortho_fn, intermediate_size=None):
             """Slice FFN gate_up, orthogonalise gate and up independently."""
             import paddle
 
-            if matrix.ndim == 2:
-                gate, up = paddle.split(matrix, [intermediate_size, intermediate_size], axis=1)
-                return paddle.concat([ortho_fn(gate), ortho_fn(up)], axis=1)
-            elif matrix.ndim == 3:
-                expert_updates = []
-                for ei in range(matrix.shape[0]):
-                    gate, up = paddle.split(matrix[ei], [intermediate_size, intermediate_size], axis=1)
-                    expert_updates.append(paddle.concat([ortho_fn(gate), ortho_fn(up)], axis=1))
-                return paddle.stack(expert_updates, axis=0)
-            else:
-                raise ValueError(f"FFN gate_up split expects 2D or 3D tensor, got shape {matrix.shape}")
+            assert matrix.ndim == 2 or matrix.ndim == 3, "FFN gate_up split expects 2D or 3D tensor"
+            gate, up = paddle.split(matrix, [intermediate_size, intermediate_size], axis=-1)
+            return paddle.concat([ortho_fn(gate), ortho_fn(up)], axis=-1)
 
         def _mla_per_head(matrix_2d_global, ortho_fn, head_num=None, axis=None, head_split_sizes=None):
             """Slice MLA weights by heads."""
@@ -114,18 +106,6 @@ def _mla_per_head(matrix_2d_global, ortho_fn, head_num=None, axis=None, head_spl
             processed_groups = [ortho_fn(group) for group in groups]
             return paddle.concat(processed_groups, axis=axis)
 
-        def _moe_experts(matrix_3d_global, ortho_fn):
-            """Slice MoE weights by experts."""
-            import paddle
-
-            if matrix_3d_global.ndim != 3:
-                raise ValueError(f"MoE expert split expects 3D tensor, got shape {matrix_3d_global.shape}")
-            n_experts = matrix_3d_global.shape[0]
-            return paddle.stack(
-                [ortho_fn(matrix_3d_global[ei]) for ei in range(n_experts)],
-                axis=0,
-            )
-
         slice_config = {}
 
         muon_configs = config.muon_configs
@@ -135,7 +115,6 @@ def _moe_experts(matrix_3d_global, ortho_fn):
         num_key_value_heads = config.num_key_value_heads
         num_key_value_groups = num_attention_head // num_key_value_heads
         use_mla = getattr(config, "q_lora_rank", None) and config.q_lora_rank > 0
-        moe_expert_fusion = getattr(config, "moe_expert_fusion", False)
         use_gated_attn = getattr(config, "use_gated_attn", False)
 
         # Get Muon configuration from muon_configs
@@ -155,9 +134,6 @@ def _moe_experts(matrix_3d_global, ortho_fn):
         # Determine FFN slice strategy
         ffn_slice_fn = _ffn_gate_up if muon_ffn_split else None
 
-        # Determine Fused MoE slice strategy
-        fused_moe_fn = _moe_experts if moe_expert_fusion else None
-
         # Determine MLA slice strategy
         mla_slice_fn = None
         if use_mla and muon_qkv_update_mode == "split_head":
@@ -198,11 +174,6 @@ def _add_layer_slice_config(prefix):
                             {"intermediate_size": moe_intermediate_size},
                         )
 
-            # Fused MoE weights (grouped_gemm)
-            if moe_expert_fusion and fused_moe_fn is not None:
-                slice_config[f"{prefix}.mlp.experts.down_proj.weight"] = (fused_moe_fn, {})
-                slice_config[f"{prefix}.mlp.grouped_gemm_experts.weight2"] = (fused_moe_fn, {})
-
             # MLA weights
             if use_mla and mla_slice_fn is not None:
                 assert (