@@ -82,9 +82,13 @@ def attn_no_op_post_init(decoder_layer):

     @staticmethod
     def mlp_no_op_post_init(decoder_layer):
-        """Replace MLP sublayers with no-op modules."""
+        """Replace MLP sublayers with no-op modules.
+
+        Note: GPT-OSS MoE layers return (hidden_states, router_scores), so we need
+        to return a tuple of 2 values.
+        """
         decoder_layer.post_attention_layernorm = Same()
-        decoder_layer.mlp = MatchingZeros()
+        decoder_layer.mlp = return_tuple_of_size(MatchingZeros, size=2)()

     @staticmethod
     def init_rotary_embedding(model, runtime):
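
For context, here is a minimal, hypothetical sketch of the pieces this hunk relies on. `MatchingZeros` and `return_tuple_of_size` are assumed shapes of the real helpers, not this repo's actual implementations:

# Hypothetical sketch, not the repo's actual helpers.
import torch
import torch.nn as nn

class MatchingZeros(nn.Module):
    # No-op MLP replacement: returns zeros shaped like its input, so the
    # residual stream passes through the decoder layer unchanged.
    def forward(self, hidden_states):
        return torch.zeros_like(hidden_states)

def return_tuple_of_size(module_cls, size):
    # Wrap module_cls so calling the instance yields a size-N tuple, padding
    # the single output with Nones (standing in for e.g. router_scores).
    class TupleWrapper(module_cls):
        def forward(self, *args, **kwargs):
            out = super().forward(*args, **kwargs)
            return (out,) + (None,) * (size - 1)
    return TupleWrapper

# Usage matching the diff: the GPT-OSS MoE decoder layer expects
# (hidden_states, router_scores) back from `mlp`.
mlp = return_tuple_of_size(MatchingZeros, size=2)()
out, router_scores = mlp(torch.randn(1, 4, 8))  # router_scores is None
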
@@ -192,7 +196,17 @@ class GptOss20bExpertRemovalLayerDescriptor(ExpertRemovalLayerDescriptor):
     router_weights: List[str] = field(default_factory=lambda: ["router.weight"])
     router_biases: List[str] = field(default_factory=lambda: ["router.bias"])

-    # Per-expert format (unquantized models have fused tensors without .weight suffix)
+    # Fused format: single tensors containing all experts (test models)
+    fused_expert_weights: List[str] = field(
+        default_factory=lambda: [
+            "experts.gate_up_proj",
+            "experts.gate_up_proj_bias",
+            "experts.down_proj",
+            "experts.down_proj_bias",
+        ]
+    )
+
+    # Not used for fused format, but kept for compatibility
     expert_weights: List[str] = field(default_factory=lambda: ["gate_up_proj", "down_proj"])
     expert_biases: List[str] = field(
         default_factory=lambda: ["gate_up_proj_bias", "down_proj_bias"]
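
The fused layout stores all experts in a single tensor per projection, so expert removal reduces to slicing along the leading experts dimension. A minimal sketch, assuming the first dimension indexes experts; the tensor names, shapes, and helper below are illustrative, not this repo's API:

# Hypothetical sketch: pruning a fused expert tensor of shape [num_experts, ...].
import torch

def keep_experts(fused: torch.Tensor, keep: list[int]) -> torch.Tensor:
    # Select only the expert rows we keep; works for weights and biases alike,
    # since both carry num_experts as their first dimension in the fused layout.
    return fused[torch.tensor(keep, dtype=torch.long)].contiguous()

gate_up = torch.randn(32, 2880, 5760)  # e.g. experts.gate_up_proj (illustrative shape)
pruned = keep_experts(gate_up, keep=[0, 3, 7])
assert pruned.shape[0] == 3
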