
Commit 7d6c203

fix any_model for gpt_oss
Signed-off-by: Daniel Korzekwa <dkorzekwa@nvidia.com>
1 parent 153847c

1 file changed: 17 additions & 3 deletions

modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_20b_model_descriptor.py
@@ -82,9 +82,13 @@ def attn_no_op_post_init(decoder_layer):
 
     @staticmethod
     def mlp_no_op_post_init(decoder_layer):
-        """Replace MLP sublayers with no-op modules."""
+        """Replace MLP sublayers with no-op modules.
+
+        Note: GPT-OSS MoE layers return (hidden_states, router_scores), so we need
+        to return a tuple of 2 values.
+        """
         decoder_layer.post_attention_layernorm = Same()
-        decoder_layer.mlp = MatchingZeros()
+        decoder_layer.mlp = return_tuple_of_size(MatchingZeros, size=2)()
 
     @staticmethod
     def init_rotary_embedding(model, runtime):
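
The point of the change above is the GPT-OSS MoE calling convention: callers unpack two values, (hidden_states, router_scores), so a plain MatchingZeros() returning a single tensor would break. return_tuple_of_size is defined elsewhere in the repo; the snippet below is only a sketch of what such a wrapper could look like, assuming it pads the wrapped module's output with None up to the requested tuple size (the MatchingZeros stand-in is likewise simplified).

# Hypothetical sketch of a return_tuple_of_size-style wrapper (not the repo's
# implementation): wrap a module class so forward() always returns a tuple of
# `size` elements, padding missing slots with None.
import torch
import torch.nn as nn


def return_tuple_of_size(module_cls, size: int):
    class TupleWrapped(module_cls):
        def forward(self, *args, **kwargs):
            out = super().forward(*args, **kwargs)
            if not isinstance(out, tuple):
                out = (out,)
            # Pad with None so callers that unpack `size` values still work.
            return out + (None,) * (size - len(out))

    return TupleWrapped


class MatchingZeros(nn.Module):
    """Toy stand-in: returns zeros shaped like its input."""

    def forward(self, hidden_states):
        return torch.zeros_like(hidden_states)


mlp = return_tuple_of_size(MatchingZeros, size=2)()
hidden, router_scores = mlp(torch.randn(1, 4, 8))  # router_scores is None here
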
@@ -192,7 +196,17 @@ class GptOss20bExpertRemovalLayerDescriptor(ExpertRemovalLayerDescriptor):
     router_weights: List[str] = field(default_factory=lambda: ["router.weight"])
     router_biases: List[str] = field(default_factory=lambda: ["router.bias"])
 
-    # Per-expert format (unquantized models have fused tensors without .weight suffix)
+    # Fused format: single tensors containing all experts (test models)
+    fused_expert_weights: List[str] = field(
+        default_factory=lambda: [
+            "experts.gate_up_proj",
+            "experts.gate_up_proj_bias",
+            "experts.down_proj",
+            "experts.down_proj_bias",
+        ]
+    )
+
+    # Not used for fused format, but kept for compatibility
     expert_weights: List[str] = field(default_factory=lambda: ["gate_up_proj", "down_proj"])
     expert_biases: List[str] = field(
         default_factory=lambda: ["gate_up_proj_bias", "down_proj_bias"]
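
On the "fused format" named in the new comment: the listed tensors each stack all experts along a leading dimension (e.g. experts.gate_up_proj holds every expert's weights in one tensor), rather than one tensor per expert, which is why fused_expert_weights is added alongside the per-expert expert_weights / expert_biases lists. Below is a minimal sketch of slicing such fused tensors during expert removal; the shapes, state_dict layout, and keep_experts are illustrative assumptions, not the actual ExpertRemovalLayerDescriptor logic.

# Illustrative only: pruning experts from fused MoE tensors by indexing the
# leading (expert) dimension. Names, shapes, and keep_experts are assumptions,
# not the descriptor's actual API.
import torch

num_experts, hidden, intermediate = 8, 64, 128  # toy sizes
state_dict = {
    "experts.gate_up_proj": torch.randn(num_experts, hidden, 2 * intermediate),
    "experts.gate_up_proj_bias": torch.randn(num_experts, 2 * intermediate),
    "experts.down_proj": torch.randn(num_experts, intermediate, hidden),
    "experts.down_proj_bias": torch.randn(num_experts, hidden),
}

keep_experts = torch.tensor([0, 2, 5])  # experts that survive removal
pruned = {name: t[keep_experts] for name, t in state_dict.items()}
# Each fused tensor is sliced once along dim 0; with the per-expert format the
# descriptor would instead drop or rename individual gate_up_proj / down_proj tensors.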
