Skip to content

Commit 08cd44f

Browse files
committed
add weight-only supports in _QuantFusedExperts
Signed-off-by: Hung-Yueh Chiang <hungyuehc@nvidia.com>
1 parent e8c0602 commit 08cd44f

1 file changed

Lines changed: 20 additions & 0 deletions

File tree

modelopt/torch/quantization/plugins/huggingface.py

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -900,6 +900,26 @@ def forward(self, *args, **kwargs):
900900
self._down_proj_linear = False
901901
return super().forward(*args, **kwargs)
902902

903+
def iter_weights_for_calibration(self):
    """Generate ``(expert_weight, quantizer)`` tuples over the fused expert weights.

    Fused experts keep one weight quantizer per expert inside ``nn.ModuleList``
    attributes (``gate_up_proj_weight_quantizers`` and
    ``down_proj_weight_quantizers``) rather than under the singular
    ``*_weight_quantizer`` names that the base implementation resolves through
    ``quantizer_attr_names``. This override walks those lists itself and pairs
    the ``idx``-th quantizer with ``weight[idx]``.

    A projection is skipped entirely when either its weight attribute or its
    quantizer-list attribute is missing or ``None``. ``gate_up_proj`` pairs
    are produced before ``down_proj`` pairs.
    """
    # Fused weight attribute -> attribute holding its per-expert quantizers.
    quantizer_lists_by_weight = {
        "gate_up_proj": "gate_up_proj_weight_quantizers",
        "down_proj": "down_proj_weight_quantizers",
    }
    for proj_attr, quantizers_attr in quantizer_lists_by_weight.items():
        fused_weight = getattr(self, proj_attr, None)
        expert_quantizers = getattr(self, quantizers_attr, None)
        if fused_weight is not None and expert_quantizers is not None:
            # Index the fused weight lazily, one expert at a time.
            for expert_idx, quantizer in enumerate(expert_quantizers):
                yield fused_weight[expert_idx], quantizer
922+
903923
def fold_weight(self, keep_attrs: bool = False):
904924
"""Fold per-expert weight quantizers into the fused 3-D weights.
905925

0 commit comments

Comments
 (0)