
Commit 5edd5ac

fix bug
Signed-off-by: Jennifer Chen <jennifchen@nvidia.com>
1 parent: 2a33e19

3 files changed: 10 additions & 10 deletions


modelopt/torch/quantization/config.py

Lines changed: 4 additions & 6 deletions
@@ -214,6 +214,7 @@
         **_default_disabled_quantizer_cfg,
         **_mamba_moe_disabled_quantizer_cfg,
     },
+    "algorithm": "max",
 }
 
 MAMBA_MOE_FP8_CONSERVATIVE_CFG = {
@@ -225,6 +226,7 @@
         "*mixer.in_proj*": {"enable": False},  # Skip mamba linear
         "*mixer.out_proj*": {"enable": False},  # Skip mamba linear
     },
+    "algorithm": "max",
 }
 
 FP8_PER_CHANNEL_PER_TOKEN_CFG = {
@@ -435,6 +437,7 @@
         **_default_disabled_quantizer_cfg,
         **_mamba_moe_disabled_quantizer_cfg,
     },
+    "algorithm": "max",
 }
 MAMBA_MOE_NVFP4_CONSERVATIVE_CFG = {
     "quant_cfg": {
@@ -455,6 +458,7 @@
         "*mixer.in_proj*": {"enable": False},  # Skip mamba linear
         "*mixer.out_proj*": {"enable": False},  # Skip mamba linear
     },
+    "algorithm": "max",
 }
 
 
@@ -1083,12 +1087,6 @@ class MaxCalibConfig(QuantizeAlgorithmConfig):
         description="If True, the amax will be synced across the distributed processes.",
     )
 
-    shared_moe_weight_scale: bool | None = ModeloptField(
-        default=True,
-        title="Whether to share the weight scale across local experts.",
-        description="If True, the weight scale will be shared across local experts.",
-    )
-
 
 class MseCalibConfig(QuantizeAlgorithmConfig):
     """Configuration for per-tensor MSE calibration.

modelopt/torch/quantization/model_calib.py

Lines changed: 1 addition & 3 deletions
@@ -99,7 +99,6 @@ def max_calibrate(
     model: nn.Module,
     forward_loop: ForwardLoop | None = None,
     distributed_sync=True,
-    shared_moe_weight_scale=True,
 ):
     """Calibrate the model using max.
 
@@ -108,7 +107,6 @@
         forward_loop: A callable which takes the model as argument and
             forwards calibration data through the model.
         distributed_sync: Whether to sync amax across distributed processes.
-        shared_moe_weight_scale: Whether to share the weight scale across local experts.
 
     See :class:`MaxCalibConfig <modelopt.torch.quantization.config.MaxCalibConfig>` for
     details on the remaining arguments.
@@ -123,7 +121,7 @@
     # Sync amax across local experts within each rank (for SequentialMLP)
     for name, module in model.named_modules():
         if hasattr(module, "layer_sync_moe_local_experts_amax"):
-            module.layer_sync_moe_local_experts_amax(shared_moe_weight_scale)
+            module.layer_sync_moe_local_experts_amax()
 
     if not distributed_sync:
         return
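At the call site, the simplification looks like the following sketch; `model` and `forward_loop` are stand-ins, and max_calibrate is usually reached through mtq.quantize rather than called directly:

```python
from modelopt.torch.quantization.model_calib import max_calibrate

# shared_moe_weight_scale is gone from the signature: any module exposing
# layer_sync_moe_local_experts_amax() now has its local-expert amax synced
# unconditionally during max calibration.
max_calibrate(model, forward_loop=forward_loop, distributed_sync=True)
```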

modelopt/torch/quantization/plugins/megatron.py

Lines changed: 5 additions & 1 deletion
@@ -594,7 +594,11 @@ def layer_sync_moe_local_experts_amax(self):
         amax_dict = {}
         for expert in self.local_experts:
             for name, module in expert.named_modules():
-                if isinstance(module, TensorQuantizer) and module.amax is not None:
+                if (
+                    isinstance(module, TensorQuantizer)
+                    and module.amax is not None
+                    and "input_quantizer" in name
+                ):
                     stored_amax = amax_dict.get(name)
                     amax_tensor = module.amax.detach().clone()
                     amax_dict[name] = (
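The hunk ends mid-statement at the dict update, but the visible logic is a max-reduction over per-name amax values. A standalone sketch of what the tightened condition does (`collect_input_quantizer_amax` is a hypothetical name, and the TensorQuantizer import path is assumed):

```python
import torch
from modelopt.torch.quantization.nn import TensorQuantizer

def collect_input_quantizer_amax(local_experts):
    """Pool the elementwise max of amax across local experts, but only for
    input quantizers; weight quantizers keep their per-expert scales."""
    amax_dict = {}
    for expert in local_experts:
        for name, module in expert.named_modules():
            if (
                isinstance(module, TensorQuantizer)
                and module.amax is not None
                and "input_quantizer" in name  # the new filter in this commit
            ):
                amax = module.amax.detach().clone()
                prev = amax_dict.get(name)
                amax_dict[name] = amax if prev is None else torch.maximum(prev, amax)
    return amax_dict
```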
