modelopt/torch/quantization

@@ -214,6 +214,7 @@
         **_default_disabled_quantizer_cfg,
         **_mamba_moe_disabled_quantizer_cfg,
     },
+    "algorithm": "max",
 }
 
 MAMBA_MOE_FP8_CONSERVATIVE_CFG = {
@@ -225,6 +226,7 @@
         "*mixer.in_proj*": {"enable": False},  # Skip mamba linear
         "*mixer.out_proj*": {"enable": False},  # Skip mamba linear
     },
+    "algorithm": "max",
 }
 
 FP8_PER_CHANNEL_PER_TOKEN_CFG = {
@@ -435,6 +437,7 @@
         **_default_disabled_quantizer_cfg,
         **_mamba_moe_disabled_quantizer_cfg,
     },
+    "algorithm": "max",
 }
 MAMBA_MOE_NVFP4_CONSERVATIVE_CFG = {
     "quant_cfg": {
@@ -455,6 +458,7 @@
         "*mixer.in_proj*": {"enable": False},  # Skip mamba linear
         "*mixer.out_proj*": {"enable": False},  # Skip mamba linear
     },
+    "algorithm": "max",
 }
 
 
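The four hunks above all make the same change: each Mamba/MoE config now names its calibration algorithm explicitly ("algorithm": "max") instead of relying on an implicit default. Below is a minimal sketch of the resulting config shape, assuming the usual modelopt.torch.quantization.quantize entry point; the quantizer entries are illustrative placeholders, and only the quant_cfg/algorithm structure and the *mixer.*proj* skip patterns come from the diff.

import modelopt.torch.quantization as mtq

# Sketch only, not the repo's literal config: the "quant_cfg"/"algorithm"
# structure and the mixer skips are from the diff; quantizer settings are guesses.
EXAMPLE_MAMBA_MOE_FP8_CFG = {
    "quant_cfg": {
        "*weight_quantizer": {"num_bits": (4, 3), "axis": None},  # FP8 E4M3, illustrative
        "*input_quantizer": {"num_bits": (4, 3), "axis": None},
        "*mixer.in_proj*": {"enable": False},  # Skip mamba linear
        "*mixer.out_proj*": {"enable": False},  # Skip mamba linear
    },
    "algorithm": "max",  # now pinned explicitly in the shipped configs
}

# Typical call site (forward_loop feeds calibration batches through the model):
# model = mtq.quantize(model, EXAMPLE_MAMBA_MOE_FP8_CFG, forward_loop)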
@@ -1083,12 +1087,6 @@ class MaxCalibConfig(QuantizeAlgorithmConfig):
         description="If True, the amax will be synced across the distributed processes.",
     )
 
-    shared_moe_weight_scale: bool | None = ModeloptField(
-        default=True,
-        title="Whether to share the weight scale across local experts.",
-        description="If True, the weight scale will be shared across local experts.",
-    )
-
 
 class MseCalibConfig(QuantizeAlgorithmConfig):
     """Configuration for per-tensor MSE calibration.
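Because shared_moe_weight_scale is removed from MaxCalibConfig, an algorithm dict that still sets it will no longer be accepted. A hedged before/after sketch; the "method" and "distributed_sync" keys are assumed to match the remaining MaxCalibConfig fields, and the quant_cfg entry is a placeholder:

# Before (no longer valid): the field was removed from MaxCalibConfig.
# config["algorithm"] = {"method": "max", "shared_moe_weight_scale": True}

# After: pass only fields that still exist, e.g. distributed_sync.
config = {
    "quant_cfg": {"*weight_quantizer": {"num_bits": 8, "axis": 0}},  # placeholder
    "algorithm": {"method": "max", "distributed_sync": True},
}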
@@ -99,7 +99,6 @@ def max_calibrate(
     model: nn.Module,
     forward_loop: ForwardLoop | None = None,
     distributed_sync=True,
-    shared_moe_weight_scale=True,
 ):
     """Calibrate the model using max.
 
@@ -108,7 +107,6 @@ def max_calibrate(
         forward_loop: A callable which takes the model as argument and
             forwards calibration data through the model.
         distributed_sync: Whether to sync amax across distributed processes.
-        shared_moe_weight_scale: Whether to share the weight scale across local experts.
 
     See :class:`MaxCalibConfig <modelopt.torch.quantization.config.MaxCalibConfig>` for
     details on the remaining arguments.
@@ -123,7 +121,7 @@ def max_calibrate(
     # Sync amax across local experts within each rank (for SequentialMLP)
     for name, module in model.named_modules():
         if hasattr(module, "layer_sync_moe_local_experts_amax"):
-            module.layer_sync_moe_local_experts_amax(shared_moe_weight_scale)
+            module.layer_sync_moe_local_experts_amax()
 
     if not distributed_sync:
         return
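The hunks above drop the parameter end to end: max_calibrate no longer takes shared_moe_weight_scale, and the per-rank expert sync is invoked without arguments. A hedged usage sketch of the updated signature; model and calib_dataloader are hypothetical stand-ins for the caller's objects:

import torch

def forward_loop(model):
    # Feed a few calibration batches through the model (data source is hypothetical).
    with torch.no_grad():
        for batch in calib_dataloader:
            model(batch)

# model: an nn.Module with quantizers already inserted (assumed to exist).
# Expert amax sharing now happens unconditionally inside max_calibrate.
max_calibrate(model, forward_loop=forward_loop, distributed_sync=True)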
@@ -594,7 +594,11 @@ def layer_sync_moe_local_experts_amax(self):
         amax_dict = {}
         for expert in self.local_experts:
             for name, module in expert.named_modules():
-                if isinstance(module, TensorQuantizer) and module.amax is not None:
+                if (
+                    isinstance(module, TensorQuantizer)
+                    and module.amax is not None
+                    and "input_quantizer" in name
+                ):
                     stored_amax = amax_dict.get(name)
                     amax_tensor = module.amax.detach().clone()
                     amax_dict[name] = (
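The new "input_quantizer" guard narrows the sync so that only activation-side amax values are shared across local experts; weight quantizers keep their per-expert scales, which is why the shared_moe_weight_scale switch could be deleted. The diff shows only the collection half of the method, so the following is a hedged reconstruction of the full pattern; the write-back loop and the torch.maximum reduction are assumptions consistent with the truncated amax_dict[name] = ( line:

import torch

from modelopt.torch.quantization.nn import TensorQuantizer  # import path assumed

def layer_sync_moe_local_experts_amax(self):
    """Share input-quantizer amax across local experts (hedged reconstruction)."""
    # Collect the running elementwise max per quantizer name (shown in the diff).
    amax_dict = {}
    for expert in self.local_experts:
        for name, module in expert.named_modules():
            if (
                isinstance(module, TensorQuantizer)
                and module.amax is not None
                and "input_quantizer" in name  # weight quantizers stay per-expert
            ):
                stored_amax = amax_dict.get(name)
                amax_tensor = module.amax.detach().clone()
                amax_dict[name] = (
                    amax_tensor
                    if stored_amax is None
                    else torch.maximum(stored_amax, amax_tensor)
                )

    # Assumed write-back: give every expert's input quantizer the shared amax.
    for expert in self.local_experts:
        for name, module in expert.named_modules():
            if name in amax_dict:
                module.amax = amax_dict[name].clone()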