@@ -214,7 +214,7 @@
214 | 214 | **_default_disabled_quantizer_cfg, |
215 | 215 | **_mamba_moe_disabled_quantizer_cfg, |
216 | 216 | }, |
217 | | - "algorithm": "max", |
| 217 | + "algorithm": {"method": "max", "shared_moe_weight_scale": False}, |
218 | 218 | } |
219 | 219 |
220 | 220 | MAMBA_MOE_FP8_CONSERVATIVE_CFG = { |
@@ -226,7 +226,7 @@
226 | 226 | "*mixer.in_proj*": {"enable": False}, # Skip mamba linear |
227 | 227 | "*mixer.out_proj*": {"enable": False}, # Skip mamba linear |
228 | 228 | }, |
229 | | - "algorithm": "max", |
| 229 | + "algorithm": {"method": "max", "shared_moe_weight_scale": False}, |
230 | 230 | } |
231 | 231 |
232 | 232 | FP8_PER_CHANNEL_PER_TOKEN_CFG = { |
@@ -437,7 +437,7 @@
437 | 437 | **_default_disabled_quantizer_cfg, |
438 | 438 | **_mamba_moe_disabled_quantizer_cfg, |
439 | 439 | }, |
440 | | - "algorithm": "max", |
| 440 | + "algorithm": {"method": "max", "shared_moe_weight_scale": False}, |
441 | 441 | } |
442 | 442 | MAMBA_MOE_NVFP4_CONSERVATIVE_CFG = { |
443 | 443 | "quant_cfg": { |
@@ -458,7 +458,7 @@
458 | 458 | "*mixer.in_proj*": {"enable": False}, # Skip mamba linear |
459 | 459 | "*mixer.out_proj*": {"enable": False}, # Skip mamba linear |
460 | 460 | }, |
461 | | - "algorithm": "max", |
| 461 | + "algorithm": {"method": "max", "shared_moe_weight_scale": False}, |
462 | 462 | } |
463 | 463 |
464 | 464 |
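The same one-line change lands in all four configs: `"algorithm"` goes from the bare string `"max"` to a dict that still selects max calibration but opts out of sharing one weight scale across the local MoE experts. A minimal sketch of a custom config using the new form (the FP8 `num_bits=(4, 3)` quantizer patterns and the `mtq.quantize` entry point follow the usual ModelOpt conventions; treat the exact names as assumptions, not part of this diff):

```python
import modelopt.torch.quantization as mtq

# Sketch of a custom FP8 config in the style of the configs above.
MY_MAMBA_MOE_FP8_CFG = {
    "quant_cfg": {
        "*weight_quantizer": {"num_bits": (4, 3), "axis": None},
        "*input_quantizer": {"num_bits": (4, 3), "axis": None},
        "*mixer.in_proj*": {"enable": False},   # Skip mamba linear
        "*mixer.out_proj*": {"enable": False},  # Skip mamba linear
    },
    # Bare "max" keeps the default (shared_moe_weight_scale=True); the dict
    # form selects the same calibrator but keeps a separate weight scale
    # per local expert.
    "algorithm": {"method": "max", "shared_moe_weight_scale": False},
}

# model = mtq.quantize(model, MY_MAMBA_MOE_FP8_CFG, forward_loop=calib_fn)
```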
@@ -1087,6 +1087,12 @@ class MaxCalibConfig(QuantizeAlgorithmConfig): |
1087 | 1087 | description="If True, the amax will be synced across the distributed processes.", |
1088 | 1088 | ) |
1089 | 1089 |
| 1090 | + shared_moe_weight_scale: bool | None = ModeloptField( |
| 1091 | + default=True, |
| 1092 | + title="Whether to share the weight scale across local experts.", |
| 1093 | + description="If True, the weight scale will be shared across local experts.", |
| 1094 | + ) |
| 1095 | + |
1090 | 1096 |
1091 | 1097 | class MseCalibConfig(QuantizeAlgorithmConfig): |
1092 | 1098 | """Configuration for per-tensor MSE calibration. |
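On the config-class side, the new `shared_moe_weight_scale` field defaults to `True`, so existing string-form `"algorithm": "max"` configs keep their behavior and only the dict form above opts out. A hedged sketch of constructing the algorithm config directly (the import path and the no-argument constructor are assumptions based on the surrounding file, not shown in this diff):

```python
from modelopt.torch.quantization.config import MaxCalibConfig

# Default matches the previous behavior: one weight scale shared across
# the local experts of an MoE layer.
assert MaxCalibConfig().shared_moe_weight_scale is True

# Equivalent to the {"method": "max", "shared_moe_weight_scale": False}
# dicts used in the conservative configs above.
per_expert = MaxCalibConfig(shared_moe_weight_scale=False)
print(per_expert.shared_moe_weight_scale)  # False
```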