@@ -104,6 +104,7 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None:
104104 "int8_sq" : mtq .INT8_SMOOTHQUANT_CFG ,
105105 "int8_wo" : mtq .INT8_WEIGHT_ONLY_CFG ,
106106 "fp8" : mtq .FP8_DEFAULT_CFG ,
107+ "fp8_w8a8" : mtq .FP8_DEFAULT_CFG ,
107108 "int4_awq" : mtq .INT4_AWQ_CFG ,
108109 "w4a8_awq" : mtq .W4A8_AWQ_BETA_CFG ,
109110 "nvfp4" : mtq .NVFP4_DEFAULT_CFG ,
@@ -350,6 +351,7 @@ def auto_quantize(
350351 qformat
351352 in [
352353 "fp8" ,
354+ "fp8_w8a8" ,
353355 "int8_sq" ,
354356 "int8_wo" ,
355357 "int4_awq" ,
@@ -396,9 +398,15 @@ def forward_step(model, batch):
396398 if "parent_class" not in entry and entry ["quantizer_name" ] != "*lm_head*"
397399 ]
398400 enable_linear_attn_big3 = os .environ .get ("MODELOPT_AUTOQ_ENABLE_LINEAR_ATTN_BIG3" ) == "1"
401+ enable_linear_attn_all = os .environ .get ("MODELOPT_AUTOQ_ENABLE_LINEAR_ATTN_ALL" ) == "1"
399402 enable_shared_expert = os .environ .get ("MODELOPT_AUTOQ_ENABLE_SHARED_EXPERT" ) == "1"
403+ if enable_linear_attn_all :
404+ enable_linear_attn_big3 = True
400405 autoq_extra_disabled = [
401406 "*shared_expert_gate*" ,
407+ # Keep the GDN in_proj_a/in_proj_b projections in BF16 even when
408+ # MODELOPT_AUTOQ_ENABLE_LINEAR_ATTN_ALL=1. Prior healthy NVFP4 controls excluded
409+ # these small projections, while low-end full-search checkpoints quantized them.
402410 "*linear_attn.in_proj_a*" ,
403411 "*linear_attn.in_proj_b*" ,
404412 ]
@@ -437,6 +445,10 @@ def forward_step(model, batch):
437445 disabled_layers = disabled_layers ,
438446 method = auto_quantize_method ,
439447 checkpoint = auto_quantize_checkpoint ,
448+ cost_model = args .auto_quantize_cost_model ,
449+ active_moe_expert_ratio = args .auto_quantize_active_moe_expert_ratio ,
450+ cost_lower_bound = args .auto_quantize_cost_lower_bound ,
451+ cost_objective = args .auto_quantize_cost_objective ,
440452 )
441453
442454 calibrate_loop = create_forward_loop (dataloader = calib_dataloader )
@@ -1454,6 +1466,48 @@ def parse_args() -> argparse.Namespace:
14541466 "(sensitivity scores, costs, etc.). Only used when auto_quantize_bits is specified."
14551467 ),
14561468 )
1469+ parser .add_argument (
1470+ "--auto_quantize_cost_model" ,
1471+ type = str ,
1472+ default = "weight" ,
1473+ choices = ["weight" , "active_moe" ],
1474+ help = (
1475+ "Cost model for auto_quantize effective-bits accounting. 'weight' counts all "
1476+ "quantizable weights equally. 'active_moe' scales routed MoE expert weights by "
1477+ "--auto_quantize_active_moe_expert_ratio, or infers top_k/num_experts from model config."
1478+ ),
1479+ )
1480+ parser .add_argument (
1481+ "--auto_quantize_active_moe_expert_ratio" ,
1482+ type = float ,
1483+ default = None ,
1484+ help = (
1485+ "Routed MoE expert active ratio for --auto_quantize_cost_model active_moe. "
1486+ "For top-k MoE this is top_k / num_experts. If omitted, common model config "
1487+ "fields such as num_experts_per_tok and num_experts are used when available."
1488+ ),
1489+ )
1490+ parser .add_argument (
1491+ "--auto_quantize_cost_lower_bound" ,
1492+ type = float ,
1493+ default = None ,
1494+ help = (
1495+ "Optional lower bound, as a fraction of the requested effective-bits budget, "
1496+ "for the auto_quantize LP. Active-MoE cost mode uses a best-effort lower bound "
1497+ "by default when this is omitted."
1498+ ),
1499+ )
1500+ parser .add_argument (
1501+ "--auto_quantize_cost_objective" ,
1502+ type = str ,
1503+ default = "sensitivity" ,
1504+ choices = ["sensitivity" , "active_moe" ],
1505+ help = (
1506+ "Objective for auto_quantize LP. 'sensitivity' minimizes quantization sensitivity. "
1507+ "'active_moe' minimizes active routed-MoE cost while the cost model constraint "
1508+ "still controls the requested budget."
1509+ ),
1510+ )
14571511 parser .add_argument (
14581512 "--moe_calib_experts_ratio" ,
14591513 type = float ,
@@ -1475,6 +1529,23 @@ def parse_args() -> argparse.Namespace:
14751529 args = parser .parse_args ()
14761530 if args .moe_calib_experts_ratio is not None and not (0.0 < args .moe_calib_experts_ratio <= 1.0 ):
14771531 parser .error ("--moe_calib_experts_ratio must be in the range (0.0, 1.0]." )
1532+ if args .auto_quantize_active_moe_expert_ratio is not None and not (
1533+ 0.0 < args .auto_quantize_active_moe_expert_ratio <= 1.0
1534+ ):
1535+ parser .error ("--auto_quantize_active_moe_expert_ratio must be in the range (0.0, 1.0]." )
1536+ if (
1537+ args .auto_quantize_cost_model == "weight"
1538+ and args .auto_quantize_cost_objective != "active_moe"
1539+ and args .auto_quantize_active_moe_expert_ratio is not None
1540+ ):
1541+ parser .error (
1542+ "--auto_quantize_active_moe_expert_ratio requires "
1543+ "--auto_quantize_cost_model active_moe or --auto_quantize_cost_objective active_moe."
1544+ )
1545+ if args .auto_quantize_cost_lower_bound is not None and not (
1546+ 0.0 < args .auto_quantize_cost_lower_bound <= 1.0
1547+ ):
1548+ parser .error ("--auto_quantize_cost_lower_bound must be in the range (0.0, 1.0]." )
14781549
14791550 if args .specdec_offline_dataset is not None and args .sparsity_fmt != "dense" :
14801551 parser .error ("--specdec_offline_dataset is only supported with --sparsity_fmt dense (PTQ)." )
0 commit comments