NVIDIA
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
Lines changed: 71 additions & 0 deletions
diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py
Lines changed: 91 additions & 0 deletions
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
Lines changed: 10 additions & 0 deletions
diff --git a/modelopt/torch/opt/searcher.py b/modelopt/torch/opt/searcher.py
Lines changed: 18 additions & 1 deletion
@@ -104,6 +104,7 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None:
     "int8_sq": mtq.INT8_SMOOTHQUANT_CFG,
     "int8_wo": mtq.INT8_WEIGHT_ONLY_CFG,
     "fp8": mtq.FP8_DEFAULT_CFG,
+    "fp8_w8a8": mtq.FP8_DEFAULT_CFG,
     "int4_awq": mtq.INT4_AWQ_CFG,
     "w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
     "nvfp4": mtq.NVFP4_DEFAULT_CFG,
@@ -350,6 +351,7 @@ def auto_quantize(
         qformat
         in [
             "fp8",
+            "fp8_w8a8",
             "int8_sq",
             "int8_wo",
             "int4_awq",
@@ -396,9 +398,15 @@ def forward_step(model, batch):
         if "parent_class" not in entry and entry["quantizer_name"] != "*lm_head*"
     ]
     enable_linear_attn_big3 = os.environ.get("MODELOPT_AUTOQ_ENABLE_LINEAR_ATTN_BIG3") == "1"
+    enable_linear_attn_all = os.environ.get("MODELOPT_AUTOQ_ENABLE_LINEAR_ATTN_ALL") == "1"
     enable_shared_expert = os.environ.get("MODELOPT_AUTOQ_ENABLE_SHARED_EXPERT") == "1"
+    if enable_linear_attn_all:
+        enable_linear_attn_big3 = True
     autoq_extra_disabled = [
         "*shared_expert_gate*",
+        # Keep the GDN a/b projections in BF16 even for "all linear_attn"
+        # searches. Prior healthy NVFP4 controls excluded these small
+        # projections, while low-end full-search checkpoints quantized them.
         "*linear_attn.in_proj_a*",
         "*linear_attn.in_proj_b*",
     ]
@@ -437,6 +445,10 @@ def forward_step(model, batch):
         disabled_layers=disabled_layers,
         method=auto_quantize_method,
         checkpoint=auto_quantize_checkpoint,
+        cost_model=args.auto_quantize_cost_model,
+        active_moe_expert_ratio=args.auto_quantize_active_moe_expert_ratio,
+        cost_lower_bound=args.auto_quantize_cost_lower_bound,
+        cost_objective=args.auto_quantize_cost_objective,
     )
 
     calibrate_loop = create_forward_loop(dataloader=calib_dataloader)
@@ -1454,6 +1466,48 @@ def parse_args() -> argparse.Namespace:
             "(sensitivity scores, costs, etc.). Only used when auto_quantize_bits is specified."
         ),
     )
+    parser.add_argument(
+        "--auto_quantize_cost_model",
+        type=str,
+        default="weight",
+        choices=["weight", "active_moe"],
+        help=(
+            "Cost model for auto_quantize effective-bits accounting. 'weight' counts all "
+            "quantizable weights equally. 'active_moe' scales routed MoE expert weights by "
+            "--auto_quantize_active_moe_expert_ratio, or infers top_k/num_experts from model config."
+        ),
+    )
+    parser.add_argument(
+        "--auto_quantize_active_moe_expert_ratio",
+        type=float,
+        default=None,
+        help=(
+            "Routed MoE expert active ratio for --auto_quantize_cost_model active_moe. "
+            "For top-k MoE this is top_k / num_experts. If omitted, common model config "
+            "fields such as num_experts_per_tok and num_experts are used when available."
+        ),
+    )
+    parser.add_argument(
+        "--auto_quantize_cost_lower_bound",
+        type=float,
+        default=None,
+        help=(
+            "Optional lower bound, as a fraction of the requested effective-bits budget, "
+            "for the auto_quantize LP. Active-MoE cost mode uses a best-effort lower bound "
+            "by default when this is omitted."
+        ),
+    )
+    parser.add_argument(
+        "--auto_quantize_cost_objective",
+        type=str,
+        default="sensitivity",
+        choices=["sensitivity", "active_moe"],
+        help=(
+            "Objective for auto_quantize LP. 'sensitivity' minimizes quantization sensitivity. "
+            "'active_moe' minimizes active routed-MoE cost while the cost model constraint "
+            "still controls the requested budget."
+        ),
+    )
     parser.add_argument(
         "--moe_calib_experts_ratio",
         type=float,
@@ -1475,6 +1529,23 @@ def parse_args() -> argparse.Namespace:
     args = parser.parse_args()
     if args.moe_calib_experts_ratio is not None and not (0.0 < args.moe_calib_experts_ratio <= 1.0):
         parser.error("--moe_calib_experts_ratio must be in the range (0.0, 1.0].")
+    if args.auto_quantize_active_moe_expert_ratio is not None and not (
+        0.0 < args.auto_quantize_active_moe_expert_ratio <= 1.0
+    ):
+        parser.error("--auto_quantize_active_moe_expert_ratio must be in the range (0.0, 1.0].")
+    if (
+        args.auto_quantize_cost_model == "weight"
+        and args.auto_quantize_cost_objective != "active_moe"
+        and args.auto_quantize_active_moe_expert_ratio is not None
+    ):
+        parser.error(
+            "--auto_quantize_active_moe_expert_ratio requires "
+            "--auto_quantize_cost_model active_moe or --auto_quantize_cost_objective active_moe."
+        )
+    if args.auto_quantize_cost_lower_bound is not None and not (
+        0.0 < args.auto_quantize_cost_lower_bound <= 1.0
+    ):
+        parser.error("--auto_quantize_cost_lower_bound must be in the range (0.0, 1.0].")
 
     if args.specdec_offline_dataset is not None and args.sparsity_fmt != "dense":
         parser.error("--specdec_offline_dataset is only supported with --sparsity_fmt dense (PTQ).")
 
@@ -1173,6 +1173,97 @@ def set_expert_quantizer_amax(
 _GATE_UP_PAIRS = [("gate_proj", "up_proj"), ("w1", "w3")]
 
 
+# Attribute-name pairs of linear-attention projections that are looked up on
+# the same parent module and whose quantizer scales are synced together
+# before export (qkv + z, and b + a).
+_LINEAR_ATTN_FUSED_PAIRS = [
+    ("in_proj_qkv", "in_proj_z"),
+    ("in_proj_b", "in_proj_a"),
+]
+
+
+def _tensor_values_equal(left: torch.Tensor | None, right: torch.Tensor | None) -> bool:
+    """Return whether two optional tensors hold the same values.
+
+    Two ``None`` arguments compare equal; ``None`` against a tensor does not.
+    When either tensor is a meta tensor the result is ``False``. Otherwise
+    the result is whatever ``torch.equal(left, right)`` reports.
+    """
+    if left is not None and right is not None:
+        any_meta = left.is_meta or right.is_meta
+        return False if any_meta else torch.equal(left, right)
+    # At least one side is missing: equal only when both are missing.
+    return left is right
+
+
+def _safe_quantizer_amax(quantizer) -> torch.Tensor | None:
+    """Read ``quantizer.amax`` without letting an ``AssertionError`` escape.
+
+    Returns the ``amax`` attribute when it can be read, and ``None`` when the
+    attribute is absent or when reading it raises ``AssertionError``.
+    """
+    try:
+        amax = getattr(quantizer, "amax", None)
+    except AssertionError:
+        amax = None
+    return amax
+
+
+def _linear_fusion_scales_match(left: nn.Module, right: nn.Module) -> bool:
+    """Report whether two linear modules already agree on their quantizer scales.
+
+    Checks, in order:
+
+    1. If both modules have an enabled ``input_quantizer`` and the two amax
+       values differ, the answer is ``False``.
+    2. If either module has no ``weight_quantizer``, the answer is ``True``.
+    3. If both weight quantizers are ``SequentialQuantizer``s, the amax of
+       their last stages is compared when both are non-empty and both last
+       stages are enabled; otherwise the answer is ``True``.
+    4. If both weight quantizers expose ``global_amax``, those are compared.
+    5. If both weight quantizers are enabled, their amax values are compared.
+    6. Anything else counts as matching.
+    """
+
+    def _enabled(quantizer) -> bool:
+        # A quantizer without an ``is_enabled`` attribute is treated as disabled.
+        return bool(getattr(quantizer, "is_enabled", False))
+
+    def _amax_equal(first, second) -> bool:
+        return _tensor_values_equal(_safe_quantizer_amax(first), _safe_quantizer_amax(second))
+
+    in_left = getattr(left, "input_quantizer", None)
+    in_right = getattr(right, "input_quantizer", None)
+    if in_left is not None and in_right is not None:
+        if _enabled(in_left) and _enabled(in_right) and not _amax_equal(in_left, in_right):
+            return False
+
+    w_left = getattr(left, "weight_quantizer", None)
+    w_right = getattr(right, "weight_quantizer", None)
+    if w_left is None or w_right is None:
+        return True
+
+    if isinstance(w_left, SequentialQuantizer) and isinstance(w_right, SequentialQuantizer):
+        if not (len(w_left) > 0 and len(w_right) > 0):
+            return True
+        if _enabled(w_left[-1]) and _enabled(w_right[-1]):
+            return _amax_equal(w_left[-1], w_right[-1])
+        return True
+
+    if hasattr(w_left, "global_amax") and hasattr(w_right, "global_amax"):
+        return _tensor_values_equal(w_left.global_amax, w_right.global_amax)
+
+    if _enabled(w_left) and _enabled(w_right):
+        return _amax_equal(w_left, w_right)
+    return True
+
+
+def sync_linear_attn_fused_projection_amax(model: nn.Module) -> int:
+    """Align quantizer amaxes of GDN projection pairs that serving engines fuse.
+
+    Qwen3.5/Qwen3-Next GDN exports store ``in_proj_qkv`` and ``in_proj_z`` as
+    separate HF tensors, while vLLM fuses them into ``in_proj_qkvz`` when
+    loading; ``in_proj_b`` and ``in_proj_a`` may similarly be fused into
+    ``in_proj_ba``. Putting each pair on a shared quantizer scale before
+    export means the fused loader does not have to reconcile differing
+    scalar/global scales at serving time.
+
+    Every submodule of ``model`` is inspected. A pair is processed only when
+    both projections exist on the submodule and report the same quantization
+    format, and that format is neither ``None`` nor ``QUANTIZATION_NONE``.
+
+    Returns:
+        Number of processed projection pairs whose scales did not already
+        match before ``preprocess_linear_fusion`` was applied.
+    """
+    num_changed = 0
+    for parent in model.modules():
+        for first_attr, second_attr in _LINEAR_ATTN_FUSED_PAIRS:
+            first = getattr(parent, first_attr, None)
+            second = getattr(parent, second_attr, None)
+            if first is None or second is None:
+                continue
+
+            first_fmt = get_quantization_format(first)
+            second_fmt = get_quantization_format(second)
+            if first_fmt != second_fmt or first_fmt is None or first_fmt == QUANTIZATION_NONE:
+                continue
+
+            # Record the state before fusion preprocessing touches the quantizers.
+            already_matched = _linear_fusion_scales_match(first, second)
+            preprocess_linear_fusion([first, second])
+            num_changed += int(not already_matched)
+    return num_changed
+
+
 def sync_moe_gate_up_amax(model: nn.Module) -> int:
     """Take element-wise max of gate and up weight quantizer amaxes per expert.
 
 
@@ -73,6 +73,7 @@
     is_moe,
     is_quantlinear,
     set_expert_quantizer_amax,
+    sync_linear_attn_fused_projection_amax,
     sync_moe_gate_up_amax,
 )
 from .model_config import (
@@ -810,6 +811,15 @@ def _export_transformers_checkpoint(
             f"Taking element-wise max of amaxes for serving-engine fusion."
         )
 
+    # Safety net for Qwen3.5/Qwen3-Next GDN projections. These remain separate
+    # HF tensors, but vLLM fuses qkv+z and b+a at load time.
+    synced = sync_linear_attn_fused_projection_amax(model)
+    if synced:
+        warnings.warn(
+            f"Synced quantizer amax/global_amax for {synced} linear-attention "
+            f"projection pair(s) that are fused by serving engines."
+        )
+
     # Process all quantized modules and export weights
     _process_quantized_modules(model, dtype, is_modelopt_qlora)
 
 
@@ -57,6 +57,16 @@
 __all__ = ["BaseSearcher"]
 
 
+def _get_optional_env_float(name: str) -> float | None:
+    """Read an optional, strictly positive float from an environment variable.
+
+    Args:
+        name: Name of the environment variable to read.
+
+    Returns:
+        The parsed value, or ``None`` when the variable is unset, empty, or
+        contains only whitespace.
+
+    Raises:
+        ValueError: If the value is not a number, is NaN, or is not
+            strictly positive. The message always names the variable.
+    """
+    raw_value = os.environ.get(name)
+    if raw_value is None or not raw_value.strip():
+        return None
+    try:
+        parsed_value = float(raw_value)
+    except ValueError as err:
+        raise ValueError(f"{name} must be a positive number, got {raw_value!r}.") from err
+    # ``not (x > 0)`` also rejects NaN, which a plain ``x <= 0`` test lets through.
+    if not parsed_value > 0.0:
+        raise ValueError(f"{name} must be positive, got {parsed_value}.")
+    return parsed_value
+
+
 class BaseSearcher(ABC):
     """A basic search interface that can be used to search/optimize a model.
 
@@ -336,7 +346,14 @@ def __init__(
         self.constraints_to_candidate_costs = constraints_to_candidate_costs
         self.candidate_scores = candidate_scores
         self.objective_type = pulp.LpMinimize if objective_type == "minimize" else pulp.LpMaximize
-        self.solver = pulp.PULP_CBC_CMD(msg=verbose)
+        solver_kwargs = {}
+        cbc_time_limit = _get_optional_env_float("MODELOPT_LPS_CBC_TIME_LIMIT")
+        cbc_gap_rel = _get_optional_env_float("MODELOPT_LPS_CBC_GAP_REL")
+        if cbc_time_limit is not None:
+            solver_kwargs["timeLimit"] = cbc_time_limit
+        if cbc_gap_rel is not None:
+            solver_kwargs["gapRel"] = cbc_gap_rel
+        self.solver = pulp.PULP_CBC_CMD(msg=verbose, **solver_kwargs)
 
         self.num_layers = len(self.candidate_scores)
         self.num_candidates_per_layer = list(map(len, self.candidate_scores))