Address AutoQuantize cost model review feedback

meenchen · meenchen · commit 845e50eb8d23 · 2026-05-29T15:34:12.000-07:00
Signed-off-by: weimingc <17592131+meenchen@users.noreply.github.com>
diff --git a/modelopt/torch/quantization/algorithms.py b/modelopt/torch/quantization/algorithms.py
@@ -397,6 +397,9 @@ def default_search_config(self):
             "disabled_layers": None,
             "verbose": is_master(),
             "checkpoint": None,
+            "cost_model": COST_MODEL_WEIGHT,
+            "cost": {},
+            "active_moe_expert_ratio": None,
         }
 
     @property
@@ -405,6 +408,7 @@ def default_state_dict(self) -> SearchStateDict:
         return {
             "method": self.method_name,
             "cost_model": "weight",
+            "cost": {},
             "active_moe_expert_ratio": None,
             "cost_denominator": None,
             "candidate_stats": defaultdict(dict),
@@ -608,25 +612,22 @@ def initialize_candidate_stats(self):
             if not isinstance(hparam, QuantRecipeHparam):
                 continue
 
-            formats, scores, costs, active_costs = [], [], [], []
+            formats, scores, costs = [], [], []
             prev_score = float("inf")
             for recipe in hparam.choices:
                 formats.append(recipe)
 
                 score = hparam.get_score(recipe)  # type: ignore [arg-type]
                 cost = hparam.get_cost(recipe)  # type: ignore [arg-type]
-                active_cost = hparam.get_cost(recipe, cost_weight=hparam.cost_weight)  # type: ignore [arg-type]
 
                 score = min(score, prev_score)  # TODO: Should we get rid of this?
                 scores.append(score)
                 costs.append(cost)
-                active_costs.append(active_cost)
                 prev_score = score
 
             self.candidate_stats[name]["formats"] = formats
             self.candidate_stats[name]["scores"] = scores
             self.candidate_stats[name]["costs"] = costs
-            self.candidate_stats[name]["active_costs"] = active_costs
             self.candidate_stats[name]["module_names"] = hparam.quant_module_names
             self.candidate_stats[name]["cost_weight"] = hparam.cost_weight
 
@@ -674,6 +675,7 @@ def before_search(self):
             )
         self.method = self.method_name
         self.cost_model = self.config["cost_model"]
+        self.cost = self.config["cost"]
         self.active_moe_expert_ratio = self.config["active_moe_expert_ratio"]
         self.cost_denominator = getattr(self, "cost_denominator", None)
 
@@ -1466,7 +1468,20 @@ def _resolve_best_recipe(search_state, constraints, verbose=False):
 
     searcher.candidate_stats = candidate_stats
     searcher.cost_model = search_state.get("cost_model", COST_MODEL_WEIGHT)
-    searcher.config = {"cost_model": searcher.cost_model}
+    searcher.cost = search_state.get("cost", {})
+    searcher.active_moe_expert_ratio = search_state.get("active_moe_expert_ratio")
+    if (
+        searcher.cost_model == COST_MODEL_ACTIVE_MOE
+        and not searcher.cost
+        and searcher.active_moe_expert_ratio is not None
+    ):
+        searcher.cost = {ACTIVE_MOE_EXPERT_RATIO_KEY: searcher.active_moe_expert_ratio}
+    searcher.config = {
+        **searcher.default_search_config,
+        "cost_model": searcher.cost_model,
+        "cost": searcher.cost,
+        "active_moe_expert_ratio": searcher.active_moe_expert_ratio,
+    }
     best_recipe_info, _ = searcher.run_search_with_stats(max_weight_size, verbose=verbose)
 
     best_recipe = {name: info["format"] for name, info in best_recipe_info.items()}
diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py
@@ -38,7 +38,6 @@
 
 from .algorithms import AutoQuantizeGradientSearcher, AutoQuantizeKLDivSearcher, QuantRecipe
 from .algorithms import get_auto_quantize_config as _get_auto_quantize_config
-from ._auto_quantize_cost import normalize_auto_quantize_constraints
 from .config import QuantizeAlgoCfgType
 from .mode import QuantizeModeRegistry, get_modelike_from_algo_cfg
 from .nn import QuantModule, TensorQuantizer
@@ -524,8 +523,6 @@ def forward_backward_step(model, batch) -> None:
     else:
         raise ValueError(f"Invalid method: {method}. Valid options are 'gradient' or 'kl_div'.")
 
-    constraints = normalize_auto_quantize_constraints(model, constraints)
-
     model = apply_mode(
         model,
         mode="auto_quantize",
diff --git a/tests/unit/torch/quantization/test_autoquant.py b/tests/unit/torch/quantization/test_autoquant.py
@@ -189,7 +189,7 @@ def test_auto_quantize_active_moe_cost_model(num_experts_attr):
     assert shared_stats
     assert all(stats["cost_weight"] == pytest.approx(0.25) for stats in routed_stats)
     assert all(stats["cost_weight"] == pytest.approx(1.0) for stats in shared_stats)
-    assert all("active_costs" in stats for stats in search_history["candidate_stats"].values())
+    assert all("active_costs" not in stats for stats in search_history["candidate_stats"].values())
 
 
 def test_active_moe_ratio_requires_single_config_object():