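Address review comments: remove score_checkpoint from the AutoQuantize YAML schema, change the kv_cache pydantic type in the YAML schema from str to QuantizeConfig, update the dispatch in hf_ptq.py accordingly, add an AUTO_QUANTIZE entry to _REQUIRED_SECTION_PER_RECIPE_TYPE, and fix a minor bug in the loader's missing-section error message (the recipe kind for AutoQuantize was reported as "QUANTIZE" instead of "AUTO_QUANTIZE")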

juhi10071998 · juhi10071998 · commit 8b1d3c6f2351 · 2026-05-22T20:43:45.000Z
Signed-off-by: Juhi Mittal <juhim@nvidia.com>
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
@@ -310,7 +310,7 @@ def auto_quantize(
     constraints: dict,
     quantization_formats: list[dict],
     disabled_layers: list[str],
-    kv_cache_qformat: str,
+    kv_cache_quant_cfg: dict | None,
 ):
     """Pure orchestrator: build forward_step/loss_func, call mtq.auto_quantize,
     run KV cache post-step. All knobs are explicit keyword-only args; the
@@ -396,25 +396,24 @@ def forward_step(model, batch):
     )
 
     calibrate_loop = create_forward_loop(dataloader=calib_dataloader)
-    enable_quant_kv_cache = kv_cache_qformat != "none"
-    print(f"{'Enable' if enable_quant_kv_cache else 'Disable'} KV cache quantization")
-    if enable_quant_kv_cache:
-        kv_cache_quant_cfg = copy.deepcopy(
-            getattr(mtq, KV_QUANT_CFG_CHOICES[kv_cache_qformat])["quant_cfg"]
-        )
-        kv_cache_quant_cfg = [
-            e for e in kv_cache_quant_cfg if e["quantizer_name"] != "*"
+    print(f"{'Enable' if kv_cache_quant_cfg is not None else 'Disable'} KV cache quantization")
+    if kv_cache_quant_cfg is not None:
+        kv_entries = [
+            e for e in copy.deepcopy(kv_cache_quant_cfg["quant_cfg"]) if e["quantizer_name"] != "*"
         ]  # keep other quantizers from auto_quantize
 
-        if kv_cache_qformat in _KV_CAST_FORMATS:
-            _set_kv_cache_constant_amax(kv_cache_quant_cfg)
-
-        mtq.set_quantizer_by_cfg(language_model, quant_cfg=kv_cache_quant_cfg)
-        if kv_cache_qformat not in _KV_CAST_FORMATS:
+        mtq.set_quantizer_by_cfg(language_model, quant_cfg=kv_entries)
+        # Calibrate only when at least one KV entry doesn't pin amax via use_constant_amax.
+        # Cast-variant presets (kv_fp8_cast, kv_nvfp4_cast) bake this in; data-driven
+        # variants (kv_fp8, kv_nvfp4, etc.) need a calibration pass.
+        needs_calibration = not all(
+            (e.get("cfg") or {}).get("use_constant_amax") is True for e in kv_entries
+        )
+        if needs_calibration:
             # Calibrate only the KV cache quantizers; disable all others.
             with mtq.set_quantizer_by_cfg_context(
                 language_model,
-                [{"quantizer_name": "*", "enable": False}, *kv_cache_quant_cfg],
+                [{"quantizer_name": "*", "enable": False}, *kv_entries],
             ):
                 mtq.calibrate(language_model, algorithm="max", forward_loop=calibrate_loop)
     return language_model
@@ -1075,6 +1074,19 @@ def _is_layerwise(obj):
             if "parent_class" not in entry
         ]
 
+        # Resolve --kv_cache_qformat to a full QuantizeConfig dict (or None). Used as the
+        # CLI fallback when a recipe is silent on KV cache, and as the sole source for the
+        # CLI autoquant branch. Cast variants get use_constant_amax injected at this layer
+        # so the helper can stay format-agnostic (it just checks use_constant_amax to
+        # decide whether to calibrate).
+        def _cli_kv_cache_quant_cfg():
+            if args.kv_cache_qformat == "none":
+                return None
+            cfg = copy.deepcopy(getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat]))
+            if args.kv_cache_qformat in _KV_CAST_FORMATS:
+                _set_kv_cache_constant_amax(cfg["quant_cfg"])
+            return cfg
+
         if isinstance(recipe, ModelOptAutoQuantizeRecipe):
             aq = recipe.auto_quantize
 
@@ -1101,14 +1113,14 @@ def _candidate_for_mtq(fmt):
                 full_model=full_model,
                 auto_quantize_method=aq.method,
                 auto_quantize_score_size=aq.num_score_steps,
-                auto_quantize_checkpoint=aq.score_checkpoint,
+                auto_quantize_checkpoint=args.auto_quantize_checkpoint,
                 constraints=aq.constraints.model_dump(exclude_none=True),
                 quantization_formats=[_candidate_for_mtq(fmt) for fmt in aq.candidate_formats],
                 disabled_layers=aq.disabled_layers or default_disabled_layers,
-                kv_cache_qformat=(
-                    aq.kv_cache.qformat
-                    if (aq.kv_cache and aq.kv_cache.qformat)
-                    else args.kv_cache_qformat
+                kv_cache_quant_cfg=(
+                    aq.kv_cache.model_dump()
+                    if aq.kv_cache is not None
+                    else _cli_kv_cache_quant_cfg()
                 ),
             )
         else:
@@ -1148,7 +1160,7 @@ def _candidate_for_mtq(fmt):
                 constraints={"effective_bits": args.auto_quantize_bits},
                 quantization_formats=[QUANT_CFG_CHOICES[fmt] for fmt in qformat_list],
                 disabled_layers=default_disabled_layers,
-                kv_cache_qformat=args.kv_cache_qformat,
+                kv_cache_quant_cfg=_cli_kv_cache_quant_cfg(),
             )
 
     else:
diff --git a/modelopt/recipe/config.py b/modelopt/recipe/config.py
@@ -19,7 +19,7 @@
 
 import warnings
 from enum import Enum
-from typing import ClassVar, Literal
+from typing import Literal
 
 from pydantic import Field, field_validator, model_validator
 
@@ -106,45 +106,6 @@ class ModelOptPTQRecipe(ModelOptRecipeBase):
     )
 
 
-class AutoQuantizeKVCache(ModeloptBaseConfig):
-    """KV-cache configuration for an AutoQuantize recipe (optional)."""
-
-    # Mirrors the keys of KV_QUANT_CFG_CHOICES in examples/llm_ptq/hf_ptq.py.
-    # Kept inline (rather than imported) so the recipe schema stays free of
-    # example-script dependencies. Update both sides if new KV variants land.
-    # ClassVar annotation tells Pydantic this is a class-level constant, not a
-    # private model attribute (which is the default for leading-underscore names).
-    _SUPPORTED_QFORMATS: ClassVar[frozenset[str]] = frozenset(
-        {
-            "none",
-            "fp8_cast",
-            "fp8",
-            "fp8_affine",
-            "nvfp4_cast",
-            "nvfp4",
-            "nvfp4_affine",
-            "nvfp4_rotate",
-        }
-    )
-
-    qformat: str | None = ModeloptField(
-        default=None,
-        title="KV cache quantization format",
-        description="One of the entries in KV_QUANT_CFG_CHOICES, or 'none' to disable. "
-        "If omitted, the runtime --kv_cache_qformat CLI flag is used.",
-    )
-
-    @field_validator("qformat")
-    @classmethod
-    def _validate_qformat(cls, v: str | None) -> str | None:
-        if v is not None and v not in cls._SUPPORTED_QFORMATS:
-            raise ValueError(
-                f"Unsupported kv_cache.qformat: {v!r}. "
-                f"Expected one of {sorted(cls._SUPPORTED_QFORMATS)} or None."
-            )
-        return v
-
-
 class AutoQuantizeConstraints(ModeloptBaseConfig):
     """Constraints passed to ``mtq.auto_quantize`` (matches its dict shape).
 
@@ -201,16 +162,13 @@ class AutoQuantizeConfig(ModeloptBaseConfig):
         description="Glob patterns; matching layers are excluded from the search.",
     )
 
-    score_checkpoint: str | None = ModeloptField(
-        default=None,
-        title="Search-state checkpoint path",
-        description="Path to save/restore search state for resume or cheap re-solve.",
-    )
-
-    kv_cache: AutoQuantizeKVCache | None = ModeloptField(
+    kv_cache: QuantizeConfig | None = ModeloptField(
         default=None,
-        title="KV cache override",
-        description="Optional KV cache config. If omitted, --kv_cache_qformat CLI flag is used.",
+        title="KV cache QuantizeConfig (optional)",
+        description="Optional full QuantizeConfig applied as a uniform post-step after the "
+        "LP search. Typically uses ``$import: configs/ptq/units/kv_*`` for a built-in KV "
+        "preset, or inlines a custom config. If omitted, the runtime --kv_cache_qformat "
+        "CLI flag is used as a fallback.",
     )
 
     @field_validator("candidate_formats")
diff --git a/modelopt/recipe/loader.py b/modelopt/recipe/loader.py
@@ -42,6 +42,7 @@
 # must contain 'quantize'" instead of pydantic's generic missing-field error.
 _REQUIRED_SECTION_PER_RECIPE_TYPE: dict[RecipeType, str] = {
     RecipeType.PTQ: "quantize",
+    RecipeType.AUTO_QUANTIZE: "auto_quantize",
     RecipeType.SPECULATIVE_EAGLE: "eagle",
     RecipeType.SPECULATIVE_DFLASH: "dflash",
     RecipeType.SPECULATIVE_MEDUSA: "medusa",
@@ -171,8 +172,12 @@ def _load_recipe_from_file(
 
         raw = yaml.safe_load(recipe_file.read_text()) or {}
         if not isinstance(raw, dict) or required_section not in raw:
+            # Speculative recipes use the family suffix ("EAGLE" not "SPECULATIVE_EAGLE");
+            # every other multi-word recipe type uses the full value ("AUTO_QUANTIZE", not "QUANTIZE").
             kind = (
-                rtype.value.split("_", 1)[-1].upper() if "_" in rtype.value else rtype.value.upper()
+                rtype.value.removeprefix("speculative_").upper()
+                if rtype.value.startswith("speculative_")
+                else rtype.value.upper()
             )
             raise ValueError(f"{kind} recipe file {recipe_file} must contain {required_section!r}.")
 
diff --git a/modelopt_recipes/general/auto_quantize/nvfp4_fp8_at_4p8bits-kv_fp8_cast.yaml b/modelopt_recipes/general/auto_quantize/nvfp4_fp8_at_4p8bits-kv_fp8_cast.yaml
@@ -19,6 +19,7 @@
 imports:
   nvfp4: configs/ptq/presets/model/nvfp4
   fp8: configs/ptq/presets/model/fp8
+  kv_fp8_cast: configs/ptq/units/kv_fp8_cast
 
 metadata:
   recipe_type: auto_quantize
@@ -33,7 +34,8 @@ auto_quantize:
     - $import: fp8
 
   kv_cache:
-    qformat: fp8_cast
+    quant_cfg:
+      - $import: kv_fp8_cast
 
   method: gradient
   num_score_steps: 128
diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py
@@ -272,7 +272,10 @@ def test_load_recipe_autoquantize_builtin():
     aq = recipe.auto_quantize
     assert aq.constraints.effective_bits == 4.8
     assert len(aq.candidate_formats) == 2
-    assert aq.kv_cache is not None and aq.kv_cache.qformat == "fp8_cast"
+    # kv_cache is a full QuantizeConfig now (not a hardcoded qformat string).
+    assert aq.kv_cache is not None
+    assert aq.kv_cache.algorithm == "max"
+    assert len(aq.kv_cache.quant_cfg) >= 1
 
 
 def test_load_recipe_autoquantize_defaults():
@@ -281,7 +284,6 @@ def test_load_recipe_autoquantize_defaults():
     aq = recipe.auto_quantize
     assert aq.method == "gradient"
     assert aq.num_score_steps == 128
-    assert aq.score_checkpoint is None
 
 
 def test_load_recipe_autoquantize_candidates_match_presets():
@@ -293,10 +295,13 @@ def test_load_recipe_autoquantize_candidates_match_presets():
 
 
 def test_load_recipe_autoquantize_missing_section_raises(tmp_path):
-    """An AutoQuantize recipe missing the ``auto_quantize`` section is rejected."""
+    """An AutoQuantize recipe missing the ``auto_quantize`` section is rejected
+    with the clean loader-level error (not the generic pydantic missing-field one)."""
     bad = tmp_path / "bad.yml"
     bad.write_text("metadata:\n  recipe_type: auto_quantize\n")
-    with pytest.raises(ValueError, match="auto_quantize"):
+    with pytest.raises(
+        ValueError, match=r"AUTO_QUANTIZE recipe file .* must contain 'auto_quantize'"
+    ):
         load_recipe(bad)
 
 
@@ -333,14 +338,6 @@ def test_load_recipe_autoquantize_kv_cache_optional(tmp_path):
     assert recipe.auto_quantize.kv_cache is None
 
 
-def test_load_recipe_autoquantize_invalid_kv_qformat_raises(tmp_path):
-    """An unknown kv_cache.qformat is rejected at recipe-load time, not later."""
-    bad = tmp_path / "bad.yml"
-    bad.write_text(_AQ_MINIMAL_BODY + "  kv_cache:\n    qformat: not_a_real_format\n")
-    with pytest.raises(ValueError, match="kv_cache.qformat"):
-        load_recipe(bad)
-
-
 # ---------------------------------------------------------------------------
 # load_recipe — EAGLE speculative decoding
 # ---------------------------------------------------------------------------