Add an `effective_bits` field to the quant recipe config (QuantizeConfig) to override the num_bits-based cost estimate per recipe

juhi10071998 · juhi10071998 · commit a2763fac68d9 · 2026-05-27T21:54:39.000Z
Signed-off-by: Juhi Mittal <juhim@nvidia.com>
diff --git a/modelopt/torch/quantization/algorithms.py b/modelopt/torch/quantization/algorithms.py
@@ -49,16 +49,25 @@
 def estimate_quant_compression(quant_cfg: QuantizeConfig) -> float:
     """Estimate the compression ratio of a quantization configuration.
 
-    Right now, we find the minimum compression ratio across all quantizer attribute configs.
-    This is not perfect but is a good proxy for the overall compression ratio. We will improve
-    this in future releases.
+    If ``quant_cfg.effective_bits`` is set, returns ``effective_bits / 16`` directly. This
+    is the override path for formats whose true effective bits don't match the per-quantizer
+    ``num_bits`` heuristic — e.g., NVFP4 has 4 value bits + a per-16-element FP8 scale
+    (8/16 = 0.5 bits/element), so true effective bits = 4.5, not the heuristic's 4.0.
+
+    Otherwise, falls back to the heuristic: minimum compression ratio across all enabled
+    quantizer attribute configs (``num_bits / 16`` for ints, ``(E + M + 1) / 16`` for FP
+    tuples). This is a good proxy for the overall compression ratio of formats without
+    block-scale overhead, but under-counts block-quantized formats. We will improve this
+    in future releases.
 
     Args:
         quant_cfg: The quantization configuration to estimate compression for.
 
     Returns:
         float: The estimated compression ratio (0.0 to 1.0).
     """
+    if quant_cfg.effective_bits is not None:
+        return quant_cfg.effective_bits / 16.0
 
     def estimate_quant_compression_for_quantizer(quantizer_attr_cfg):
         if isinstance(quantizer_attr_cfg, list):
diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py
@@ -1160,6 +1160,25 @@ class QuantizeConfig(ModeloptBaseConfig):
         validate_default=True,
     )
 
+    effective_bits: float | None = ModeloptField(
+        default=None,
+        title="Effective bits per element (autoquant cost override)",
+        description=(
+            "Optional override for the autoquant LP cost model. If set, replaces the "
+            "heuristic estimate derived from ``num_bits``. Mainly useful for block-quantized "
+            "formats where the heuristic under-counts due to per-block scale overhead "
+            "(e.g., NVFP4 actual=4.5 vs heuristic=4.0). Must be in (0, 16] when set. "
+            "Read only by autoquant; other quantization paths ignore this field."
+        ),
+    )
+
+    @field_validator("effective_bits")
+    @classmethod
+    def _validate_effective_bits(cls, v: float | None) -> float | None:
+        if v is not None and not (0 < v <= 16):
+            raise ValueError(f"effective_bits must be in (0, 16], got {v}")
+        return v
+
     @field_validator("quant_cfg", mode="before")
     @classmethod
     def normalize_quant_cfg(
diff --git a/modelopt_recipes/general/auto_quantize/nvfp4_fp8_at_4p8bits-kv_fp8_cast.yaml b/modelopt_recipes/general/auto_quantize/nvfp4_fp8_at_4p8bits-kv_fp8_cast.yaml
@@ -30,7 +30,11 @@ auto_quantize:
     effective_bits: 4.8
 
   candidate_formats:
+    # NVFP4 true effective bits = 4 value bits + 8-bit FP8 scale per 16-element block
+    # = 4 + 0.5 = 4.5 bits/element. Override the heuristic's 4.0 so the LP cost is accurate.
     - $import: nvfp4
+      effective_bits: 4.5
+    # FP8 effective bits = 8 (heuristic is correct, per-tensor scale is negligible).
     - $import: fp8
 
   kv_cache:
diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py
@@ -287,10 +287,15 @@ def test_load_recipe_autoquantize_defaults():
 
 
 def test_load_recipe_autoquantize_candidates_match_presets():
-    """Built-in AutoQuantize recipe's $imported candidates equal mtq.X_DEFAULT_CFG dicts."""
+    """Built-in AutoQuantize recipe's $imported candidates equal preset + inline override."""
     recipe = load_recipe("general/auto_quantize/nvfp4_fp8_at_4p8bits-kv_fp8_cast")
     candidates = recipe.auto_quantize.candidate_formats
-    assert candidates[0].model_dump(exclude_unset=True) == mtq.NVFP4_DEFAULT_CFG
+
+    # NVFP4 candidate = canonical preset + inline effective_bits override.
+    expected_nvfp4 = {**mtq.NVFP4_DEFAULT_CFG, "effective_bits": 4.5}
+    assert candidates[0].model_dump(exclude_unset=True) == expected_nvfp4
+
+    # FP8 candidate = canonical preset exactly (no override).
     assert candidates[1].model_dump(exclude_unset=True) == mtq.FP8_DEFAULT_CFG
 
 
@@ -338,6 +343,17 @@ def test_load_recipe_autoquantize_kv_cache_optional(tmp_path):
     assert recipe.auto_quantize.kv_cache is None
 
 
+def test_load_recipe_autoquantize_effective_bits_inline_override():
+    """Inline $import + sibling effective_bits merge applied per candidate."""
+    recipe = load_recipe("general/auto_quantize/nvfp4_fp8_at_4p8bits-kv_fp8_cast")
+    candidates = recipe.auto_quantize.candidate_formats
+
+    # NVFP4 candidate carries the override.
+    assert candidates[0].effective_bits == 4.5
+    # FP8 candidate has no override; heuristic still applies.
+    assert candidates[1].effective_bits is None
+
+
 # ---------------------------------------------------------------------------
 # load_recipe — EAGLE speculative decoding
 # ---------------------------------------------------------------------------
diff --git a/tests/unit/torch/quantization/test_autoquant.py b/tests/unit/torch/quantization/test_autoquant.py
@@ -375,6 +375,32 @@ def test_estimate_quant_compression():
     assert estimate_quant_compression(fp8_affine_kv_cfg) == 0.5
 
 
+def test_estimate_quant_compression_effective_bits_override():
+    """``QuantizeConfig.effective_bits`` overrides the per-quantizer num_bits heuristic.
+
+    Validates two things:
+      1. The override path returns ``effective_bits / 16`` and bypasses the heuristic.
+      2. Without the override, the heuristic returns the unchanged baseline value.
+    """
+    # NVFP4 — heuristic returns 4.0 bits / 16 = 0.25, but true effective bits is 4.5.
+    nvfp4_cfg = mtq.config.QuantizeConfig(**mtq.NVFP4_DEFAULT_CFG)
+    assert nvfp4_cfg.effective_bits is None
+    assert estimate_quant_compression(nvfp4_cfg) == 0.25  # heuristic baseline
+
+    nvfp4_cfg_overridden = mtq.config.QuantizeConfig(**mtq.NVFP4_DEFAULT_CFG, effective_bits=4.5)
+    assert estimate_quant_compression(nvfp4_cfg_overridden) == 4.5 / 16.0
+
+    # Override can also represent a higher cost (e.g., conservative for a sensitive recipe).
+    nvfp4_cfg_high = mtq.config.QuantizeConfig(**mtq.NVFP4_DEFAULT_CFG, effective_bits=16.0)
+    assert estimate_quant_compression(nvfp4_cfg_high) == 1.0
+
+    # Out-of-range values are rejected by the Pydantic validator.
+    with pytest.raises(ValueError, match="effective_bits must be in"):
+        mtq.config.QuantizeConfig(**mtq.NVFP4_DEFAULT_CFG, effective_bits=0.0)
+    with pytest.raises(ValueError, match="effective_bits must be in"):
+        mtq.config.QuantizeConfig(**mtq.NVFP4_DEFAULT_CFG, effective_bits=17.0)
+
+
 @pytest.mark.parametrize("method", ["gradient", "kl_div"])
 def test_auto_quantize_checkpoint_resume(method, tmp_path, capsys):
     """Test that checkpoint can be used to resume an interrupted search."""