Commit 82d5a12

presets

Signed-off-by: Shengliang Xu <shengliangx@nvidia.com>
1 parent 5baba0b

File tree: 16 files changed, +70 −47 lines

docs/source/guides/10_recipes.rst

Lines changed: 10 additions & 10 deletions
@@ -94,8 +94,8 @@ The simplest form is a single ``.yml`` or ``.yaml`` file.
    .. code-block:: yaml
 
       imports:
-        base_disable_all: configs/ptq/base_disable_all
-        default_disabled: configs/ptq/default_disabled_quantizers
+        base_disable_all: configs/ptq/units/base_disable_all
+        default_disabled: configs/ptq/units/default_disabled_quantizers
         fp8: configs/numerics/fp8
 
       metadata:
@@ -227,8 +227,8 @@ a list splice are not supported.
    .. code-block:: yaml
 
       imports:
-        base_disable_all: configs/ptq/base_disable_all
-        default_disabled: configs/ptq/default_disabled_quantizers
+        base_disable_all: configs/ptq/units/base_disable_all
+        default_disabled: configs/ptq/units/default_disabled_quantizers
         fp8: configs/numerics/fp8
 
       metadata:
@@ -275,7 +275,7 @@ and returns the resolved list:
 
    .. code-block:: yaml
 
-      # configs/ptq/fp8_kv.yaml — list snippet that imports a dict snippet
+      # configs/ptq/units/fp8_kv.yaml — list snippet that imports a dict snippet
       imports:
         fp8: configs/numerics/fp8
       ---
@@ -305,11 +305,11 @@ Reusable snippets are stored under ``modelopt_recipes/configs/``:
      - NVFP4 E2M1 blockwise, dynamic calibration, FP8 scales (default)
    * - ``configs/numerics/nvfp4_static``
      - NVFP4 E2M1 blockwise, static calibration, FP8 scales
-   * - ``configs/ptq/base_disable_all``
+   * - ``configs/ptq/units/base_disable_all``
      - Disable all quantizers (deny-all-then-configure pattern)
-   * - ``configs/ptq/default_disabled_quantizers``
+   * - ``configs/ptq/units/default_disabled_quantizers``
      - Standard exclusions (LM head, routers, BatchNorm, etc.)
-   * - ``configs/ptq/fp8_kv``
+   * - ``configs/ptq/units/fp8_kv``
      - FP8 E4M3 KV cache quantization (multi-document, imports ``fp8``)
 
 
@@ -549,8 +549,8 @@ Example -- creating a custom PTQ recipe using imports:
 
       # my_int8_recipe.yml
       imports:
-        base_disable_all: configs/ptq/base_disable_all
-        default_disabled: configs/ptq/default_disabled_quantizers
+        base_disable_all: configs/ptq/units/base_disable_all
+        default_disabled: configs/ptq/units/default_disabled_quantizers
 
       metadata:
         recipe_type: ptq
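The docs above describe an ``imports:`` mapping whose entries are then referenced by ``$import`` nodes. A minimal sketch of how such a resolver could work; ``resolve_imports`` and the snippet shapes are illustrative assumptions, not the actual modelopt_recipes implementation:

```python
from typing import Any


def resolve_imports(node: Any, snippets: dict[str, Any]) -> Any:
    """Recursively replace {"$import": name} nodes with the named snippet."""
    if isinstance(node, dict):
        if set(node) == {"$import"}:
            # A bare $import node is swapped for the imported snippet,
            # which may itself contain further $import nodes.
            return resolve_imports(snippets[node["$import"]], snippets)
        return {k: resolve_imports(v, snippets) for k, v in node.items()}
    if isinstance(node, list):
        return [resolve_imports(v, snippets) for v in node]
    return node


# Tiny illustrative snippet and document (shapes assumed for the example).
snippets = {"fp8": {"num_bits": (4, 3)}}
doc = {"quant_cfg": [{"quantizer_name": "*weight_quantizer", "cfg": {"$import": "fp8"}}]}
resolved = resolve_imports(doc, snippets)
```

After resolution, the ``cfg`` node holds the imported ``fp8`` snippet in place of the ``$import`` reference.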

modelopt/torch/quantization/config.py

Lines changed: 2 additions & 9 deletions
@@ -273,7 +273,7 @@ def find_quant_cfg_entry_by_path(
     "algorithm": "max",
 }
 
-FP8_DEFAULT_CFG: dict[str, Any] = load_config("configs/ptq/presets/fp8_default")
+FP8_DEFAULT_CFG: dict[str, Any] = load_config("configs/ptq/presets/model/fp8")
 
 MAMBA_MOE_FP8_AGGRESSIVE_CFG = {
     "quant_cfg": [
@@ -518,14 +518,7 @@ def find_quant_cfg_entry_by_path(
518518
# KV-cache configs are designed to be merged with a primary quantization config (e.g.
519519
# FP8_DEFAULT_CFG) that already contains _base_disable_all. They intentionally omit both
520520
# _base_disable_all and "algorithm" because these are provided by the primary config.
521-
FP8_KV_CFG = {
522-
"quant_cfg": [
523-
{
524-
"quantizer_name": "*[kv]_bmm_quantizer",
525-
"cfg": {"num_bits": (4, 3)},
526-
},
527-
]
528-
}
521+
FP8_KV_CFG: dict[str, Any] = load_config("configs/ptq/presets/kv/fp8")
529522

530523
FP8_AFFINE_KV_CFG = {
531524
"quant_cfg": [
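The comment in this hunk says KV-cache configs are designed to be merged with a primary config that supplies ``algorithm`` and ``_base_disable_all``. A minimal sketch of such a merge, assuming the rule is simply to concatenate the ``quant_cfg`` lists (the actual merge logic in modelopt may differ):

```python
import copy
from typing import Any


def merge_kv_cfg(primary: dict[str, Any], kv: dict[str, Any]) -> dict[str, Any]:
    """Append the KV-cache entries to a copy of the primary config's quant_cfg."""
    merged = copy.deepcopy(primary)
    merged["quant_cfg"] = list(merged.get("quant_cfg", [])) + list(kv.get("quant_cfg", []))
    return merged


# Shapes mirror the diff: the KV config deliberately omits "algorithm".
primary = {
    "algorithm": "max",
    "quant_cfg": [{"quantizer_name": "*weight_quantizer", "cfg": {"num_bits": (4, 3)}}],
}
kv = {"quant_cfg": [{"quantizer_name": "*[kv]_bmm_quantizer", "cfg": {"num_bits": (4, 3)}}]}
merged = merge_kv_cfg(primary, kv)
```

The merged result keeps the primary config's ``algorithm`` and gains the KV-cache quantizer entry, matching the "merged with a primary quantization config" intent stated in the comment.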
Lines changed: 12 additions & 6 deletions
@@ -1,14 +1,20 @@
 # PTQ Preset Configs
 
 This directory holds preset quantization configurations that serve as the
-single source of truth for the hardcoded `*_CFG` dicts in
+YAML source of truth for the hardcoded `*_CFG` dicts in
 `modelopt.torch.quantization.config` (e.g., `FP8_DEFAULT_CFG`).
 
 Each preset is a complete, self-contained config with `algorithm` and
 `quant_cfg` — ready to pass directly to `mtq.quantize()`. Presets compose
-from the reusable snippets in `configs/numerics/` and `configs/ptq/` via
-the `$import` system.
+from the reusable snippets in `configs/numerics/` and `configs/ptq/units/`
+via the `$import` system.
 
-When adding a new preset, use existing snippets where possible and keep
-the YAML as the authoritative definition — the Python config should load
-from here rather than hardcoding the dict.
+**Note:** The main purpose of these presets is to support the existing
+`hf_ptq.py` script's `--qformat` / `--kv_cache_qformat` flags and other
+code paths that reference
+the hardcoded `*_CFG` dicts, maintaining backward compatibility during
+the transition to recipe-based workflows. Users are encouraged to use
+`load_recipe` with full recipe files under `general/` or `models/`
+instead. Some or all of these presets may be deprecated or removed in
+future releases as the recipe-based workflow becomes the standard entry
+point.
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# FP8 E4M3 KV cache quantization preset.
+# Equivalent to the hardcoded FP8_KV_CFG in config.py.
+# This is a partial config (no algorithm, no base_disable_all) — designed
+# to be merged with a primary model quantization config.
+imports:
+  fp8_kv: configs/ptq/units/fp8_kv
+
+quant_cfg:
+  - $import: fp8_kv

modelopt_recipes/configs/ptq/presets/fp8_default.yaml renamed to modelopt_recipes/configs/ptq/presets/model/fp8.yaml

Lines changed: 3 additions & 3 deletions
@@ -16,9 +16,9 @@
 # FP8 per-tensor weight and activation (W8A8), max calibration.
 # Equivalent to the hardcoded FP8_DEFAULT_CFG in config.py.
 imports:
-  base_disable_all: configs/ptq/base_disable_all
-  w8a8: configs/ptq/w8a8_fp8_fp8
-  default_disabled: configs/ptq/default_disabled_quantizers
+  base_disable_all: configs/ptq/units/base_disable_all
+  w8a8: configs/ptq/units/w8a8_fp8_fp8
+  default_disabled: configs/ptq/units/default_disabled_quantizers
 
 algorithm: max
 quant_cfg:
File renamed without changes.

modelopt_recipes/configs/ptq/default_disabled_quantizers.yaml renamed to modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
