From a14025c990b9024f9480e152667d45af88b4c8e0 Mon Sep 17 00:00:00 2001 From: realAsma Date: Fri, 24 Apr 2026 20:05:29 +0000 Subject: [PATCH 1/2] fix: layerwise calibration backward-compat, recipe split, batch-size guard - config: accept legacy `use_sequential` via AliasChoices on `layerwise` so pre-#1251 PTQ checkpoints load; still serializes as `layerwise` - recipes: split nvfp4_experts_only-kv_fp8 into default (no layerwise) and _layerwise variants - hf_ptq: auto batch-size detection not supported with layerwise; default to batch_size=1 in that case - tests: cover alias accept, current-name accept, dump under current name, and extra='forbid' still rejecting unknowns Signed-off-by: realAsma --- CHANGELOG.rst | 2 +- docs/source/guides/10_recipes.rst | 3 ++ examples/llm_ptq/hf_ptq.py | 24 +++++++--- modelopt/torch/quantization/config.py | 3 +- .../ptq/nvfp4_experts_only-kv_fp8.yaml | 2 +- .../nvfp4_experts_only-kv_fp8_layerwise.yaml | 45 +++++++++++++++++++ tests/unit/recipe/test_loader.py | 1 + .../quantization/test_config_validation.py | 33 ++++++++++++++ 8 files changed, 104 insertions(+), 9 deletions(-) create mode 100644 modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8_layerwise.yaml diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 2ad8aed1879..62f2b0041cb 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -41,7 +41,7 @@ Changelog - Add support for vLLM fakequant reload using ModelOpt state for HF models. See `examples/vllm_serve/README.md `_ for more details. - [Early Testing] Add Claude Code PTQ skill (``.claude/skills/ptq/``) for agent-assisted post-training quantization. The skill guides the agent through environment detection, model support checking, format selection, and execution via the launcher or manual SLURM/Docker/bare GPU paths. Includes handling for unlisted models with custom module patching. This feature is in early testing — use with caution. 
- [Early Testing] Polish Claude Code evaluation skill (``.claude/skills/evaluation/``) for agent-assisted LLM accuracy benchmarking via NeMo Evaluator Launcher. Adds two companion skills vendored verbatim from `NVIDIA-NeMo/Evaluator `_: ``launching-evals`` (run/check/debug/analyze NEL evaluations) and ``accessing-mlflow`` (query MLflow runs, compare metrics, fetch artifacts). Re-sync at a pinned upstream SHA via ``.claude/scripts/sync-upstream-skills.sh``. Also adds a shared ``skills/common/credentials.md`` covering HF / NGC / Docker token setup referenced by multiple skills. This feature is in early testing — use with caution. -- Add performant layerwise calibration for large models that don't fit on GPU (e.g. DeepSeek-R1, Kimi-K2). See `modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8.yaml `_ for usage. Layerwise calibration also supports PTQ with intermediate progress saving — useful when long PTQ runs get hit with Slurm timeouts. See `modelopt_recipes/general/ptq/nvfp4_default-kv_none-gptq.yaml `_ for usage. +- Add performant layerwise calibration for large models that don't fit on GPU (e.g. DeepSeek-R1, Kimi-K2). See `modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8_layerwise.yaml `_ for usage. Layerwise calibration also supports PTQ with intermediate progress saving — useful when long PTQ runs get hit with Slurm timeouts. See `modelopt_recipes/general/ptq/nvfp4_default-kv_none-gptq.yaml `_ for usage. - Add implicit GEMM CUDA kernel for Conv3D with fused NVFP4 fake quantization (``modelopt.torch.quantization.src.conv``). When NVFP4 quantization is applied to an ``nn.Conv3d`` layer via ModelOpt PTQ, the implicit GEMM path is used automatically instead of cuDNN. Uses BF16 WMMA tensor cores (SM80+) with FP32 accumulation and in-kernel FP4 (E2M1) activation quantization. Grouped convolution (``groups > 1``) falls back to the default cuDNN path. Inference only — training mode falls back to cuDNN with a warning. 
- Add FP8 MHA quantization support for vision transformers. Adds an attention-aware ONNX post-processing pass (scale Mul / K-transpose move before Q, Q→DQ insertion on softmax output) in :class:`FP8QuantExporter `, per-instance nested-attention-wrapper skipping in the HF plugin, and ``nn.LayerNorm`` registration in ``QuantModuleRegistry`` so BMM input quantizers and LayerNorm output quantizers defined in FP8_DEFAULT_CFG are honored end-to-end. See `examples/torch_onnx/torch_quant_to_onnx.py `_ for the general timm-model quantize→ONNX workflow. diff --git a/docs/source/guides/10_recipes.rst b/docs/source/guides/10_recipes.rst index baa40a530af..4a1da0a3150 100644 --- a/docs/source/guides/10_recipes.rst +++ b/docs/source/guides/10_recipes.rst @@ -495,6 +495,8 @@ General PTQ recipes are model-agnostic and apply to any supported architecture: - NVFP4 for MLP layers only, FP8 KV cache * - ``general/ptq/nvfp4_experts_only-kv_fp8`` - NVFP4 for MoE expert layers only, FP8 KV cache + * - ``general/ptq/nvfp4_experts_only-kv_fp8_layerwise`` + - NVFP4 for MoE expert layers only, FP8 KV cache, layerwise calibration * - ``general/ptq/nvfp4_omlp_only-kv_fp8`` - NVFP4 for output projection + MLP layers, FP8 KV cache @@ -657,6 +659,7 @@ The ``modelopt_recipes/`` package is organized as follows: | +-- nvfp4_default-kv_nvfp4_cast.yaml | +-- nvfp4_mlp_only-kv_fp8.yaml | +-- nvfp4_experts_only-kv_fp8.yaml + | +-- nvfp4_experts_only-kv_fp8_layerwise.yaml | +-- nvfp4_omlp_only-kv_fp8.yaml +-- models/ # Model-specific recipes | +-- Step3.5-Flash/ diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 758ed75aeed..71add909145 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -988,6 +988,18 @@ def quantize_main( default_pad_token, device: torch.device, ): + # Load the recipe up front so we can detect layerwise calibration before batch-size probing. 
+ recipe = None + if args.recipe is not None and not args.auto_quantize_bits: + print(f"Use recipe {args.recipe} for quantization") + recipe = load_recipe(args.recipe) + assert isinstance(recipe, ModelOptPTQRecipe), ( + f"Expected PTQ recipe, but got {type(recipe).__name__} from {args.recipe}" + ) + + recipe_algorithm = recipe.quantize.model_dump().get("algorithm") if recipe else None + is_layerwise = isinstance(recipe_algorithm, dict) and recipe_algorithm.get("layerwise", False) + if args.batch_size == 0: # For VL models with image-text calibration, skip automatic batch size detection # since get_max_batch_size can't handle multimodal inputs @@ -1001,6 +1013,11 @@ def quantize_main( "Offline speculative decoding calibration enabled. Using default batch_size=1 for calibration." ) args.batch_size = 1 + # Layerwise calibration processes one layer at a time; auto batch-size probing runs a + # full-model forward which defeats the point and can OOM on very large models. + elif is_layerwise: + print("Layerwise calibration enabled. Using default batch_size=1 for calibration.") + args.batch_size = 1 else: # Calibration/sparsification will actually take much more memory than regular inference # due to intermediate tensors for fake quantization. 
Setting sample_memory_usage_ratio @@ -1064,12 +1081,7 @@ def quantize_main( else: # mono quantization - if args.recipe is not None: - print(f"Use recipe {args.recipe} for quantization") - recipe = load_recipe(args.recipe) - assert isinstance(recipe, ModelOptPTQRecipe), ( - f"Expected PTQ recipe, but got {type(recipe).__name__} from {args.recipe}" - ) + if recipe is not None: quant_cfg = recipe.quantize.model_dump() else: diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 3adb70cf6b7..dfed54cc991 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -154,7 +154,7 @@ import warnings from typing import Any, Literal, cast -from pydantic import ValidationInfo, field_validator, model_validator +from pydantic import AliasChoices, ValidationInfo, field_validator, model_validator from typing_extensions import Required, TypedDict from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField @@ -588,6 +588,7 @@ class QuantizeAlgorithmConfig(ModeloptBaseConfig): layerwise: bool = ModeloptField( default=False, + validation_alias=AliasChoices("layerwise", "use_sequential"), title="Enable layerwise (layer-by-layer) calibration.", description=( "If True, the calibration algorithm is applied layer by layer. " diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8.yaml index 6222ab39e3a..547cf312863 100644 --- a/modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8.yaml @@ -21,7 +21,7 @@ imports: metadata: recipe_type: ptq - description: NVFP4 static weight and dynamic activation for expert layers only (W4A4), FP8 KV cache, max layerwise calibration. + description: NVFP4 static weight and dynamic activation for expert layers only (W4A4), FP8 KV cache, max calibration. 
quantize: algorithm: method: max diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8_layerwise.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8_layerwise.yaml new file mode 100644 index 00000000000..9d1f470643f --- /dev/null +++ b/modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8_layerwise.yaml @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + kv_fp8: configs/ptq/units/kv_fp8 + +metadata: + recipe_type: ptq + description: NVFP4 static weight and dynamic activation for expert layers only (W4A4), FP8 KV cache, max layerwise calibration. +quantize: + algorithm: + method: max + # Max calibration is fast and does not typically need checkpointing. 
+ layerwise: true + quant_cfg: + - $import: base_disable_all + - quantizer_name: '*mlp.experts*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*mlp.experts*input_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*block_sparse_moe*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*block_sparse_moe*input_quantizer' + cfg: + $import: nvfp4 + - $import: kv_fp8 + - $import: default_disabled_quantizers diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index a5c8ccaf479..738dfc268ca 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -136,6 +136,7 @@ def test_load_recipe_builtin_description(): "general/ptq/nvfp4_default-kv_nvfp4_cast", "general/ptq/nvfp4_default-kv_none-gptq", "general/ptq/nvfp4_experts_only-kv_fp8", + "general/ptq/nvfp4_experts_only-kv_fp8_layerwise", "general/ptq/nvfp4_mlp_only-kv_fp8", "general/ptq/nvfp4_omlp_only-kv_fp8", ] diff --git a/tests/unit/torch/quantization/test_config_validation.py b/tests/unit/torch/quantization/test_config_validation.py index f5b1e576f5e..84306dc5116 100644 --- a/tests/unit/torch/quantization/test_config_validation.py +++ b/tests/unit/torch/quantization/test_config_validation.py @@ -25,6 +25,7 @@ INT4_AWQ_CFG, NVFP4_DEFAULT_CFG, W4A8_AWQ_BETA_CFG, + MaxCalibConfig, QuantizeConfig, find_quant_cfg_entry_by_path, need_calibration, @@ -525,3 +526,35 @@ def test_validate_quant_cfg_entries_accepts_valid_cfg(self): algorithm="max", ) assert len(cfg.quant_cfg) == 2 + + +class TestLayerwiseUseSequentialAlias: + """`layerwise` accepts the legacy `use_sequential` name via validation_alias. + + Old PTQ checkpoints serialized the field as `use_sequential` before #1251 renamed + it to `layerwise`. AliasChoices lets those checkpoints load without a migration + validator while still serializing under the current name. 
+ """ + + def test_use_sequential_true_sets_layerwise(self): + cfg = MaxCalibConfig(use_sequential=True) + assert cfg.layerwise is True + + def test_use_sequential_false_sets_layerwise(self): + cfg = MaxCalibConfig(use_sequential=False) + assert cfg.layerwise is False + + def test_layerwise_name_still_accepted(self): + cfg = MaxCalibConfig(layerwise=True) + assert cfg.layerwise is True + + def test_serializes_under_current_name(self): + """Dump must use `layerwise`, not the legacy alias.""" + dumped = MaxCalibConfig(use_sequential=True).model_dump() + assert dumped["layerwise"] is True + assert "use_sequential" not in dumped + + def test_unknown_field_still_rejected(self): + """extra='forbid' must still reject unrelated unknown fields.""" + with pytest.raises(ValidationError): + MaxCalibConfig(not_a_real_field=True) From 8f142fffe4ac11e1f623b47381077c34719e4c6e Mon Sep 17 00:00:00 2001 From: realAsma Date: Fri, 24 Apr 2026 22:06:33 +0000 Subject: [PATCH 2/2] fix: address review feedback on layerwise detection + header + input validation - examples/llm_ptq/hf_ptq.py: replace dict-inspection layerwise detection with a small recursive helper accepting ModelOptPTQRecipe directly, handling list-form QuantizeAlgoCfgType (per coderabbitai, jenchen13). - examples/llm_ptq/hf_ptq.py: convert recipe-type assert to explicit if/raise TypeError so validation is not stripped under python -O (per cjluo-nv). - modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8_layerwise.yaml: bump new-file copyright header to 2026 per LICENSE_HEADER (per cjluo-nv). 
Signed-off-by: realAsma --- examples/llm_ptq/hf_ptq.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 71add909145..875e78ceea6 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -993,12 +993,19 @@ def quantize_main( if args.recipe is not None and not args.auto_quantize_bits: print(f"Use recipe {args.recipe} for quantization") recipe = load_recipe(args.recipe) - assert isinstance(recipe, ModelOptPTQRecipe), ( - f"Expected PTQ recipe, but got {type(recipe).__name__} from {args.recipe}" - ) + if not isinstance(recipe, ModelOptPTQRecipe): + raise TypeError( + f"Expected PTQ recipe, but got {type(recipe).__name__} from {args.recipe}" + ) + + def _is_layerwise(obj): + if isinstance(obj, ModelOptPTQRecipe): + return _is_layerwise(obj.quantize.algorithm) + if isinstance(obj, list): + return any(_is_layerwise(a) for a in obj) + return bool(getattr(obj, "layerwise", False)) - recipe_algorithm = recipe.quantize.model_dump().get("algorithm") if recipe else None - is_layerwise = isinstance(recipe_algorithm, dict) and recipe_algorithm.get("layerwise", False) + is_layerwise = _is_layerwise(recipe) if args.batch_size == 0: # For VL models with image-text calibration, skip automatic batch size detection