Skip to content

Commit c9b7f82

Browse files
committed
Add offload test for vllm fakequant export; CHANGELOG entry
Adds test_hf_vllm_export_offload covering the inplace_mem_efficient=True path of export_hf_vllm_fq_checkpoint on a CPU-offloaded tiny LLaMA. The test asserts the inplace path actually mutates offloaded layer weights (falsifying a silent fall-through to the copy path), that the reloaded HF model matches a deepcopy+fold_weight reference built inside enable_weight_access_and_writeback (materializes meta tensors before folding), and that the saved quantizer state preserves input amaxes. Also adds a CHANGELOG.rst bullet under 0.44 New Features describing the layerwise calibration feature and linking to the experts-only recipe. Signed-off-by: realAsma <akuriparambi@nvidia.com>
1 parent 5658381 commit c9b7f82

File tree

2 files changed

+121
-1
lines changed

2 files changed

+121
-1
lines changed

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ Changelog
1515
- Enable PTQ workflow for the Step3.5-Flash MoE model with NVFP4 W4A4 + FP8 KV cache quantization. See `modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml <https://github.com/NVIDIA/Model-Optimizer/blob/main/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml>`_ for more details.
1616
- Add support for vLLM fakequant reload using ModelOpt state for HF models. See `examples/vllm_serve/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/vllm_serve#load-qatptq-model-and-serve-in-vllm-wip>`_ for more details.
1717
- [Early Testing] Add Claude Code PTQ skill (``.claude/skills/ptq/``) for agent-assisted post-training quantization. The skill guides the agent through environment detection, model support checking, format selection, and execution via the launcher or manual SLURM/Docker/bare GPU paths. Includes handling for unlisted models with custom module patching. This feature is in early testing — use with caution.
18+
- Add performant layerwise calibration for large models that don't fit on GPU (e.g. DeepSeek-R1, Kimi-K2). Each decoder layer is materialized once per calibration step instead of per-batch, enabling larger batch sizes during PTQ. Includes per-layer checkpoint save/resume so calibration can survive cluster time limits. See `modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml <https://github.com/NVIDIA/Model-Optimizer/blob/main/modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml>`_ for usage.
1819

1920
**Backward Breaking Changes**
2021

tests/gpu/torch/export/test_vllm_fakequant_hf_export.py

Lines changed: 120 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,19 @@
1212
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
15+
import copy
1516
from copy import deepcopy
1617

1718
import pytest
1819
import torch
1920
from _test_utils.torch.transformers_models import create_tiny_llama_dir
20-
from transformers import AutoModelForCausalLM
21+
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
22+
from transformers import AutoConfig, AutoModelForCausalLM
2123

2224
import modelopt.torch.quantization as mtq
2325
from modelopt.torch.export import export_hf_vllm_fq_checkpoint
2426
from modelopt.torch.quantization.model_quant import fold_weight
27+
from modelopt.torch.quantization.utils import enable_weight_access_and_writeback
2528
from modelopt.torch.utils import safe_load
2629

2730

@@ -111,3 +114,119 @@ def forward_loop(model):
111114
"_amax" in k for k in quantizer_state_dict_before[name]
112115
):
113116
assert any("_amax" in k for k in state), f"input quantizer {name} should preserve _amax"
117+
118+
119+
def _make_cpu_offloaded_model(tmp_path, num_hidden_layers=3):
    """Create a tiny LLaMA model with ``model.layers.0`` offloaded to CPU via accelerate.

    Builds a tiny checkpoint with ``create_tiny_llama_dir``, instantiates the
    architecture on the meta device (``init_empty_weights``), then dispatches the
    real weights with ``load_checkpoint_and_dispatch`` so every module lives on
    GPU 0 except the first decoder layer, which stays on CPU.

    Args:
        tmp_path: Directory in which the tiny checkpoint is written.
        num_hidden_layers: Number of decoder layers for the tiny model.

    Returns:
        Tuple ``(model, config, tiny_llama_dir)`` — the dispatched model, its
        ``AutoConfig``, and the checkpoint directory path.
    """
    tiny_llama_dir = create_tiny_llama_dir(tmp_path, num_hidden_layers=num_hidden_layers)
    config = AutoConfig.from_pretrained(tiny_llama_dir)

    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(config)

    # Map every non-layer module and each whole decoder layer ("model.layers.N")
    # to GPU 0; submodules inside a layer inherit the device of their parent
    # entry in accelerate's device_map. (Fixed: module objects were bound to an
    # unused variable `m` — only the names are needed.)
    device_map = {
        n: 0
        for n, _ in model.named_modules()
        if "layers" not in n or n.split("layers.")[-1].isdigit()
    }
    # Override: keep the first decoder layer on CPU to exercise offloading.
    device_map["model.layers.0"] = "cpu"

    model = load_checkpoint_and_dispatch(model, tiny_llama_dir, device_map=device_map)
    return model, config, tiny_llama_dir
136+
137+
138+
def _make_layerwise_cfg(base_cfg):
139+
"""Add layerwise=True to a quant config's algorithm field."""
140+
cfg = copy.deepcopy(base_cfg)
141+
algo = cfg.get("algorithm", "max")
142+
if isinstance(algo, str):
143+
cfg["algorithm"] = {"method": algo, "layerwise": True}
144+
else:
145+
algo["layerwise"] = True
146+
return cfg
147+
148+
149+
@pytest.mark.parametrize("quant_cfg", [mtq.FP8_DEFAULT_CFG])
def test_hf_vllm_export_offload(tmp_path, quant_cfg):
    """Test ``inplace_mem_efficient=True`` export path on a CPU-offloaded model.

    Mirrors ``test_hf_vllm_export`` but uses a CPU-offloaded model with layerwise
    calibration. Skips the "model not mutated" assertion since the inplace path
    is intentionally destructive.
    """
    num_hidden_layers = 3

    # Test model: CPU-offloaded, layerwise calibration
    model, _config, tiny_llama_dir = _make_cpu_offloaded_model(
        tmp_path / "offloaded", num_hidden_layers=num_hidden_layers
    )
    model.eval()

    # Wrap the base quant config so calibration runs layer-by-layer, matching
    # the offloaded-model setup.
    seq_cfg = _make_layerwise_cfg(quant_cfg)

    def forward_loop(model):
        # Single random batch is enough to populate calibration amaxes.
        input_ids = torch.randint(0, model.config.vocab_size, (1, 128)).cuda()
        with torch.no_grad():
            model(input_ids)

    model = mtq.quantize(model, seq_cfg, forward_loop)
    # Snapshot quantizer state BEFORE export so input-quantizer amaxes can be
    # compared against the saved state at the end of the test.
    quantizer_state_dict_before = mtq.utils.get_quantizer_state_dict(model)

    # Build the expected (folded) weights on an independent deepcopy.
    # enable_weight_access_and_writeback materializes the offloaded/meta layer-0
    # tensors so fold_weight and state_dict see real values.
    folded_model = deepcopy(model)
    with enable_weight_access_and_writeback(folded_model.model.layers[0], folded_model):
        fold_weight(folded_model)
        expected_weights = {
            k: v.detach().clone()
            for k, v in folded_model.state_dict().items()
            if "quantizer" not in k
        }
    del folded_model

    export_dir = tmp_path / "vllm_export_offload"
    export_dir.mkdir(exist_ok=True)

    # Snapshot the offloaded layer's weight before/after export to verify the
    # inplace_mem_efficient path actually mutates offloaded weights (would otherwise
    # be unfalsifiable if the function silently took the copy path).
    with enable_weight_access_and_writeback(model.model.layers[0], model):
        weight_before = model.model.layers[0].self_attn.q_proj.weight.data.clone()

    export_hf_vllm_fq_checkpoint(model, export_dir=export_dir, inplace_mem_efficient=True)

    with enable_weight_access_and_writeback(model.model.layers[0], model):
        weight_after = model.model.layers[0].self_attn.q_proj.weight.data.clone()
    # Fold must have changed the offloaded layer's weight in place.
    assert not torch.equal(weight_before, weight_after), (
        "inplace path must mutate offloaded layer weights"
    )

    # The fakequant export saves the modelopt state alongside the checkpoint...
    modelopt_state_file = export_dir / "vllm_fq_modelopt_state.pth"
    assert modelopt_state_file.exists(), (
        f"vllm_fq_modelopt_state.pth file should be created in {export_dir}"
    )

    # ...but must NOT emit the real-quant hf_quant_config.json.
    hf_quant_config_file = export_dir / "hf_quant_config.json"
    assert not hf_quant_config_file.exists(), (
        f"hf_quant_config.json file should not be created in {export_dir}"
    )

    # Reload the exported checkpoint as a plain HF model and compare its weights
    # against the deepcopy+fold_weight reference built above.
    model_after = AutoModelForCausalLM.from_pretrained(export_dir).cuda()
    model_after.eval()
    model_after_state_dict = model_after.state_dict()
    for key, param in expected_weights.items():
        assert torch.allclose(param, model_after_state_dict[key], atol=1e-6), (
            f"Weight mismatch for {key}: "
            f"before shape={param.shape}, after shape={model_after_state_dict[key].shape}, "
            f"max diff={torch.abs(param - model_after_state_dict[key]).max()}"
        )

    # Saved quantizer state: weight quantizers are folded away (empty state),
    # while input quantizers that had an amax before export must keep it.
    quantizer_state_dict = safe_load(modelopt_state_file)["modelopt_state_weights"]
    assert len(quantizer_state_dict) > 0, (
        f"modelopt_state_weights should not be empty in {modelopt_state_file}"
    )
    for name, state in quantizer_state_dict.items():
        if "weight_quantizer" in name:
            assert state == {}, f"weight quantizer {name} should have empty state after fold"
        elif "input_quantizer" in name and any(
            "_amax" in k for k in quantizer_state_dict_before[name]
        ):
            assert any("_amax" in k for k in state), f"input quantizer {name} should preserve _amax"

0 commit comments

Comments
 (0)