1717import copy
1818import logging
1919import re
20+ import warnings
2021from collections .abc import Callable
2122from pathlib import Path
2223from typing import Any
@@ -121,11 +122,12 @@ def _check_all_weight_quantizers_disabled(model: nn.Module) -> None:
121122 if attr_name .endswith ("weight_quantizer" ) and isinstance (
122123 quantizer , (TensorQuantizer , SequentialQuantizer )
123124 ):
124- assert not quantizer .is_enabled , (
125- f"vLLM fakequant export: { attr_name !r} must be disabled before saving "
126- f"quantizer_state (weights already folded). "
127- f"See filter_modelopt_state_quantizer_state_for_model in vllm_reload_utils."
128- )
125+ if quantizer .is_enabled :
126+ raise RuntimeError (
127+ f"vLLM fakequant export: { attr_name !r} must be disabled before saving "
128+ f"quantizer_state (weights already folded). "
129+ f"See filter_modelopt_state_quantizer_state_for_model in vllm_reload_utils."
130+ )
129131
130132
131133def disable_rotate (quantizer : TensorQuantizer ):
@@ -171,25 +173,27 @@ def requant_weights_for_export(
171173 ``w`` (e.g. CPU offload).
172174 """
173175 copied = copy .deepcopy (quantizer ).to (device = weight .device )
174- sequence_quantizers : list [TensorQuantizer ] = (
176+ quantizers : list [TensorQuantizer ] = (
175177 list (copied ) if isinstance (copied , SequentialQuantizer ) else [copied ]
176178 )
177179
178- for quantizer_copy in sequence_quantizers :
180+ for quantizer_copy in quantizers :
179181 quantizer_copy .eval ()
180182 quantizer_copy .reset_amax ()
181183 enable_stats_collection (quantizer_copy )
182184 # Match legacy single-quantizer path: first calib uses ``w`` as-is; chains feed ``w`` through each quantizer in turn.
183- if len (sequence_quantizers ) == 1 :
184- weight_quantized = sequence_quantizers [0 ](weight )
185+ if len (quantizers ) == 1 :
186+ weight_quantized = quantizers [0 ](weight )
185187 else :
186- weight_quantized = weight . float ()
187- for quantizer_copy in sequence_quantizers :
188+ weight_quantized = weight
189+ for quantizer_copy in quantizers :
188190 weight_quantized = quantizer_copy (weight_quantized )
189- for quantizer_copy in sequence_quantizers :
191+ for quantizer_copy in quantizers :
190192 finish_stats_collection (quantizer_copy )
191- weight_quantized = weight .float ()
192- for quantizer_copy in sequence_quantizers :
193+ # Re-run application pass to get the quantized output with the freshly collected amax.
194+ # The calibration forward above only collected stats; its output is intentionally discarded.
195+ weight_quantized = weight
196+ for quantizer_copy in quantizers :
193197 weight_quantized = quantizer_copy (weight_quantized )
194198 return weight_quantized .to (weight .dtype )
195199
@@ -219,6 +223,12 @@ def merge_amax_tensors_for_group(tensors: list[torch.Tensor]) -> torch.Tensor:
219223 try :
220224 return torch .cat (tensors , dim = 0 ).to (dtype = first .dtype , device = first .device )
221225 except RuntimeError :
226+ shapes = [tuple (t .shape ) for t in tensors ]
227+ warnings .warn (
228+ f"merge_amax_tensors_for_group: torch.cat failed for shapes { shapes } ; "
229+ "falling back to scalar max which loses per-channel amax structure." ,
230+ stacklevel = 2 ,
231+ )
222232 flat = torch .cat ([t .reshape (- 1 ).float () for t in tensors ])
223233 return torch .max (flat ).to (dtype = first .dtype , device = first .device )
224234
@@ -258,7 +268,9 @@ def _process_group(modules: list[nn.Module]) -> None:
258268 if pqs_list is None :
259269 return
260270
261- avg_pqs = torch .stack (pqs_list ).mean (0 )
271+ # Mean and clamp in float32: in fp16/bf16, float32.tiny would underflow to 0, causing a later divide-by-zero.
272+ pqs_dtype = pqs_list [0 ].dtype
273+ avg_pqs = torch .stack ([p .float () for p in pqs_list ]).mean (0 )
262274 avg_pqs = avg_pqs .clamp (min = torch .finfo (torch .float32 ).tiny )
263275
264276 for m in modules :
@@ -270,8 +282,8 @@ def _process_group(modules: list[nn.Module]) -> None:
270282 if torch .equal (old_pqs , avg_pqs_dev ):
271283 continue
272284 weight = state_dict [f"{ nm } .weight" ]
273- ratio = old_pqs .to (dtype = torch .float32 , device = weight .device ) / avg_pqs_dev .to (
274- dtype = torch . float32 , device = weight .device
285+ ratio = old_pqs .to (dtype = torch .float32 , device = weight .device ) / avg_pqs .to (
286+ device = weight .device
275287 )
276288 state_dict [f"{ nm } .weight" ] = (weight .to (torch .float32 ) * ratio ).to (weight .dtype )
277289 requant_weights .add (f"{ nm } .weight" )
@@ -281,7 +293,7 @@ def _process_group(modules: list[nn.Module]) -> None:
281293 if all (a is not None for a in amaxes ):
282294 synced_amax = merge_amax_tensors_for_group (amaxes )
283295
284- avg_pqs_out = avg_pqs .detach ().clone ()
296+ avg_pqs_out = avg_pqs .detach ().to ( pqs_dtype ). clone ()
285297 for m in modules :
286298 nm = id_to_name .get (id (m ))
287299 if nm is None :
@@ -309,14 +321,15 @@ def _process_group(modules: list[nn.Module]) -> None:
309321
310322 def _dummy_forward () -> None :
311323 # Partial forward is OK: hooks record layers reached before failure.
312- try :
313- model (torch .ones ([1 , 2 ], dtype = torch .long , device = dev ))
314- except Exception as e :
315- import logging
324+ with torch .inference_mode ():
325+ try :
326+ model (torch .ones ([1 , 2 ], dtype = torch .long , device = dev ))
327+ except Exception as e :
328+ import logging
316329
317- logging .getLogger (__name__ ).debug (
318- "Dummy forward for shared-input detection failed (expected for VLMs): %s" , e
319- )
330+ logging .getLogger (__name__ ).debug (
331+ "Dummy forward for shared-input detection failed (expected for VLMs): %s" , e
332+ )
320333
321334 input_to_linear , _ = collect_shared_input_modules (model , _dummy_forward )
322335 for modules in input_to_linear .values ():
@@ -380,9 +393,8 @@ def export_hf_vllm_fq_checkpoint(
380393 weight_name = attr_name .removesuffix ("_quantizer" )
381394 prefix = f"{ module_name } ." if module_name else ""
382395 sd_key = f"{ prefix } { weight_name } "
383- assert sd_key not in fakequant_weights , (
384- f"Weight { sd_key } has already been fakequantized"
385- )
396+ if sd_key in fakequant_weights :
397+ raise RuntimeError (f"Weight { sd_key } has already been fakequantized" )
386398 if sd_key in state_dict :
387399 w = state_dict [sd_key ]
388400 if sd_key in requant_weights :
@@ -419,74 +431,75 @@ def export_hf_vllm_fq_checkpoint(
419431 # Rotation is also cleared: the weight was already folded with rotation applied,
420432 # so if fold_weight is called on reload it must not re-rotate the exported weight.
421433 wqs_to_restore : list [tuple [TensorQuantizer , Any ]] = []
422- for _ , module in model .named_modules ():
423- if isinstance (module , QuantModule ):
424- for attr_name , quantizer in module .named_children ():
425- if not (attr_name .endswith ("weight_quantizer" ) and quantizer .is_enabled ):
426- continue
427- if isinstance (quantizer , SequentialQuantizer ):
428- quantizer .disable ()
429- for sub in quantizer :
430- orig_rotate = sub ._rotate
431- if sub .rotate_is_enabled :
432- sub ._rotate = disable_rotate (sub )
433- wqs_to_restore .append ((sub , orig_rotate ))
434- elif isinstance (quantizer , TensorQuantizer ):
435- quantizer .disable ()
436- orig_rotate = quantizer ._rotate
437- if quantizer .rotate_is_enabled :
438- quantizer ._rotate = disable_rotate (quantizer )
439- wqs_to_restore .append ((quantizer , orig_rotate ))
440-
441- quantizer_state_dict = get_quantizer_state_dict (model )
442- for key in list (quantizer_state_dict ):
443- if is_weight_quantizer_state_key (key ):
444- # Fakequant amax is folded into HF weights; do not reload weight quantizer tensors.
445- # Reload must force-disable WQs missing from saved state (see
446- # ``filter_modelopt_state_quantizer_state_for_model`` assertion in vllm_reload_utils).
447- quantizer_state_dict .pop (key )
448- elif key in input_quantizers_folded_pqs :
449- # pre_quant_scale was folded into the weight; keep the buffer for strict load but
450- # save identity so activations are not scaled twice.
451- qstate_val = quantizer_state_dict [key ]
452- if isinstance (qstate_val , dict ) and "_pre_quant_scale" in qstate_val :
453- quantizer_state_dict [key ]["_pre_quant_scale" ] = torch .ones_like (
454- qstate_val ["_pre_quant_scale" ]
455- )
456-
457- # Patch input quantizers with averaged pqs and unified amax so that vLLM's single
458- # per-group input quantizer sees consistent values (covers both dense qkv and MoE experts).
459- for iq_key , (avg_pqs , max_input_amax ) in pqs_overrides .items ():
460- if iq_key in quantizer_state_dict :
461- qstate_val = quantizer_state_dict [iq_key ]
462- if isinstance (qstate_val , dict ):
463- if "_pre_quant_scale" in qstate_val :
464- qstate_val ["_pre_quant_scale" ] = avg_pqs
465- if max_input_amax is not None and "_amax" in qstate_val :
466- qstate_val ["_amax" ] = max_input_amax
467-
468- modelopt_state = mto .modelopt_state (model )
469- # ``modelopt_state`` may be stale if another mode (e.g. calibrate) ran last. Rebuild
470- # ``quantizer_state`` and strip weight-quantizer entries (same policy as
471- # ``modelopt_state_weights``). Reload synthesizes missing WQ rows with ``_disabled``.
472- _check_all_weight_quantizers_disabled (model )
473- qstate = quantizer_state (model )
474- for key in list (qstate ):
475- if is_weight_quantizer_state_key (key ):
476- qstate .pop (key )
477-
478- for mode_str , m_state in modelopt_state .get ("modelopt_state_dict" , []):
479- if mode_str == "quantize" and "metadata" in m_state :
480- m_state ["metadata" ]["quantizer_state" ] = qstate
481- break
482-
483- # Per-quantizer tensor dict loaded alongside metadata on reload.
484- modelopt_state ["modelopt_state_weights" ] = quantizer_state_dict
485- safe_save (modelopt_state , export_dir / "vllm_fq_modelopt_state.pth" )
486-
487- # Step 3: Save HF weights using the pre-built folded state dict.
488- model .save_pretrained (export_dir , state_dict = clean_sd , save_modelopt_state = False )
489-
490- for wq , orig_rotate in wqs_to_restore :
491- wq .enable ()
492- wq ._rotate = orig_rotate
434+ try :
435+ for _ , module in model .named_modules ():
436+ if isinstance (module , QuantModule ):
437+ for attr_name , quantizer in module .named_children ():
438+ if not (attr_name .endswith ("weight_quantizer" ) and quantizer .is_enabled ):
439+ continue
440+ if isinstance (quantizer , SequentialQuantizer ):
441+ quantizer .disable ()
442+ for sub in quantizer :
443+ orig_rotate = sub ._rotate
444+ if sub .rotate_is_enabled :
445+ sub ._rotate = disable_rotate (sub )
446+ wqs_to_restore .append ((sub , orig_rotate ))
447+ elif isinstance (quantizer , TensorQuantizer ):
448+ quantizer .disable ()
449+ orig_rotate = quantizer ._rotate
450+ if quantizer .rotate_is_enabled :
451+ quantizer ._rotate = disable_rotate (quantizer )
452+ wqs_to_restore .append ((quantizer , orig_rotate ))
453+
454+ quantizer_state_dict = get_quantizer_state_dict (model )
455+ for key in list (quantizer_state_dict ):
456+ if is_weight_quantizer_state_key (key ):
457+ # Fakequant amax is folded into HF weights; do not reload weight quantizer tensors.
458+ # Reload must force-disable WQs missing from saved state (see
459+ # ``filter_modelopt_state_quantizer_state_for_model`` assertion in vllm_reload_utils).
460+ quantizer_state_dict .pop (key )
461+ elif key in input_quantizers_folded_pqs :
462+ # pre_quant_scale was folded into the weight; keep the buffer for strict load but
463+ # save identity so activations are not scaled twice.
464+ qstate_val = quantizer_state_dict [key ]
465+ if isinstance (qstate_val , dict ) and "_pre_quant_scale" in qstate_val :
466+ quantizer_state_dict [key ]["_pre_quant_scale" ] = torch .ones_like (
467+ qstate_val ["_pre_quant_scale" ]
468+ )
469+
470+ # Patch input quantizers with averaged pqs and unified amax so that vLLM's single
471+ # per-group input quantizer sees consistent values (covers both dense qkv and MoE experts).
472+ for iq_key , (avg_pqs , max_input_amax ) in pqs_overrides .items ():
473+ if iq_key in quantizer_state_dict :
474+ qstate_val = quantizer_state_dict [iq_key ]
475+ if isinstance (qstate_val , dict ):
476+ if "_pre_quant_scale" in qstate_val :
477+ qstate_val ["_pre_quant_scale" ] = avg_pqs
478+ if max_input_amax is not None and "_amax" in qstate_val :
479+ qstate_val ["_amax" ] = max_input_amax
480+
481+ modelopt_state = mto .modelopt_state (model )
482+ # ``modelopt_state`` may be stale if another mode (e.g. calibrate) ran last. Rebuild
483+ # ``quantizer_state`` and strip weight-quantizer entries (same policy as
484+ # ``modelopt_state_weights``). Reload synthesizes missing WQ rows with ``_disabled``.
485+ _check_all_weight_quantizers_disabled (model )
486+ qstate = quantizer_state (model )
487+ for key in list (qstate ):
488+ if is_weight_quantizer_state_key (key ):
489+ qstate .pop (key )
490+
491+ for mode_str , m_state in modelopt_state .get ("modelopt_state_dict" , []):
492+ if mode_str == "quantize" and "metadata" in m_state :
493+ m_state ["metadata" ]["quantizer_state" ] = qstate
494+ break
495+
496+ # Per-quantizer tensor dict loaded alongside metadata on reload.
497+ modelopt_state ["modelopt_state_weights" ] = quantizer_state_dict
498+ safe_save (modelopt_state , export_dir / "vllm_fq_modelopt_state.pth" )
499+
500+ # Step 3: Save HF weights using the pre-built folded state dict.
501+ model .save_pretrained (export_dir , state_dict = clean_sd , save_modelopt_state = False )
502+ finally :
503+ for wq , orig_rotate in wqs_to_restore :
504+ wq .enable ()
505+ wq ._rotate = orig_rotate
0 commit comments