Skip to content

Commit 5a031ac

Browse files
committed
cleanup
Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
1 parent 96689fc commit 5a031ac

3 files changed

Lines changed: 33 additions & 34 deletions

File tree

examples/vllm_serve/fakequant_worker.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,11 @@ def _fakequant_run_prolog_worker(self) -> None:
144144

145145
mtq.fold_weight(model)
146146
for name, module in model.named_modules():
147-
if is_weight_quantizer_state_key(name):
148-
assert not module.is_enabled, f"quantizer {name} is still enabled"
147+
if is_weight_quantizer_state_key(name) and module.is_enabled:
148+
raise RuntimeError(
149+
f"Weight quantizer {name!r} is still enabled after fold_weight — "
150+
"double-quantization would corrupt activations."
151+
)
149152

150153

151154
class FakeQuantWorker(BaseWorker):

examples/vllm_serve/vllm_reload_utils.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,7 @@
2323
import torch
2424
from vllm.distributed.parallel_state import get_tp_group
2525

26-
from modelopt.torch.export.plugins.vllm_fakequant_hf import (
27-
is_weight_quantizer_state_key,
28-
merge_amax_tensors_for_vllm_group,
29-
)
26+
from modelopt.torch.export.plugins.vllm_fakequant_hf import is_weight_quantizer_state_key
3027
from modelopt.torch.opt.conversion import (
3128
ModelLikeModule,
3229
ModeloptStateManager,
@@ -163,6 +160,33 @@ def _group_keys_for_vllm(
163160
return vllm_state_dict, merge_groups
164161

165162

163+
def merge_amax_tensors_for_vllm_group(tensors: list[torch.Tensor]) -> torch.Tensor:
    """Reduce a merge group's ``_amax`` buffers to a single tensor.

    Invoked when HuggingFace module names are folded into fused vLLM names
    (e.g. q/k/v projections merging into ``qkv_proj``).

    Strategy:
      * identical shapes  -> element-wise max across the group (conservative
        when every branch carried the same axis layout);
      * mismatched shapes (e.g. GQA q vs k) -> concatenate along dim 0, which
        is valid for per-channel amax; if concat is impossible, collapse to a
        scalar max over all elements.
    """
    if not tensors:
        raise ValueError("merge_amax_tensors_for_vllm_group: expected at least one tensor")
    if len(tensors) == 1:
        return tensors[0]

    reference = tensors[0]
    if all(t.shape == reference.shape for t in tensors):
        # Promote to float for a safe reduction, then restore dtype/device.
        piled = torch.stack([t.float() for t in tensors], dim=0)
        return piled.amax(dim=0).to(dtype=reference.dtype, device=reference.device)

    try:
        return torch.cat(tensors, dim=0).to(dtype=reference.dtype, device=reference.device)
    except RuntimeError:
        # Shapes are not concat-compatible: fall back to one global scalar max.
        flattened = torch.cat([t.reshape(-1).float() for t in tensors])
        return flattened.max().to(dtype=reference.dtype, device=reference.device)
188+
189+
166190
def _merge_values_by_max_or_concat(merged_key: str, key_value_pairs: list[tuple[str, Any]]) -> Any:
167191
"""
168192
Merge values by taking max for amax, concatenating for others.

modelopt/torch/export/plugins/vllm_fakequant_hf.py

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
__all__ = [
3737
"export_hf_vllm_fq_checkpoint",
3838
"is_weight_quantizer_state_key",
39-
"merge_amax_tensors_for_vllm_group",
4039
]
4140

4241
# Matches ``…weight_quantizer``, ``…weight_quantizer.0``, ``…w13_weight_quantizer.0``, etc.
@@ -51,33 +50,6 @@ def is_weight_quantizer_state_key(key: str) -> bool:
5150
return bool(_WEIGHT_QUANTIZER_STATE_KEY.search(key))
5251

5352

54-
def merge_amax_tensors_for_vllm_group(tensors: list[torch.Tensor]) -> torch.Tensor:
55-
"""Combine `_amax` buffers from a merge group into a single tensor.
56-
57-
Used when HuggingFace module names are folded to vLLM names (e.g. q/k/v → qkv_proj).
58-
59-
- If every tensor has the same shape, take the element-wise maximum over the group
60-
(conservative when each branch carried the same axis layout).
61-
- If shapes differ (e.g. GQA q vs k), try ``torch.cat(..., dim=0)`` when valid for
62-
per-channel amax; otherwise fall back to a scalar max over all elements.
63-
"""
64-
if not tensors:
65-
raise ValueError("merge_amax_tensors_for_vllm_group: expected at least one tensor")
66-
if len(tensors) == 1:
67-
return tensors[0]
68-
69-
first = tensors[0]
70-
if all(t.shape == first.shape for t in tensors):
71-
stacked = torch.stack([t.float() for t in tensors], dim=0)
72-
return torch.amax(stacked, dim=0).to(dtype=first.dtype, device=first.device)
73-
74-
try:
75-
return torch.cat(tensors, dim=0).to(dtype=first.dtype, device=first.device)
76-
except RuntimeError:
77-
flat = torch.cat([t.reshape(-1).float() for t in tensors])
78-
return torch.max(flat).to(dtype=first.dtype, device=first.device)
79-
80-
8153
def disable_rotate(quantizer: TensorQuantizer):
8254
"""Return a disabled copy of the quantizer's ``_rotate`` field, preserving its type."""
8355
if isinstance(quantizer._rotate, RotateConfig):

0 commit comments

Comments (0)