|
15 | 15 | """Export HuggingFace model to vLLM fakequant checkpoint.""" |
16 | 16 |
|
17 | 17 | import copy |
| 18 | +import re |
18 | 19 | from pathlib import Path |
19 | 20 | from typing import Any |
20 | 21 |
|
|
29 | 30 | from modelopt.torch.quantization.utils import get_quantizer_state_dict |
30 | 31 | from modelopt.torch.utils import get_unwrapped_name, safe_save |
31 | 32 |
|
32 | | -from ..hf_vllm_quantizer_merge import is_weight_quantizer_state_key |
33 | 33 | from ..layer_utils import get_experts_list, is_moe |
34 | 34 | from ..quant_utils import get_quantization_format |
35 | 35 |
|
36 | | -__all__ = ["export_hf_vllm_fq_checkpoint", "is_weight_quantizer_state_key"] |
# Public surface of this module; sorted alphabetically.
__all__ = [
    "export_hf_vllm_fq_checkpoint",
    "is_weight_quantizer_state_key",
    "merge_amax_tensors_for_vllm_group",
]
| 41 | + |
# Recognizes state keys whose final path component is a weight quantizer:
# a bare ``weight_quantizer``, a prefixed variant such as ``w13_weight_quantizer``,
# and SequentialQuantizer children like ``weight_quantizer.0`` (any depth of
# trailing numeric components).
_WEIGHT_QUANTIZER_STATE_KEY = re.compile(r"(?:^|\.)(?:\w+_)?weight_quantizer(?:\.\d+)*$")


def is_weight_quantizer_state_key(key: str) -> bool:
    """Check whether *key* addresses a weight quantizer's state.

    Accepts plain names (``weight_quantizer``), prefixed names
    (``w13_weight_quantizer``), and SequentialQuantizer sub-entries
    (``weight_quantizer.0``), at any dotted-path depth.
    """
    return _WEIGHT_QUANTIZER_STATE_KEY.search(key) is not None
| 52 | + |
| 53 | + |
def merge_amax_tensors_for_vllm_group(tensors: list[torch.Tensor]) -> torch.Tensor:
    """Fold a merge group's ``_amax`` buffers into one tensor.

    Needed when HuggingFace module names collapse onto a single vLLM module
    (e.g. q/k/v projections folding into ``qkv_proj``).

    Strategy, in order:
      1. All shapes equal -> element-wise maximum across the group (safe when
         each branch used the same axis layout).
      2. Shapes differ (e.g. GQA q vs. k) -> attempt ``torch.cat`` along dim 0,
         which is valid for per-channel amax.
      3. Concatenation impossible -> reduce to a single scalar maximum over
         every element of every tensor.

    The result is cast back to the first tensor's dtype and device.

    Raises:
        ValueError: if *tensors* is empty.
    """
    if not tensors:
        raise ValueError("merge_amax_tensors_for_vllm_group: expected at least one tensor")
    if len(tensors) == 1:
        return tensors[0]

    ref = tensors[0]
    # Case 1: identical layouts -> conservative element-wise max.
    if all(t.shape == ref.shape for t in tensors[1:]):
        merged = torch.stack([t.float() for t in tensors]).amax(dim=0)
        return merged.to(dtype=ref.dtype, device=ref.device)

    # Case 2: heterogeneous shapes -> per-channel concat when torch allows it.
    try:
        concatenated = torch.cat(tensors, dim=0)
    except RuntimeError:
        # Case 3: incompatible ranks/shapes -> global scalar max.
        flattened = torch.cat([t.float().reshape(-1) for t in tensors])
        return flattened.max().to(dtype=ref.dtype, device=ref.device)
    return concatenated.to(dtype=ref.dtype, device=ref.device)
37 | 79 |
|
38 | 80 |
|
39 | 81 | def disable_rotate(quantizer: TensorQuantizer): |
@@ -217,7 +259,7 @@ def export_hf_vllm_fq_checkpoint( |
217 | 259 | if ( |
218 | 260 | hasattr(inp_q, "_pre_quant_scale") |
219 | 261 | and inp_q._pre_quant_scale is not None |
220 | | - and inp_q._disabled |
| 262 | + and not inp_q.is_enabled |
221 | 263 | ): |
222 | 264 | scale = inp_q._pre_quant_scale.squeeze().to(device=w_quant.device) |
223 | 265 | w_quant = (w_quant * scale[None, :]).to(w_quant.dtype) |
|
0 commit comments