Skip to content

Commit 6806f16

Browse files
committed
minor
Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
1 parent e00e8a6 commit 6806f16

File tree

2 files changed

+35
-31
lines changed

2 files changed

+35
-31
lines changed

examples/vllm_serve/vllm_reload_utils.py

Lines changed: 6 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,10 @@
2323
import torch
2424
from vllm.distributed.parallel_state import get_tp_group
2525

26-
from modelopt.torch.export.plugins.vllm_fakequant_hf import is_weight_quantizer_state_key
26+
from modelopt.torch.export.plugins.vllm_fakequant_hf import (
27+
is_weight_quantizer_state_key,
28+
merge_amax_tensors_for_group,
29+
)
2730
from modelopt.torch.opt.conversion import (
2831
ModelLikeModule,
2932
ModeloptStateManager,
@@ -137,33 +140,6 @@ def _group_keys_for_vllm(
137140
return vllm_state_dict, merge_groups
138141

139142

140-
def merge_amax_tensors_for_vllm_group(tensors: list[torch.Tensor]) -> torch.Tensor:
141-
"""Combine `_amax` buffers from a merge group into a single tensor.
142-
143-
Used when HuggingFace module names are folded to vLLM names (e.g. q/k/v → qkv_proj).
144-
145-
- If every tensor has the same shape, take the element-wise maximum over the group
146-
(conservative when each branch carried the same axis layout).
147-
- If shapes differ (e.g. GQA q vs k), try ``torch.cat(..., dim=0)`` when valid for
148-
per-channel amax; otherwise fall back to a scalar max over all elements.
149-
"""
150-
if not tensors:
151-
raise ValueError("merge_amax_tensors_for_vllm_group: expected at least one tensor")
152-
if len(tensors) == 1:
153-
return tensors[0]
154-
155-
first = tensors[0]
156-
if all(t.shape == first.shape for t in tensors):
157-
stacked = torch.stack([t.float() for t in tensors], dim=0)
158-
return torch.amax(stacked, dim=0).to(dtype=first.dtype, device=first.device)
159-
160-
try:
161-
return torch.cat(tensors, dim=0).to(dtype=first.dtype, device=first.device)
162-
except RuntimeError:
163-
flat = torch.cat([t.reshape(-1).float() for t in tensors])
164-
return torch.max(flat).to(dtype=first.dtype, device=first.device)
165-
166-
167143
def _merge_values_by_max_or_concat(merged_key: str, key_value_pairs: list[tuple[str, Any]]) -> Any:
168144
"""
169145
Merge values by taking max for amax, concatenating for others.
@@ -179,7 +155,7 @@ def _merge_values_by_max_or_concat(merged_key: str, key_value_pairs: list[tuple[
179155
for dict_key in values[0]:
180156
tensors = [v[dict_key] for v in values]
181157
if "_amax" in dict_key:
182-
merged_value[dict_key] = merge_amax_tensors_for_vllm_group(tensors)
158+
merged_value[dict_key] = merge_amax_tensors_for_group(tensors)
183159
elif "_pre_quant_scale" in dict_key:
184160
# _pre_quant_scale is per-input-channel: identical across q/k/v projections
185161
# since they share the same input. Do not concatenate; take the first value.
@@ -190,7 +166,7 @@ def _merge_values_by_max_or_concat(merged_key: str, key_value_pairs: list[tuple[
190166
else:
191167
# Values are tensors directly
192168
if "_amax" in merged_key:
193-
merged_value = merge_amax_tensors_for_vllm_group(values)
169+
merged_value = merge_amax_tensors_for_group(values)
194170
else:
195171
merged_value = torch.cat(values, dim=0)
196172
return merged_value

modelopt/torch/export/plugins/vllm_fakequant_hf.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
__all__ = [
3737
"export_hf_vllm_fq_checkpoint",
3838
"is_weight_quantizer_state_key",
39+
"merge_amax_tensors_for_group",
3940
]
4041

4142
# Matches ``…weight_quantizer``, ``…weight_quantizer.0``, ``…w13_weight_quantizer.0``, etc.
@@ -90,6 +91,33 @@ def requant_weights_for_export(
9091
return quantizer_copy(w.float()).to(w.dtype)
9192

9293

94+
def merge_amax_tensors_for_group(tensors: list[torch.Tensor]) -> torch.Tensor:
    """Fold a merge group's `_amax` buffers into one tensor.

    Applied when HuggingFace module names collapse into a single vLLM module
    (e.g. q/k/v projections becoming ``qkv_proj``).

    Strategy, in order:
    - identical shapes: element-wise maximum across the group (conservative
      when every branch carries the same axis layout);
    - differing shapes (e.g. GQA q vs k): attempt ``torch.cat(..., dim=0)``,
      which is valid for per-channel amax;
    - otherwise: collapse everything to a single scalar maximum.

    Raises:
        ValueError: if ``tensors`` is empty.
    """
    if not tensors:
        raise ValueError("merge_amax_tensors_for_group: expected at least one tensor")
    if len(tensors) == 1:
        return tensors[0]

    reference = tensors[0]
    same_shape = all(t.shape == reference.shape for t in tensors)

    if same_shape:
        # Element-wise max over a stacked float copy; restore the first
        # tensor's dtype/device on the way out.
        merged = torch.stack([t.float() for t in tensors], dim=0).amax(dim=0)
        return merged.to(dtype=reference.dtype, device=reference.device)

    try:
        return torch.cat(tensors, dim=0).to(dtype=reference.dtype, device=reference.device)
    except RuntimeError:
        # Shapes are not concatenable along dim 0 — fall back to a global
        # scalar max over all elements of every tensor.
        flattened = torch.cat([t.reshape(-1).float() for t in tensors])
        return flattened.max().to(dtype=reference.dtype, device=reference.device)
119+
120+
93121
def _resmooth_experts_for_export(
94122
model: nn.Module,
95123
state_dict: dict[str, Any],
@@ -147,7 +175,7 @@ def _resmooth_experts_for_export(
147175
if iq0.is_enabled:
148176
amaxes = [e.input_quantizer.amax for e in experts]
149177
if all(a is not None for a in amaxes):
150-
max_in_amax = torch.stack(amaxes).max()
178+
max_in_amax = merge_amax_tensors_for_group(amaxes)
151179

152180
avg_out = avg_pqs.detach().clone()
153181
for ex in experts:

0 commit comments

Comments
 (0)