Move AutoQuant review helpers

meenchen · meenchen · commit 61e506a2ef75 · 2026-06-05T15:59:35.000-07:00
Signed-off-by: weimingc <17592131+meenchen@users.noreply.github.com>
diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
@@ -42,6 +42,9 @@
     ProcessorMixin,
 )
 
+from modelopt.torch.export.model_utils import is_multimodal_model
+from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg
+
 try:
     from huggingface_hub import snapshot_download
 except ImportError:
@@ -51,6 +54,58 @@
 
 SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"]
 
+# TODO: Refactor into the config system.
+_QWEN36_AUTOQ_DISABLED_LAYERS = (  # appended to the disabled list for Qwen-family models
+    "*shared_expert_gate*",
+    "*linear_attn.in_proj_a*",
+    "*linear_attn.in_proj_b*",
+)
+_VLM_AUTOQ_DISABLED_LAYERS = ("*visual*", "*mtp*", "*vision_tower*")  # multimodal models only
+
+
+def _is_qwen_model(model) -> bool:
+    """Return True if a model/config class name, model_type, or architecture contains "qwen"."""
+    candidates = [type(model).__name__]  # strings to scan, starting with the model class name
+    config = getattr(model, "config", None)
+    configs = [  # top-level config plus its optional text/language sub-configs
+        config,
+        getattr(config, "text_config", None),
+        getattr(config, "language_config", None),
+    ]
+    for cfg in configs:
+        if cfg is None:  # no config on the model, or this sub-config is absent
+            continue
+        candidates.append(type(cfg).__name__)
+        model_type = getattr(cfg, "model_type", None)
+        if model_type is not None:
+            candidates.append(str(model_type))
+        architectures = getattr(cfg, "architectures", ()) or ()  # missing or falsy -> ()
+        if isinstance(architectures, str):  # wrap a lone string so it is not split into characters
+            architectures = (architectures,)
+        candidates.extend(str(architecture) for architecture in architectures)
+    return any("qwen" in candidate.lower() for candidate in candidates)  # case-insensitive
+
+
+def _get_auto_quantize_disabled_layers(model) -> list[str]:
+    """Return layer patterns that should be excluded from AutoQuantize search."""
+    disabled_layers = [  # defaults, skipping parent_class-scoped entries and "*lm_head*"
+        entry["quantizer_name"]
+        for entry in _default_disabled_quantizer_cfg
+        if "parent_class" not in entry and entry["quantizer_name"] != "*lm_head*"
+    ]
+    if _is_qwen_model(model):  # Qwen-only extras; the membership test skips duplicates
+        disabled_layers.extend(p for p in _QWEN36_AUTOQ_DISABLED_LAYERS if p not in disabled_layers)
+    if is_multimodal_model(model):  # extra patterns for multimodal models, also de-duplicated
+        disabled_layers.extend(p for p in _VLM_AUTOQ_DISABLED_LAYERS if p not in disabled_layers)
+    return disabled_layers
+
+
+def _get_auto_quantize_cost_excluded_patterns(model) -> list[str]:
+    """Return layer patterns excluded only from AutoQuantize cost accounting."""
+    if is_multimodal_model(model):
+        return list(_VLM_AUTOQ_DISABLED_LAYERS)  # copy the tuple into a new list[str]
+    return []  # non-multimodal models exclude nothing
+
 
 def run_nemotron_vl_preview(
     full_model,
@@ -133,7 +188,6 @@ def is_nemotron_vl(model_or_config):
     # Try to get config from model, or use directly if it's a config
     if hasattr(model_or_config, "config"):
         config = model_or_config.config
-        from modelopt.torch.export.model_utils import is_multimodal_model
 
         if not is_multimodal_model(model_or_config):
             return False
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
@@ -27,6 +27,8 @@
 from cast_mxfp4_to_nvfp4 import apply_to_model as apply_cast_mxfp4_to_nvfp4
 from cast_mxfp4_to_nvfp4 import force_weight_quantizers_static
 from example_utils import (
+    _get_auto_quantize_cost_excluded_patterns,
+    _get_auto_quantize_disabled_layers,
     build_quant_cfg,
     copy_custom_model_files,
     create_vlm_calibration_loop,
@@ -73,7 +75,7 @@
 )
 from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
 from modelopt.torch.quantization._auto_quantize_cost import EXCLUDED_MODULE_NAME_PATTERNS_KEY
-from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg, need_calibration
+from modelopt.torch.quantization.config import need_calibration
 from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
 from modelopt.torch.quantization.utils import is_quantized
 from modelopt.torch.speculative.eagle.utils import (
@@ -159,59 +161,6 @@ def _canonical_qformat(name: str) -> str:
 mto.enable_huggingface_checkpointing()
 
 
-# TODO: Refactor into the config system.
-_QWEN36_AUTOQ_DISABLED_LAYERS = (
-    "*shared_expert_gate*",
-    "*linear_attn.in_proj_a*",
-    "*linear_attn.in_proj_b*",
-)
-_VLM_AUTOQ_DISABLED_LAYERS = ("*visual*", "*mtp*", "*vision_tower*")
-
-
-def _is_qwen_model(model) -> bool:
-    """Return True when model/config identifiers indicate a Qwen-family model."""
-    candidates = [type(model).__name__]
-    config = getattr(model, "config", None)
-    configs = [
-        config,
-        getattr(config, "text_config", None),
-        getattr(config, "language_config", None),
-    ]
-    for cfg in configs:
-        if cfg is None:
-            continue
-        candidates.append(type(cfg).__name__)
-        model_type = getattr(cfg, "model_type", None)
-        if model_type is not None:
-            candidates.append(str(model_type))
-        architectures = getattr(cfg, "architectures", ()) or ()
-        if isinstance(architectures, str):
-            architectures = (architectures,)
-        candidates.extend(str(architecture) for architecture in architectures)
-    return any("qwen" in candidate.lower() for candidate in candidates)
-
-
-def _get_auto_quantize_disabled_layers(model) -> list[str]:
-    """Return layer patterns that should be excluded from AutoQuantize search."""
-    disabled_layers = [
-        entry["quantizer_name"]
-        for entry in _default_disabled_quantizer_cfg
-        if "parent_class" not in entry and entry["quantizer_name"] != "*lm_head*"
-    ]
-    if _is_qwen_model(model):
-        disabled_layers.extend(p for p in _QWEN36_AUTOQ_DISABLED_LAYERS if p not in disabled_layers)
-    if is_multimodal_model(model):
-        disabled_layers.extend(p for p in _VLM_AUTOQ_DISABLED_LAYERS if p not in disabled_layers)
-    return disabled_layers
-
-
-def _get_auto_quantize_cost_excluded_patterns(model) -> list[str]:
-    """Return layer patterns excluded only from AutoQuantize cost accounting."""
-    if is_multimodal_model(model):
-        return list(_VLM_AUTOQ_DISABLED_LAYERS)
-    return []
-
-
 def extract_and_prepare_language_model_from_vl(full_model):
     """Extract language model from VL model and disable quantization for non-language components.
 
diff --git a/modelopt/torch/quantization/algorithms.py b/modelopt/torch/quantization/algorithms.py
@@ -55,22 +55,15 @@
 from .utils import is_quantized_linear
 
 
-def _is_fused_experts_module(module: nn.Module) -> bool:
-    """Return True if ``module`` is a quantized fused-MoE-experts container.
-
-    These modules expose plural ``*_input_quantizer`` and ``*_weight_quantizers``
-    (an ``nn.ModuleList`` of per-expert quantizers) instead of the singular
-    ``input_quantizer`` / ``weight_quantizer`` attrs found on standard
-    ``nn.Linear``-derived QuantModules. AutoQuantize hparam discovery and cost
-    accounting need to recognize this layout to enumerate fused experts as
-    search dimensions.
-    """
-    # Late import to avoid a circular import at module load time.
+def _is_hf_quant_fused_experts_module(module: nn.Module) -> bool:
+    """Return True for a converted HF fused-MoE-experts wrapper; False if the plugin import fails."""
+    # Late import avoids a circular import: the HF plugin registers AutoQuantize
+    # support from this module at import time.
     try:
-        from .plugins.huggingface import _QuantFusedExperts
+        from .plugins.huggingface import _is_quant_fused_experts_module
     except ImportError:
         return False
-    return isinstance(module, _QuantFusedExperts)
+    return _is_quant_fused_experts_module(module)  # the plugin owns the isinstance check
 
 
 # Quantizer attribute names that participate in AutoQuantize snapshot/restore.
@@ -90,7 +83,7 @@ def _get_quantizer_attrs(module: nn.Module) -> tuple[str, ...]:
     shared input quantizers + two ``ModuleList`` of per-expert weight quantizers).
     For standard Linear-derived QuantModules, returns the canonical trio.
     """
-    if _is_fused_experts_module(module):
+    if _is_hf_quant_fused_experts_module(module):
         return _FUSED_EXPERTS_QUANTIZER_ATTRS
     return _STD_QUANTIZER_ATTRS
 
@@ -517,7 +510,7 @@ def _is_auto_quantize_module(module):
         # weight quantizers in an ``nn.ModuleList`` plus shared input quantizers.
         # All N experts in a layer share one search dimension (one recipe per
         # fused module).
-        return _is_fused_experts_module(module) and isinstance(module, QuantModule)
+        return _is_hf_quant_fused_experts_module(module) and isinstance(module, QuantModule)
 
     @staticmethod
     def _get_search_recipes(quantization_formats):
diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
@@ -946,6 +946,11 @@ def fold_weight(self, keep_attrs: bool = False):
                             delattr(q, attr_name)
 
 
+def _is_quant_fused_experts_module(module):
+    """Return True if ``module`` is an instance of ``_QuantFusedExperts``."""
+    return isinstance(module, _QuantFusedExperts)
+
+
 class _QuantDbrxFFN(_QuantSparseSequentialMoe):
     @property
     def num_experts(self):
diff --git a/tests/examples/llm_ptq/test_hf_ptq_args.py b/tests/examples/llm_ptq/test_hf_ptq_args.py
@@ -28,6 +28,11 @@ def _import_hf_ptq(monkeypatch):
     return importlib.import_module("hf_ptq")
 
 
+def _import_example_utils(monkeypatch):
+    monkeypatch.syspath_prepend(str(_EXAMPLES_DIR))  # put the examples dir at the front of sys.path
+    return importlib.import_module("example_utils")  # import by bare module name
+
+
 def _parse_hf_ptq_args(monkeypatch, *args):
     hf_ptq = _import_hf_ptq(monkeypatch)
     monkeypatch.setattr(sys, "argv", ["hf_ptq.py", *args])
@@ -87,7 +92,7 @@ def test_load_model_keeps_nemotron_vl_text_calibration_for_autoquant(monkeypatch
 
 
 def test_qwen_autoquant_disabled_layers_are_scoped_to_qwen_models(monkeypatch):
-    hf_ptq = _import_hf_ptq(monkeypatch)
+    example_utils = _import_example_utils(monkeypatch)
     qwen_model = SimpleNamespace(config=SimpleNamespace(model_type="qwen3_moe"))
     llama_model = SimpleNamespace(config=SimpleNamespace(model_type="llama"))
     qwen_only_patterns = {
@@ -96,10 +101,10 @@ def test_qwen_autoquant_disabled_layers_are_scoped_to_qwen_models(monkeypatch):
         "*linear_attn.in_proj_b*",
     }
 
-    monkeypatch.setattr(hf_ptq, "is_multimodal_model", lambda model: False)
+    monkeypatch.setattr(example_utils, "is_multimodal_model", lambda model: False)
 
-    qwen_disabled_layers = set(hf_ptq._get_auto_quantize_disabled_layers(qwen_model))
-    llama_disabled_layers = set(hf_ptq._get_auto_quantize_disabled_layers(llama_model))
+    qwen_disabled_layers = set(example_utils._get_auto_quantize_disabled_layers(qwen_model))
+    llama_disabled_layers = set(example_utils._get_auto_quantize_disabled_layers(llama_model))
 
     assert qwen_only_patterns <= qwen_disabled_layers
     assert qwen_only_patterns.isdisjoint(llama_disabled_layers)