|
27 | 27 | from cast_mxfp4_to_nvfp4 import apply_to_model as apply_cast_mxfp4_to_nvfp4 |
28 | 28 | from cast_mxfp4_to_nvfp4 import force_weight_quantizers_static |
29 | 29 | from example_utils import ( |
| 30 | + _get_auto_quantize_cost_excluded_patterns, |
| 31 | + _get_auto_quantize_disabled_layers, |
30 | 32 | build_quant_cfg, |
31 | 33 | copy_custom_model_files, |
32 | 34 | create_vlm_calibration_loop, |
|
73 | 75 | ) |
74 | 76 | from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model |
75 | 77 | from modelopt.torch.quantization._auto_quantize_cost import EXCLUDED_MODULE_NAME_PATTERNS_KEY |
76 | | -from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg, need_calibration |
| 78 | +from modelopt.torch.quantization.config import need_calibration |
77 | 79 | from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights |
78 | 80 | from modelopt.torch.quantization.utils import is_quantized |
79 | 81 | from modelopt.torch.speculative.eagle.utils import ( |
@@ -159,59 +161,6 @@ def _canonical_qformat(name: str) -> str: |
159 | 161 | mto.enable_huggingface_checkpointing() |
160 | 162 |
|
161 | 163 |
|
162 | | -# TODO: Refactor into the config system. |
163 | | -_QWEN36_AUTOQ_DISABLED_LAYERS = ( |
164 | | - "*shared_expert_gate*", |
165 | | - "*linear_attn.in_proj_a*", |
166 | | - "*linear_attn.in_proj_b*", |
167 | | -) |
168 | | -_VLM_AUTOQ_DISABLED_LAYERS = ("*visual*", "*mtp*", "*vision_tower*") |
169 | | - |
170 | | - |
171 | | -def _is_qwen_model(model) -> bool: |
172 | | - """Return True when model/config identifiers indicate a Qwen-family model.""" |
173 | | - candidates = [type(model).__name__] |
174 | | - config = getattr(model, "config", None) |
175 | | - configs = [ |
176 | | - config, |
177 | | - getattr(config, "text_config", None), |
178 | | - getattr(config, "language_config", None), |
179 | | - ] |
180 | | - for cfg in configs: |
181 | | - if cfg is None: |
182 | | - continue |
183 | | - candidates.append(type(cfg).__name__) |
184 | | - model_type = getattr(cfg, "model_type", None) |
185 | | - if model_type is not None: |
186 | | - candidates.append(str(model_type)) |
187 | | - architectures = getattr(cfg, "architectures", ()) or () |
188 | | - if isinstance(architectures, str): |
189 | | - architectures = (architectures,) |
190 | | - candidates.extend(str(architecture) for architecture in architectures) |
191 | | - return any("qwen" in candidate.lower() for candidate in candidates) |
192 | | - |
193 | | - |
194 | | -def _get_auto_quantize_disabled_layers(model) -> list[str]: |
195 | | - """Return layer patterns that should be excluded from AutoQuantize search.""" |
196 | | - disabled_layers = [ |
197 | | - entry["quantizer_name"] |
198 | | - for entry in _default_disabled_quantizer_cfg |
199 | | - if "parent_class" not in entry and entry["quantizer_name"] != "*lm_head*" |
200 | | - ] |
201 | | - if _is_qwen_model(model): |
202 | | - disabled_layers.extend(p for p in _QWEN36_AUTOQ_DISABLED_LAYERS if p not in disabled_layers) |
203 | | - if is_multimodal_model(model): |
204 | | - disabled_layers.extend(p for p in _VLM_AUTOQ_DISABLED_LAYERS if p not in disabled_layers) |
205 | | - return disabled_layers |
206 | | - |
207 | | - |
208 | | -def _get_auto_quantize_cost_excluded_patterns(model) -> list[str]: |
209 | | - """Return layer patterns excluded only from AutoQuantize cost accounting.""" |
210 | | - if is_multimodal_model(model): |
211 | | - return list(_VLM_AUTOQ_DISABLED_LAYERS) |
212 | | - return [] |
213 | | - |
214 | | - |
215 | 164 | def extract_and_prepare_language_model_from_vl(full_model): |
216 | 165 | """Extract language model from VL model and disable quantization for non-language components. |
217 | 166 |
|
|
0 commit comments