Skip to content

Commit 6f18490

Browse files
authored
Improve AWQ init speed (#748)
## What does this PR do? **Type of change:** Improvement <!-- Use one of the following: Bug fix, new feature, new example, new tests, documentation. --> **Overview:** Improve the speed of accessing weights through enable_weight_access_and_writeback in AWQ helper init. This change reduces the time complexity from O(num_modules^2) to O(num_modules) and the runtime from ~1 hour to ~30 seconds. ## Usage <!-- You can potentially add a usage example below. --> ```python # Add a code snippet demonstrating how to use this ``` ## Testing <!-- Mention how have you tested your change if applicable. --> python hf_ptq.py --pyt_ckpt_path /home/scratch.omniml_data_1/models/qwen/Qwen3-30B-A3B-Instruct-2507 --qformat int4_awq ## Before your PR is "*Ready for review*" <!-- If you haven't finished some of the above items you can still open `Draft` PR. --> - **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed. - **Is this change backward compatible?**: Yes/No <!--- If No, explain why. --> - **Did you write any new necessary tests?**: Yes/No - **Did you add or update any necessary documentation?**: Yes/No - **Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No <!--- Only for new features, API changes, critical bug fixes or bw breaking changes. --> ## Additional Information <!-- E.g. related issue. --> Signed-off-by: weimingc <17592131+meenchen@users.noreply.github.com>
1 parent 9c24e2c commit 6f18490

File tree

2 files changed

+40
-16
lines changed

2 files changed

+40
-16
lines changed

modelopt/torch/quantization/model_calib.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -532,9 +532,11 @@ def awq(
532532
awq_clip(model, forward_loop, **kwargs)
533533

534534
# Special handling for SequentialQuantizer
535+
# Pre-compute name_to_module dict to avoid O(n^2) complexity in enable_weight_access_and_writeback
536+
name_to_module = dict(model.named_modules())
535537
for name, module in model.named_modules():
536538
if is_quantized_linear(module) and isinstance(module.weight_quantizer, SequentialQuantizer):
537-
with enable_weight_access_and_writeback(module, model):
539+
with enable_weight_access_and_writeback(module, model, name_to_module):
538540
max_calibrate(module, lambda linear: linear.weight_quantizer(module.weight))
539541

540542

@@ -606,8 +608,9 @@ def get_weight_scale(weight, block_size=None):
606608
weight = F.pad(weight, (0, block_size - org_shape[-1] % block_size), "constant", 0)
607609
org_shape = weight.shape
608610
weight = weight.contiguous().view(-1, block_size)
609-
weight_abs_amax = weight.abs().amax(dim=1, keepdim=True)
610-
scale = weight.abs() / (weight_abs_amax + torch.finfo(weight.dtype).tiny)
611+
weight_abs = weight.abs() # Cache to avoid redundant computation
612+
weight_abs_amax = weight_abs.amax(dim=1, keepdim=True)
613+
scale = weight_abs / (weight_abs_amax + torch.finfo(weight.dtype).tiny)
611614
scale = scale.view(org_shape)
612615
if slice_after_padding is not None:
613616
scale = scale[..., slice_after_padding]
@@ -701,9 +704,11 @@ def forward(self, input, *args, **kwargs):
701704
# Now forward the actual output without any quantization
702705
return out_actual
703706

704-
for name, module in model.named_modules():
707+
# Pre-compute name_to_module dict ONCE to avoid O(n^2) complexity in enable_weight_access_and_writeback
708+
name_to_module = dict(model.named_modules())
709+
for name, module in name_to_module.items():
705710
if is_quantized_linear(module) and module.weight_quantizer.is_enabled:
706-
with enable_weight_access_and_writeback(module, model):
711+
with enable_weight_access_and_writeback(module, model, name_to_module):
707712
module.awq_lite = AWQLiteHelper(module, name)
708713
module.awq_lite.setup()
709714

@@ -793,7 +798,7 @@ def postprocess(module, name):
793798
f" {name}. Please provide a valid `forward_loop` function that can be used to"
794799
" forward data through the model many times."
795800
)
796-
with enable_weight_access_and_writeback(module, model):
801+
with enable_weight_access_and_writeback(module, model, name_to_module):
797802
postprocess(module, name)
798803

799804
module.awq_lite.cleanup()
@@ -973,14 +978,16 @@ def forward(name, self, input, *args, **kwargs):
973978
self.weight_quantizer.disable()
974979
return self._forward_no_awq(input, *args, **kwargs)
975980

981+
# Pre-compute name_to_module dict to avoid O(n^2) complexity in enable_weight_access_and_writeback
982+
name_to_module = dict(model.named_modules())
976983
for name, module in model.named_modules():
977984
if (
978985
is_quantized_linear(module)
979986
and module.weight_quantizer.is_enabled
980987
and module.weight_quantizer.block_sizes is not None
981988
):
982989
bind_forward_method(module, partial(forward, name), "_forward_no_awq")
983-
with enable_weight_access_and_writeback(module, model):
990+
with enable_weight_access_and_writeback(module, model, name_to_module):
984991
module.awq_clip = AWQClipHelper(module)
985992

986993
print_rank_0("awq_clip: Estimating parameters...")
@@ -1004,7 +1011,7 @@ def postprocess(module):
10041011
for name, module in model.named_modules():
10051012
if is_quantized_linear(module) and hasattr(module, "awq_clip"):
10061013
if module.awq_clip.num_tokens > 0:
1007-
with enable_weight_access_and_writeback(module, model):
1014+
with enable_weight_access_and_writeback(module, model, name_to_module):
10081015
postprocess(module)
10091016

10101017
if not debug:

modelopt/torch/quantization/utils.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -396,19 +396,30 @@ def _get_fsdp2_mesh(module: nn.Module):
396396
return fsdp_state._fsdp_param_group.post_forward_mesh_info.mesh
397397

398398

399-
def _get_module_name(module: nn.Module, root_model: nn.Module):
400-
name_to_module = dict(root_model.named_modules())
399+
def _get_module_name(module: nn.Module, root_model: nn.Module, name_to_module: dict | None = None):
400+
if name_to_module is None:
401+
name_to_module = dict(root_model.named_modules())
401402
target_module_name = next((name for name, m in name_to_module.items() if m is module), None)
402403
return target_module_name
403404

404405

405-
def _get_enclosing_fsdp_module(module: nn.Module, root_model: nn.Module):
406-
"""Get the enclosing FSDP module for a given module."""
406+
def _get_enclosing_fsdp_module(
407+
module: nn.Module, root_model: nn.Module, name_to_module: dict | None = None
408+
):
409+
"""Get the enclosing FSDP module for a given module.
410+
411+
Args:
412+
module: The module to find the enclosing FSDP for.
413+
root_model: The root model containing the module.
414+
name_to_module: Optional pre-computed dict mapping names to modules (for performance).
415+
"""
407416
if isinstance(module, FSDPModule):
408417
return module
409418

410-
name_to_module = dict(root_model.named_modules())
411-
target_module_name = _get_module_name(module, root_model)
419+
if name_to_module is None:
420+
name_to_module = dict(root_model.named_modules())
421+
422+
target_module_name = _get_module_name(module, root_model, name_to_module)
412423

413424
if target_module_name is None:
414425
raise ValueError(f"Module {module} not found in the root model {root_model}.")
@@ -469,13 +480,19 @@ def fsdp2_weight_access_and_writeback_context(module: nn.Module, root_model: nn.
469480

470481

471482
@contextmanager
472-
def enable_weight_access_and_writeback(module, root_model):
483+
def enable_weight_access_and_writeback(module, root_model, name_to_module: dict | None = None):
473484
"""Enable weight access and writeback for a module.
474485
475486
Useful for modules with weight not intact such as Linear layer in FSDP wrapped model or
476487
HF accelerate CPU off-loaded models.
488+
489+
Args:
490+
module: The module to access weights for.
491+
root_model: The root model containing the module.
492+
name_to_module: Optional pre-computed dict mapping names to modules (for performance).
493+
If not provided, will be computed on-the-fly.
477494
"""
478-
if _get_enclosing_fsdp_module(module, root_model) is not None:
495+
if _get_enclosing_fsdp_module(module, root_model, name_to_module) is not None:
479496
context = fsdp2_weight_access_and_writeback_context(module, root_model)
480497
elif is_quantized_parallel_linear(module) and hasattr(module, "_hf_tp_plan"):
481498
# HF transformers TP sharded linear layer

0 commit comments

Comments
 (0)