Commit 215f643
Preserve weight dtype for LAQ amax and per-tensor scales
- StaticBlockScaleQuantizer.enable_laq no longer forces float32 on
_amax_pre, _amax_post, and _per_tensor_scale buffers/parameters;
they now inherit the dtype of the passed tensors.
- laq() calibration casts amax and per_tensor_scale to the weight
dtype before calling enable_laq so the quantizer matches module
precision (bf16/fp16) instead of silently upcasting to fp32.
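The change described above can be sketched as follows. This is a minimal illustration under assumed, simplified signatures; the real `StaticBlockScaleQuantizer.enable_laq` and `laq()` have more parameters and logic than shown here.

```python
# Sketch of the dtype-preservation fix (assumed simplified signatures).
import torch
import torch.nn as nn


class StaticBlockScaleQuantizer(nn.Module):
    def enable_laq(self, amax_pre: torch.Tensor, amax_post: torch.Tensor,
                   per_tensor_scale: torch.Tensor) -> None:
        # Before the fix, the buffers were forced to float32, e.g.
        #   self.register_buffer("_amax_pre", amax_pre.float())
        # After the fix, they inherit the dtype of the passed tensors.
        self.register_buffer("_amax_pre", amax_pre)
        self.register_buffer("_amax_post", amax_post)
        self.register_buffer("_per_tensor_scale", per_tensor_scale)


def laq(quantizer: StaticBlockScaleQuantizer, weight: torch.Tensor,
        amax_pre: torch.Tensor, amax_post: torch.Tensor,
        per_tensor_scale: torch.Tensor) -> None:
    # Calibration casts the stats to the weight dtype before enable_laq,
    # so a bf16/fp16 module does not silently gain fp32 buffers.
    dtype = weight.dtype
    quantizer.enable_laq(amax_pre.to(dtype), amax_post.to(dtype),
                         per_tensor_scale.to(dtype))


weight = torch.randn(8, 8, dtype=torch.bfloat16)
q = StaticBlockScaleQuantizer()
laq(q, weight, torch.tensor(3.0), torch.tensor(3.0), torch.tensor(0.5))
assert q._amax_pre.dtype == weight.dtype  # bf16, matching module precision
```

With the old float32-forcing behavior, `q._amax_pre.dtype` would have been `torch.float32` regardless of the module's precision.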
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Signed-off-by: realAsma <akuriparambi@nvidia.com>

1 parent 8866b80
2 files changed: 9 additions & 5 deletions
(diff content not captured; file 1 adds 4 lines at 1821–1824)
File 2: 5 additions & 5 deletions
(diff content not captured; file 2 replaces lines 1458, 1460, 1464, 1466, and 1469)