Commit b353110

kinjalpatel27 authored and danielkorzekwa committed
Added support to rotate in fp32 (optional) (#885)
## What does this PR do?

**Type of change:** New Feature

**Overview:** This PR adds support for performing the RHT rotation in float32 when enabled by the quantization configuration. It also makes the `rotate` argument in the quantization configuration of type bool (for backward compatibility) or dict (with an added option for float32 rotation).

## Usage

```
python hf_ptq.py --pyt_ckpt_path meta-llama/Llama-3.2-3B-Instruct --qformat nvfp4 --export_fmt hf --dataset cnn_dailymail --export_path test --trust_remote_code --inference_pipeline_parallel 1 --batch_size 1 --calib_size 4 --kv_cache_qformat nvfp4_rotate
```

Updated `NVFP4_KV_ROTATE_CFG` locally with `"rotate": {"enable": True, "rotate_fp32": True}`:

```
...
model.layers.27.self_attn.k_bmm_quantizer TensorQuantizer((2, 1) bit fake block_sizes={-1: 16, 'type': 'dynamic', 'scale_bits': (4, 3)}, amax=8.3750 rotated (fp32) calibrator=MaxCalibrator quant)
...
```

Updated `NVFP4_KV_ROTATE_CFG` locally with `"rotate": {"enable": True, "rotate_fp32": False}`:

```
model.layers.27.self_attn.k_bmm_quantizer TensorQuantizer((2, 1) bit fake block_sizes={-1: 16, 'type': 'dynamic', 'scale_bits': (4, 3)}, amax=8.3750 rotated calibrator=MaxCalibrator quant)
```

## Testing

Updated the unit test in `tests/gpu/torch/quantization/test_hadamard.py`.

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes
- **Did you write any new necessary tests?**: No (updated existing test)
- **Did you add or update any necessary documentation?**: Yes
- **Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes

## Summary by CodeRabbit

* **New Features**
  * Added the capability to rotate inputs prior to quantization for RHT (Randomized Hadamard Transform).
  * Introduced granular rotation configuration options enabling FP32 casting for improved numerical stability during transforms.
* **Tests**
  * Expanded test coverage for rotation functionality with parameterized FP32 casting scenarios.

Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
Signed-off-by: Daniel Korzekwa <dkorzekwa@nvidia.com>
1 parent eace1ae commit b353110

File tree

5 files changed: +49 −12 lines

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions

```diff
@@ -9,6 +9,7 @@ NVIDIA Model Optimizer Changelog (Linux)
 - User does not need to manually register MOE modules to cover experts calibration coverage in PTQ workflow.
 - ``hf_ptq.py`` now saves the quantization summary and moe expert token count table to the export directory.
 - Add sparse attention optimization for transformer models (``modelopt.torch.sparsity.attention_sparsity``). This reduces computational cost by skipping attention computation. Supports calibration for threshold selection on HuggingFace models. See `examples/llm_sparsity/attention_sparsity/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_sparsity/attention_sparsity>`_ for usage.
+- Add support for rotating the input before quantization for RHT.

 0.42 (2026-02-xx)
 ^^^^^^^^^^^^^^^^^
```

modelopt/torch/quantization/config.py

Lines changed: 10 additions & 4 deletions

```diff
@@ -1033,14 +1033,20 @@ def validate_calibrator(cls, v, info: ValidationInfo):
         assert v in ["max", "histogram"]
         return v

-    rotate: bool = ModeloptField(
+    rotate: bool | dict[str, bool] = ModeloptField(
         default=False,
-        title="""If rotate the input before quantization.""",
-        description=""""If true, the input of the quantizer will be rotated with a hadamard matrix
+        title="""Configuration for rotating the input before quantization.""",
+        description="""Can be a boolean or a dictionary with the following keys:
+        - "enable": Boolean to enable/disable rotation (default: False)
+        - "rotate_fp32": Boolean to compute rotation in float32 precision (default: False)
+
+        If a boolean is provided, it is treated as the "enable" value with "rotate_fp32" defaulting to False.
+
+        When enabled, the input of the quantizer will be rotated with a hadamard matrix
         given by scipy.linalg.hadamard, i.e.
         ``input = input @ scipy.linalg.hadamard(input.shape[-1]) / sqrt(input.shape[-1])``.

-        This can be used for ratation based PTQ methods, e.g. QuaRot or SpinQuant.
+        This can be used for rotation based PTQ methods, e.g. QuaRot or SpinQuant.
         See https://arxiv.org/abs/2404.00456 for example.""",
     )
```
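The docstring formula can be checked with a small pure-Python sketch. This uses a Sylvester construction in place of `scipy.linalg.hadamard` (equivalent for power-of-two sizes); the helper names are illustrative, not from the codebase:

```python
import math

def hadamard(n):
    # Sylvester construction of the n x n Hadamard matrix; n must be a
    # power of two, matching scipy.linalg.hadamard for such n.
    H = [[1.0]]
    while len(H) < n:
        H = [row + row for row in H] + [row + [-v for v in row] for row in H]
    return H

def rotate(x):
    # input @ hadamard(n) / sqrt(n), the formula from the field docstring.
    n = len(x)
    H = hadamard(n)
    return [sum(x[i] * H[i][j] for i in range(n)) / math.sqrt(n) for j in range(n)]

x = [1.0, -2.0, 0.5, 3.0]
y = rotate(x)
# The normalized Hadamard matrix is orthogonal and symmetric, so rotating
# twice recovers the input and the L2 norm is preserved: the rotation
# reshapes outliers for quantization without changing what is represented.
roundtrip = rotate(y)
```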

modelopt/torch/quantization/nn/functional.py

Lines changed: 6 additions & 2 deletions

```diff
@@ -93,7 +93,7 @@ def backward(ctx, grad_outputs):
         return fast_hadamard_transform.hadamard_transform(grad_outputs)  # type: ignore[name-defined]


-def normalized_hadamard_transform(inputs):
+def normalized_hadamard_transform(inputs, rotate_fp32=False):
     """Normalized fast hadamard transform."""
     global fast_hadamard_transform
     try:
@@ -104,6 +104,10 @@ def normalized_hadamard_transform(inputs):
             "`pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git`"
         )

-    return FastHadamardTransform.apply(inputs) / torch.sqrt(
+    dtype = inputs.dtype
+    if rotate_fp32:
+        inputs = inputs.to(torch.float32)
+    outputs = FastHadamardTransform.apply(inputs) / torch.sqrt(
         torch.tensor(inputs.shape[-1], dtype=torch.float32)
     )
+    return outputs.to(dtype) if rotate_fp32 else outputs
```
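The patched function follows a cast-to-fp32, transform, cast-back pattern: the caller-visible dtype never changes, only the intermediate arithmetic is upcast. A hypothetical NumPy mirror of that pattern (the real code dispatches to the fast-hadamard-transform CUDA kernel; the explicit matrix here is only for illustration):

```python
import math
import numpy as np

def normalized_hadamard_transform_ref(inputs, rotate_fp32=False):
    # Mirror of the patched flow: remember the input dtype, optionally
    # upcast to float32, transform, then cast back.
    dtype = inputs.dtype
    if rotate_fp32:
        inputs = inputs.astype(np.float32)
    n = inputs.shape[-1]
    # Explicit Hadamard matrix (Sylvester construction) stands in for the kernel.
    H = np.array([[1.0]], dtype=inputs.dtype)
    while H.shape[0] < n:
        H = np.block([[H, H], [H, -H]])
    outputs = (inputs @ H) / math.sqrt(n)
    return outputs.astype(dtype) if rotate_fp32 else outputs

x = np.array([1.0, -2.0, 0.5, 3.0], dtype=np.float16)
y = normalized_hadamard_transform_ref(x, rotate_fp32=True)
print(y.dtype)  # float16 -- the fp32 math is invisible to the caller
```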

modelopt/torch/quantization/nn/modules/tensor_quantizer.py

Lines changed: 18 additions & 3 deletions

```diff
@@ -529,6 +529,20 @@ def is_static_block_quant(self):
             and self._fake_quant
         )

+    @property
+    def rotate_is_enabled(self):
+        """Check if rotate is enabled in quant config."""
+        return self._rotate.get("enable", False) if isinstance(self._rotate, dict) else self._rotate
+
+    @property
+    def rotate_is_fp32(self):
+        """Check if rotation needs to be computed in float32."""
+        return (
+            self._rotate.get("rotate_fp32", False)
+            if isinstance(self._rotate, dict) and self.rotate_is_enabled
+            else False
+        )
+
     def disable_calib(self):
         """Disable calibration."""
         self._if_calib = False
@@ -996,8 +1010,8 @@ def forward(self, inputs):
             inputs = inputs * self.pre_quant_scale

         # Rotating the input
-        if self._rotate:
-            inputs = normalized_hadamard_transform(inputs)
+        if self.rotate_is_enabled:
+            inputs = normalized_hadamard_transform(inputs, rotate_fp32=self.rotate_is_fp32)

         if self._disabled:
             # if quantizer is disabled, we still need to track the input dtype for saving the model
@@ -1109,7 +1123,8 @@ def extra_repr(self):
             if self.pre_quant_scale is not None
             else ""
         )
-        s += " rotated" if self._rotate else ""
+        s += " rotated" if self.rotate_is_enabled else ""
+        s += " (fp32)" if self.rotate_is_fp32 else ""
         s += (
             f" calibrator={self._calibrator.__class__.__name__}"
             if (self._calibrator is not None)
```
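The two new properties resolve the bool-or-dict `rotate` value. Their logic can be sketched standalone (written as free functions here; in the diff they are properties reading `self._rotate` on `TensorQuantizer`):

```python
def rotate_is_enabled(rotate):
    # A dict carries an explicit "enable" key; a bare bool is the enable flag itself.
    return rotate.get("enable", False) if isinstance(rotate, dict) else rotate

def rotate_is_fp32(rotate):
    # fp32 rotation applies only when rotation is enabled and was requested
    # via the dict form; a bare bool never turns it on.
    return (
        rotate.get("rotate_fp32", False)
        if isinstance(rotate, dict) and rotate_is_enabled(rotate)
        else False
    )

for cfg in (True, False, {"enable": True, "rotate_fp32": True}, {"enable": False, "rotate_fp32": True}):
    print(cfg, rotate_is_enabled(cfg), rotate_is_fp32(cfg))
```

Note the guard on `rotate_is_enabled` inside `rotate_is_fp32`: `{"enable": False, "rotate_fp32": True}` resolves to no rotation at all, so `extra_repr` never prints `(fp32)` for a disabled rotation.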

tests/gpu/torch/quantization/test_hadamard.py

Lines changed: 14 additions & 3 deletions

```diff
@@ -41,21 +41,32 @@ def test_hadamard_transform(dim):
     xxt_h = x_h @ x_h.T
     # The numerical error can be large, especially for 16-bit floats.
     assert torch.allclose(xxt_h, xxt, atol=0.05)
+    x_h_fp32 = normalized_hadamard_transform(x, rotate_fp32=True)
+    xxt_h_fp32 = x_h_fp32 @ x_h_fp32.T
+    assert torch.allclose(xxt_h_fp32, xxt, atol=0.05)


-def test_kv_rotate():
+@pytest.mark.parametrize(
+    "rotate_fp32",
+    [True, False],
+)
+def test_kv_rotate(rotate_fp32):
     mtq.plugins.register_attention_for_kv_quant(SDPAAttention)
     model = nn.Sequential(SDPAAttention())
     mtq.replace_quant_module(model)

     set_quantizer_by_cfg(model, {"*": {"enable": False}})
     dummy_input = SDPAAttention.get_input(device="cuda")
     output_ref = model(dummy_input)
+    if rotate_fp32:
+        rotate = {"enable": True, "rotate_fp32": True}
+    else:
+        rotate = True
     with set_quantizer_by_cfg_context(
         model,
         {
             "*[qk]_bmm_quantizer": {
-                "rotate": True,
+                "rotate": rotate,
             },
         },
     ):
@@ -67,7 +78,7 @@ def test_kv_rotate():
         model,
         {
             "*k_bmm_quantizer": {
-                "rotate": True,
+                "rotate": rotate,
             },
         },
     ):
```

0 commit comments