Skip to content

Commit 4a2a15e

Browse files
daniserebmeenchen
authored and committed
Add weights_scaling_factor to quantize in MXFP8QTensor
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
1 parent cb3a0de commit 4a2a15e

File tree

2 files changed

+69
-15
lines changed

2 files changed

+69
-15
lines changed

modelopt/torch/quantization/qtensor/mxfp8_tensor.py

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -144,23 +144,24 @@ def get_weights_scaling_factor_from_quantizer(
144144
def quantize_with_scale(
145145
cls,
146146
weight: torch.Tensor,
147-
e8m0_scale: torch.Tensor,
147+
weights_scaling_factor: torch.Tensor,
148148
) -> torch.Tensor:
149149
"""Quantize weight tensor using a pre-computed E8M0 scale.
150150
151151
This method is useful for export paths where the scale has already been computed.
152152
153153
Args:
154154
weight: The weight tensor to quantize. Must be at least 1D.
155-
e8m0_scale: E8M0 scale as uint8 biased exponent (bias = 127).
155+
weights_scaling_factor: E8M0 scale as uint8 biased exponent (bias = 127).
156156
Shape should be [..., out_dim, in_dim // 32] for 2D+ tensors,
157157
or [in_dim // 32] for 1D tensors.
158158
159159
Returns:
160160
torch.Tensor: Quantized weight as float8_e4m3fn with same shape as input.
161161
"""
162-
assert e8m0_scale.dtype == cls.SCALE_DTYPE, (
163-
f"e8m0_scale must be {cls.SCALE_DTYPE} (E8M0 format), got {e8m0_scale.dtype}"
162+
assert weights_scaling_factor.dtype == cls.SCALE_DTYPE, (
163+
f"weights_scaling_factor must be {cls.SCALE_DTYPE} (E8M0 format), "
164+
f"got {weights_scaling_factor.dtype}"
164165
)
165166

166167
in_dim = weight.shape[-1]
@@ -171,13 +172,13 @@ def quantize_with_scale(
171172
)
172173

173174
# Convert E8M0 biased exponent to scale factor: scale = 2^(127 - exponent)
174-
scale_factor = torch.exp2(127 - e8m0_scale.float())
175+
scale_factor = torch.exp2(127 - weights_scaling_factor.float())
175176

176177
# NOTE: vLLM/flashinfer may require this behavior:
177178
# scale_factor = torch.where(
178-
# e8m0_scale == 0,
179+
# weights_scaling_factor == 0,
179180
# 1.0,
180-
# torch.exp2(127 - e8m0_scale.float())
181+
# torch.exp2(127 - weights_scaling_factor.float())
181182
# )
182183

183184
weight_reshaped = weight.view(*weight.shape[:-1], num_blocks, cls.BLOCK_SIZE)
@@ -189,30 +190,39 @@ def quantize_with_scale(
189190
return quantized_weight.view(weight.shape)
190191

191192
@classmethod
192-
def quantize(cls, input: torch.Tensor) -> tuple:
193+
def quantize(
194+
cls,
195+
input: torch.Tensor,
196+
weights_scaling_factor: torch.Tensor | None = None,
197+
) -> tuple:
193198
"""Convert a tensor to MXFP8 quantized format.
194199
195200
Args:
196201
input (torch.Tensor): The input tensor to be quantized.
202+
weights_scaling_factor (torch.Tensor | None): Optional pre-computed E8M0 scale
203+
as uint8 biased exponent. If None, the scale will be computed from the input.
204+
Shape should be [..., in_dim // 32] matching input dimensions.
197205
198206
Returns:
199-
tuple: (MXFP8QTensor, e8m0_scale) where e8m0_scale is uint8 biased exponent.
207+
tuple: (MXFP8QTensor, weights_scaling_factor) where weights_scaling_factor is
208+
E8M0 scale as uint8 biased exponent.
200209
"""
201210
original_shape = input.shape
202211
original_dtype = input.dtype
203212

204213
input = reduce_block_padding(input, block_sizes={-1: cls.BLOCK_SIZE})
205-
input_amax = reduce_block_amax(input, block_sizes={-1: cls.BLOCK_SIZE})
206214

207-
e8m0_exponent = cls._compute_e8m0_exponent(input_amax)
208-
e8m0_scale = (e8m0_exponent + 127).to(cls.SCALE_DTYPE)
215+
if weights_scaling_factor is None:
216+
input_amax = reduce_block_amax(input, block_sizes={-1: cls.BLOCK_SIZE})
217+
e8m0_exponent = cls._compute_e8m0_exponent(input_amax)
218+
weights_scaling_factor = (e8m0_exponent + 127).to(cls.SCALE_DTYPE)
209219

210-
quantized_data = cls.quantize_with_scale(input, e8m0_scale)
220+
quantized_data = cls.quantize_with_scale(input, weights_scaling_factor)
211221

212222
# Crop back to original shape
213223
quantized_data = quantized_data[..., : original_shape[-1]]
214224

215-
return cls(original_shape, original_dtype, quantized_data), e8m0_scale
225+
return cls(original_shape, original_dtype, quantized_data), weights_scaling_factor
216226

217227
def dequantize(self, dtype: torch.dtype = None, **kwargs) -> torch.Tensor:
218228
"""Dequantize MXFP8 tensor back to the target dtype.

tests/gpu/torch/quantization/test_qtensor_cuda.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -797,6 +797,50 @@ def test_mxfp8_get_weights_scaling_factor(self, device, input_shape):
797797
# Note: 255 (0xFF) represents NaN in E8M0 and should never appear from valid weights
798798
assert torch.all(e8m0_scale <= 254), "E8M0 scale contains NaN value (255)"
799799

800+
@pytest.mark.parametrize("device", ["cuda", "cpu"])
801+
@pytest.mark.parametrize("input_dtype", [torch.float32, torch.float16, torch.bfloat16])
802+
@pytest.mark.parametrize(
803+
"input_shape",
804+
[
805+
(64, 64),
806+
(128, 128),
807+
(4, 64, 128), # 3D MoE shape
808+
# Note: All shapes must have last dim divisible by 32 since
809+
# get_weights_scaling_factor() requires this (unlike quantize() which pads)
810+
],
811+
)
812+
def test_mxfp8_quantize_with_precomputed_scale(self, device, input_dtype, input_shape):
813+
"""Test MXFP8 quantize() with pre-computed weights_scaling_factor."""
814+
test_tensor = torch.randn(input_shape, dtype=input_dtype, device=device)
815+
816+
# Quantize without pre-computed scale (baseline)
817+
qtensor_auto, scale_auto = MXFP8QTensor.quantize(test_tensor)
818+
819+
# Pre-compute scale and pass to quantize
820+
precomputed_scale = MXFP8QTensor.get_weights_scaling_factor(test_tensor)
821+
qtensor_precomputed, scale_precomputed = MXFP8QTensor.quantize(
822+
test_tensor, weights_scaling_factor=precomputed_scale
823+
)
824+
825+
# Verify scales match
826+
assert torch.equal(scale_auto, scale_precomputed), (
827+
"Pre-computed scale should match auto-computed scale"
828+
)
829+
830+
# Verify quantized data matches
831+
assert torch.equal(qtensor_auto._quantized_data, qtensor_precomputed._quantized_data), (
832+
"Quantized data should match when using pre-computed scale"
833+
)
834+
835+
# Verify dequantized results match
836+
dequant_auto = qtensor_auto.dequantize(dtype=input_dtype, scale=scale_auto)
837+
dequant_precomputed = qtensor_precomputed.dequantize(
838+
dtype=input_dtype, scale=scale_precomputed
839+
)
840+
assert torch.equal(dequant_auto, dequant_precomputed), (
841+
"Dequantized results should match"
842+
)
843+
800844
@pytest.mark.parametrize(
801845
("amax_value", "expected_exponent"),
802846
[
@@ -834,7 +878,7 @@ def test_mxfp8_quantize_with_scale_asserts(self, device):
834878
# Test wrong scale dtype assertion
835879
weight = torch.randn(64, 64, dtype=torch.float32, device=device)
836880
wrong_dtype_scale = torch.randn(64, 2, dtype=torch.float32, device=device)
837-
with pytest.raises(AssertionError, match="e8m0_scale must be"):
881+
with pytest.raises(AssertionError, match="weights_scaling_factor must be"):
838882
MXFP8QTensor.quantize_with_scale(weight, wrong_dtype_scale)
839883

840884
# Test non-divisible dimension assertion

0 commit comments

Comments
 (0)