NVIDIA
diff --git a/‎examples/llm_ptq/cast_mxfp4_to_nvfp4.py‎
Lines changed: 2 additions & 2 deletions b/‎examples/llm_ptq/cast_mxfp4_to_nvfp4.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎modelopt/torch/quantization/calib/mse.py‎
Lines changed: 32 additions & 27 deletions b/‎modelopt/torch/quantization/calib/mse.py‎
Lines changed: 32 additions & 27 deletions
diff --git a/‎modelopt/torch/quantization/config.py‎
Lines changed: 4 additions & 4 deletions b/‎modelopt/torch/quantization/config.py‎
Lines changed: 4 additions & 4 deletions
@@ -297,8 +297,8 @@ def force_weight_quantizers_static(quant_cfg: list) -> None:
     The MXFP4 -> NVFP4 cast needs the per-block weight ``_amax`` to be recorded
     by max-cal (so it can be paired with the pinned global_amax later). Setting
     ``block_sizes['type'] = 'static'`` makes ``is_static_block_quant`` True so
-    ``promote_nvfp4_static_quantizers`` picks the entry up automatically at the
-    end of max_calibrate.
+    static NVFP4 finalization picks the entry up automatically at the end of
+    max_calibrate.
     """
     for i, entry in enumerate(quant_cfg):
         qname = entry.get("quantizer_name", "")
 
@@ -178,9 +178,8 @@ class NVFP4MSECalibrator(MseCalibrator):
     Uses a fused Triton kernel as an internal fast path on the first ``collect`` call
     when (a) ``error_func is None``, (b) the input tensor is on CUDA in the standard
     blocked ``[n_blocks, block_size]`` layout, and (c) Triton + the kernel package are
-    importable. Falls back to the reference 126-step Python sweep otherwise (custom
-    ``error_func`` users, multi-``collect`` activation flows, CPU inputs, or when the
-    fast path is disabled via ``MODELOPT_NVFP4_TRITON_SWEEP=0``).
+    importable. Falls back to the reference 126-step Python sweep otherwise and caches
+    the final amax immediately, so this calibrator is one-shot between resets.
     """
 
     def __init__(
@@ -193,14 +192,16 @@ def __init__(
     ):
         """Initialize NVFP4 MSE calibrator with per-block and global amax."""
         super().__init__(amax=amax, axis=axis, quant_func=quant_func, error_func=error_func)
-        self._global_amax = global_amax
-        # Set by the Triton fast path on its (one-shot) collect; consumed by compute_amax.
-        self._best_amax_fast: torch.Tensor | None = None
+        self._global_amax = global_amax.to(dtype=torch.float32)
+        # Set by collect() after either sweep path; consumed by compute_amax.
+        self._best_amax: torch.Tensor | None = None
 
     def _compute_candidate_amax(self, candidates: torch.Tensor) -> torch.Tensor:
         if candidates.ndim != 0:  # Called during final compute amax
             candidates = candidates.view_as(self._initial_amax)
-        return torch.ones_like(self._initial_amax) * self._global_amax * candidates
+        return torch.ones_like(self._initial_amax, dtype=torch.float32) * (
+            self._global_amax * candidates
+        )
 
     def _generate_candidates(self, device: torch.device) -> torch.Tensor:
         """Generate the 126 valid FP8 E4M3 scale candidates."""
@@ -235,35 +236,39 @@ def _can_use_triton_fast_path(self, x: torch.Tensor) -> bool:
 
     @torch.no_grad()
     def collect(self, x: torch.Tensor):
-        """Collect input statistics. Uses the Triton fast path when eligible."""
-        if self._best_amax_fast is not None:
+        """Collect input statistics and cache the final per-block amax."""
+        if self._best_amax is not None:
             raise RuntimeError(
-                "NVFP4MSECalibrator: the Triton fast path produced a final amax on a "
-                "previous collect() call; multi-collect after the fast path is not "
-                "supported. Call reset() to start a fresh cycle, set "
-                "MODELOPT_NVFP4_TRITON_SWEEP=0, or pass a non-None error_func to force "
-                "the reference path for activation-style accumulation."
+                "NVFP4MSECalibrator: a previous collect() call produced a final amax; "
+                "multi-collect is not supported. Call reset() to start a fresh cycle."
             )
-        # Fast path is eligible only on the first call, before the reference accumulator
-        # has produced any state.
-        if self._losses_sum is None and self._can_use_triton_fast_path(x):
+        if self._can_use_triton_fast_path(x):
             from modelopt.torch.kernels.quantization.gemm import nvfp4_fp8_scale_sweep
 
             best_flat = nvfp4_fp8_scale_sweep(x.detach(), self._global_amax, block_size=x.shape[-1])
-            # Match the original shape/dtype of the initial amax so downstream
-            # load_calib_amax behaves identically to the reference path.
-            self._best_amax_fast = best_flat.reshape(self._initial_amax.shape).to(
-                self._initial_amax.dtype
-            )
+            # Store the selected amax in fp32; the fake-quant kernel still returns
+            # tensors in the requested output dtype.
+            self._best_amax = best_flat.reshape(self._initial_amax.shape).to(dtype=torch.float32)
             return
+
+        self._run_reference_collect(x)
+
+    def _run_reference_collect(self, x: torch.Tensor):
         super().collect(x)
+        best_amax = super().compute_amax(verbose=False)
+        self._best_amax = best_amax.to(dtype=torch.float32) if best_amax is not None else None
+        self._losses_sum = None
+        # Synchronize before calibrating another weight so reference MSE sweeps do
+        # not overlap. _losses_sum stores one fp32 reduced loss per candidate per
+        # block. With 16-element NVFP4 blocks and bf16 weights, this is roughly
+        # 128 / 16 * (4 / 2) = 16x the calibrated weight size.
+        if x.is_cuda:
+            torch.cuda.synchronize(x.device)
 
     @torch.no_grad()
     def compute_amax(self, verbose: bool = False):
-        """Return the per-block amax — from the fast path if it ran, else from the reference sweep."""
-        if self._best_amax_fast is not None:
-            return self._best_amax_fast
-        return super().compute_amax(verbose=verbose)
+        """Return the cached per-block amax."""
+        return self._best_amax
 
     def reset(self):
         """Reset per-cycle state. Keep ``_initial_amax`` so the calibrator stays reusable.
@@ -273,7 +278,7 @@ def reset(self):
         small enough to keep so a follow-up ``collect()`` can run again on the same
         calibrator instance.
         """
-        self._best_amax_fast = None
+        self._best_amax = None
         self._losses_sum = None
         self._candidates = None
         self._amax = None
@@ -722,9 +722,9 @@ class MseCalibConfig(QuantizeAlgorithmConfig):
     Finds a scale s (via amax a, with s = a / q_max) that minimizes the
     reconstruction error of a tensor after uniform Q→DQ:
 
-        s* = argmin_s  E[(X - DQ(Q(X; s)))^2],   X ∈ {weights | activations}
+        s* = argmin_s  E[(W - DQ(Q(W; s)))^2],   W ∈ weights
 
-    When fp8_scale_sweep is enabled, step_size is ignored.
+    When fp8_scale_sweep is enabled for a supported FP8-scale format, step_size is ignored.
     """
 
     method: Literal["mse"] = ModeloptField("mse")
@@ -755,8 +755,8 @@ class MseCalibConfig(QuantizeAlgorithmConfig):
         default=False,
         title="Enable FP8 scale sweep for NVFP4 per-block quantization.",
         description="If True, sweep all 128 FP8 E4M3 scale values instead of using multipliers. "
-        "Only applies to NVFP4 weight quantization. When enabled, num_steps, step_size, "
-        "start_multiplier, and stop_multiplier are ignored.",
+        "Applies to ModelOpt static NVFP4 weight quantizers and registered custom backends with "
+        "FP8 sweep support. Other weight quantizers use the multiplier search.",
     )
 
     distributed_sync: bool | None = ModeloptField(