Apply per-phase calibrated skip threshold at HF inference

kaix-nv · rohansjoshi · commit 37b77a065612 · 2026-06-08T23:51:26.000Z
Signed-off-by: Kai Xu <kaix@nvidia.com>
diff --git a/modelopt/torch/kernels/common/attention/hf_triton_attention.py b/modelopt/torch/kernels/common/attention/hf_triton_attention.py
@@ -145,10 +145,13 @@ def triton_attention_forward(
         kw["dense_sink_tokens"] = method.dense_sink_tokens
         kw["dense_recent_tokens"] = method.dense_recent_tokens
 
-    # Skip-softmax: applies to both prefill and decode
+    # Skip-softmax: applies to both prefill and decode. Prefer the method's
+    # per-phase calibrated dynamic threshold (scale_factor / seq_k); fall back
+    # to the static threshold when uncalibrated.
     if method is not None and getattr(module, "_apply_skip_softmax", False):
-        if method.skip_softmax_threshold:
-            kw["skip_softmax_threshold"] = method.skip_softmax_threshold
+        threshold = method.get_inference_threshold(seq_len, seq_k)
+        if threshold:
+            kw["skip_softmax_threshold"] = threshold
 
     o = attention(q, k, v, **kw)
 
diff --git a/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py b/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py
@@ -133,20 +133,20 @@ def _triton_calibration_context(self, module):
                 module._apply_skip_softmax = False
                 self._clear_triton_backends()
 
-    def _get_scale_factor(self) -> float | None:
-        """Compute scale_factor from calibration params, or None if uncalibrated.
+    def _get_scale_factor(self, phase: str = "prefill") -> float | None:
+        """Compute the scale_factor for ``phase`` from calibration params, or None.
 
-        The scale_factor is sequence-length-independent. Backends divide by the
+        The scale_factor is sequence-length-independent. Callers divide by the
         actual ``seq_k`` at call time: ``threshold = scale_factor / seq_k``.
         """
         if self.calibration_params and self.target_sparse_ratio:
             import math
             import warnings
 
-            params = self.calibration_params.get("prefill", {})
+            params = self.calibration_params.get(phase, {})
             a = params.get("a", 0)
             b = params.get("b", 0)
-            target = self.target_sparse_ratio.get("prefill", 0.5)
+            target = self.target_sparse_ratio.get(phase, 0.5)
             if a > 0 and b > 0:
                 # Warn if target is outside the calibrated range
                 min_s = params.get("min_observed_sparsity")
@@ -167,6 +167,22 @@ def _get_scale_factor(self) -> float | None:
                 return a * math.exp(b * target)
         return None
 
+    def get_inference_threshold(self, seq_q: int, seq_k: int) -> float | None:
+        """Return the skip threshold to apply for this call's phase.
+
+        Picks the phase from the query length (``decode`` when ``seq_q <= 1``,
+        else ``prefill``) and returns the calibrated dynamic threshold
+        ``scale_factor(phase) / seq_k`` when the phase is calibrated, otherwise
+        the static ``skip_softmax_threshold`` (or ``None`` to disable). This is
+        what the HF backend applies; it keeps prefill and decode on their own
+        calibrated ``(a, b)`` instead of forcing decode onto prefill's.
+        """
+        phase = "decode" if seq_q <= 1 else "prefill"
+        scale_factor = self._get_scale_factor(phase)
+        if scale_factor is not None and seq_k > 0:
+            return scale_factor / seq_k
+        return self.skip_softmax_threshold or None
+
     @staticmethod
     @contextmanager
     def _get_diffusers_backend_context():