
Commit 708f113

Revert skip-softmax threshold formula change: restore * sm_scale

The * sm_scale factor is intentional: it scales the tile-skip threshold relative to head dimension, so larger head_dim (smaller sm_scale) produces more aggressive sparsity for the same lambda value. The previous 'fix' was incorrect.

Signed-off-by: Ye Yu <yeyu@nvidia.com>

1 parent c548f6f · commit 708f113

File tree: 1 file changed, +4 −16 lines


modelopt/torch/kernels/triton_fa.py

Lines changed: 4 additions & 16 deletions
@@ -1003,29 +1003,17 @@ def forward(
         BLOCK_D = triton.next_power_of_2(HEAD_DIM)
 
         # Skip-softmax: convert lambda threshold to log2 space for the kernel.
-        #
-        # BLASST (https://arxiv.org/pdf/2512.12087) checks the criterion on the
-        # sm_scale-SCALED attention logits a_ij = q·k / sqrt(d):
-        #
-        #     tile_max_a < running_max_a + ln(lambda)
-        #
-        # The Triton kernel stores scores as x = a * log2(e) (for exp2 efficiency),
-        # so a = x * ln(2). Substituting:
-        #
-        #     tile_max_x * ln(2) < running_max_x * ln(2) + ln(lambda)
-        #     tile_max_x < running_max_x + log2(lambda)
-        #
-        # Therefore the threshold in kernel (log2) space is simply log2(lambda).
-        # Do NOT multiply by sm_scale — that factor is already absorbed into the
-        # log2(e) conversion above.
+        # The threshold is scaled by sm_scale to control sparsity relative to
+        # head dimension: larger head_dim → smaller sm_scale → more aggressive
+        # skipping for the same lambda value.
         if quantize_p and (q.requires_grad or k.requires_grad or v.requires_grad):
             raise NotImplementedError(
                 "quantize_p supports inference only; backward does not model the quantized P path"
             )
 
         apply_skip = skip_softmax_threshold is not None and skip_softmax_threshold > 0.0
         if apply_skip:
-            skip_threshold_log2 = math.log2(skip_softmax_threshold)
+            skip_threshold_log2 = math.log2(skip_softmax_threshold) * sm_scale
         else:
             skip_threshold_log2 = 0.0
 
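To see why the restored `* sm_scale` factor makes skipping more aggressive for larger head dimensions, consider a small numeric sketch. The helper below is illustrative, not part of `triton_fa.py`; it assumes the conventional attention scaling `sm_scale = 1/sqrt(head_dim)`, and that for `lambda < 1` the threshold `log2(lambda)` is negative.

```python
import math

def skip_threshold_log2(lam: float, head_dim: int) -> float:
    """Hypothetical helper mirroring the restored formula:
    log2(lambda) * sm_scale, assuming sm_scale = 1/sqrt(head_dim)."""
    sm_scale = 1.0 / math.sqrt(head_dim)
    return math.log2(lam) * sm_scale

# For lambda = 0.5, log2(lambda) = -1:
t64 = skip_threshold_log2(0.5, 64)    # -1 * 0.125  = -0.125
t128 = skip_threshold_log2(0.5, 128)  # -1 * 0.0884 ≈ -0.088

# The larger head_dim (smaller sm_scale) yields a threshold closer to
# zero, so a skip criterion of the form
#     tile_max_x < running_max_x + threshold
# is satisfied by more tiles: more aggressive sparsity for the same
# lambda, as the commit message states.
assert t128 > t64
```

Without the `* sm_scale` factor, the threshold would be `log2(lambda)` regardless of head dimension, and the same lambda would prune identically at every head_dim; the multiplication is what ties sparsity to the scale of the logits.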