Fix decode calibration: full-cache kv_bound + 128x128 block to match PyTorch

kaix-nv · kaix-nv · commit 422a5f0afbcb · 2026-06-02T16:45:32.000-07:00
Signed-off-by: Kai Xu <kaix@nvidia.com>
diff --git a/modelopt/torch/kernels/common/attention/triton_fa.py b/modelopt/torch/kernels/common/attention/triton_fa.py
@@ -80,7 +80,10 @@ def _load_sparsity_helpers() -> None:
     _FWD_CONFIGS = [triton.Config({"BLOCK_M": 128, "BLOCK_N": 64}, num_stages=1, num_warps=4)]
 
 _MEASURE_BLOCK_M = 128
-_MEASURE_BLOCK_N = 64
+# 128 (not 64) so the kernel sparsity-measurement block matches the PyTorch
+# flash_skip_softmax calibration block (br = bc = 128) and the Triton
+# calibration kernel; otherwise the two measure at different granularities.
+_MEASURE_BLOCK_N = 128
 _MEASURE_NUM_STAGES = 1
 _MEASURE_NUM_WARPS = 4
 
diff --git a/modelopt/torch/kernels/sparsity/attention/calibrate.py b/modelopt/torch/kernels/sparsity/attention/calibrate.py
@@ -111,7 +111,17 @@ def _attn_fwd_calibrate(
     local_skipped = tl.zeros([PADDED_THRESHOLDS], dtype=tl.int32)
     num_tiles = 0
 
-    kv_bound = seq_len_kv if not IS_CAUSAL else tl.minimum((tile_q + 1) * BLOCK_M, seq_len_kv)
+    # Causal bound: when Q is a suffix of KV (decode: seq_len_q == 1 against a
+    # long cache; or chunked prefill), the visible KV extends to
+    # causal_offset + (tile_q + 1) * BLOCK_M. Without the offset the loop stops
+    # at the first BLOCK_M KV tokens, so decode would only ever measure the
+    # start of the cache instead of the whole thing.
+    causal_offset = seq_len_kv - seq_len_q
+    kv_bound = (
+        seq_len_kv
+        if not IS_CAUSAL
+        else tl.minimum(causal_offset + (tile_q + 1) * BLOCK_M, seq_len_kv)
+    )
 
     for kv_start in range(0, kv_bound, BLOCK_N):
         kv_start = tl.multiple_of(kv_start, BLOCK_N)
@@ -261,8 +271,10 @@ def attention_calibrate(
     sm_scale = 1.0 / (HEAD_DIM**0.5) if softmax_scale is None else softmax_scale
     qk_scale = sm_scale * LOG2E
     BLOCK_D = triton.next_power_of_2(HEAD_DIM)
+    # 128x128 to match the PyTorch flash_skip_softmax calibration block (br = bc = 128),
+    # so Triton-kernel and PyTorch calibration measure sparsity at the same granularity.
     BLOCK_M = 128
-    BLOCK_N = 64
+    BLOCK_N = 128
 
     if b_seq_len_k is None:
         b_seq_len_k = b_seq_len
diff --git a/tests/gpu/torch/kernels/sparsity/attention/test_triton_fa_calibrate.py b/tests/gpu/torch/kernels/sparsity/attention/test_triton_fa_calibrate.py
@@ -319,7 +319,9 @@ def test_first_measured_call_has_real_tile_count_with_autotune(self):
         assert result.returncode == 0, result.stderr
         totals = [line for line in result.stdout.splitlines() if line.startswith("TOTAL=")]
         assert totals, result.stdout
-        assert int(totals[-1].split("=", maxsplit=1)[1]) == 8
+        # seq_len=256, _MEASURE_BLOCK_M = _MEASURE_BLOCK_N = 128, non-causal:
+        # Q tiles = ceil(256/128) = 2, KV tiles = ceil(256/128) = 2, total = 4.
+        assert int(totals[-1].split("=", maxsplit=1)[1]) == 4
 
     def test_measure_sparsity_without_skip_is_noop(self):
         """Without skip-softmax, measure_sparsity doesn't attach counters."""