gemma4_31b TQ4 SDPA: add 5090-feasible autotune configs + comment updates

Gasoonjia · Gasoonjia · commit 2a93d5344c02 · 2026-06-25T09:40:26.000-07:00
diff --git a/backends/cuda/triton/kernels/tq4_sdpa.py b/backends/cuda/triton/kernels/tq4_sdpa.py
@@ -294,16 +294,14 @@ def _tq4_sdpa_fwd_kernel_body(
 
 @triton.autotune(
     configs=[
-        # No-spill prefill configs, pruned to the profiled-optimal set for the
-        # gemma4 global shape (heavy-shape optimum = BLOCK_M=32/BLOCK_N=32/w4/s2).
-        # BLOCK_M=32 keeps the fp32 acc[BLOCK_M, HEAD_DIM] in registers (BLOCK_M=64
-        # at HEAD_DIM=512 = 128 KB/CTA spills to local memory) and BLOCK_N<=64
-        # keeps the staged decompressed K/V tile within the A100 SMEM budget.
-        # BLOCK_M=16 / BLOCK_N=16 configs were pruned (slower; BLOCK_N=16 also
-        # measured low cosine ~0.79-0.93 at this shape).
         triton.Config({"BLOCK_M": 32, "BLOCK_N": 64}, num_warps=8, num_stages=2),
         triton.Config({"BLOCK_M": 32, "BLOCK_N": 32}, num_warps=4, num_stages=3),
         triton.Config({"BLOCK_M": 32, "BLOCK_N": 32}, num_warps=4, num_stages=2),
+        # Extra BLOCK_N in {32,64} configs for smaller-SMEM GPUs (e.g. RTX 5090);
+        # correctness-safe (cos~1.0), never BLOCK_N=16 (numerically wrong).
+        triton.Config({"BLOCK_M": 32, "BLOCK_N": 32}, num_warps=8, num_stages=2),
+        triton.Config({"BLOCK_M": 32, "BLOCK_N": 32}, num_warps=4, num_stages=4),
+        triton.Config({"BLOCK_M": 32, "BLOCK_N": 64}, num_warps=8, num_stages=3),
     ],
     key=["Lq", "Lk", "HEAD_DIM", "HAS_MASK", "IS_CAUSAL", "NUM_GROUPS", "PACK_GQA"],
 )
@@ -783,17 +781,14 @@ def tq4_sdpa(
 
 @triton.autotune(
     configs=[
-        # Split-K decode configs, curated to the profiled-optimal set so the
-        # HAS_MASK=False specialization (decode passes attn_mask=None too, for the
-        # AOTI weights-blob dedup) bakes a good config: BLOCK_N=32/w4/s2 is the
-        # primary optimum (964us@127K, 344us@32K), BLOCK_N=64/w8/s3 wins at 127K
-        # (914us), BLOCK_N=128/w8/s2 is a safe fallback. Other configs were pruned:
-        # BLOCK_N=64/w2/s1 (12.8ms), 128/w4/s{1,2,3} (up to 9.4ms) and 32/w2/s1 are
-        # catastrophic for HAS_MASK=False; the rest were not measured-optimal and
-        # are dropped so AOTI cannot bake a slow one (no autotune lottery).
         triton.Config({"BLOCK_N": 32}, num_warps=4, num_stages=2),
         triton.Config({"BLOCK_N": 64}, num_warps=8, num_stages=3),
         triton.Config({"BLOCK_N": 128}, num_warps=8, num_stages=2),
+        # Extra BLOCK_N in {32,64} configs for smaller-SMEM GPUs (e.g. RTX 5090);
+        # correctness-safe (cos~1.0), never BLOCK_N=16 (numerically wrong).
+        triton.Config({"BLOCK_N": 32}, num_warps=8, num_stages=2),
+        triton.Config({"BLOCK_N": 64}, num_warps=4, num_stages=2),
+        triton.Config({"BLOCK_N": 32}, num_warps=4, num_stages=3),
     ],
     key=["Lk", "HEAD_DIM", "NUM_GROUPS", "HAS_MASK", "PACK_GQA"],
 )
diff --git a/examples/models/gemma4_31b/cuda_source_transformations.py b/examples/models/gemma4_31b/cuda_source_transformations.py
@@ -52,6 +52,9 @@ def _turboquant_attention_forward(
 
     Mirrors the default forward up to (and including) RoPE; only the
     cache update and SDPA call differ.
+
+    NOTE: ``attn_mask`` is unused here and will be reconstructed in
+    the kernel to save data transfer, but is passed to the default forward.
     """
     B, T, _ = x.shape
 
@@ -94,15 +97,6 @@ def _turboquant_attention_forward(
     # step (catastrophic at 128k: ~2.7 tok/s decode vs ~37+ when bounded).
     kv_len = input_pos[0] + input_pos.shape[0]
 
-    # attn_mask=None for BOTH prefill and decode: tq4_sdpa applies causal masking
-    # analytically (mask_is_causal + kv_len, absolute causal-offset), so the SDPA
-    # call is identical across the two exported methods — AOTI dedups the shared
-    # weights blob (~26 GB). Prefill takes the no-spill analytic path; decode takes
-    # split-K with HAS_MASK=False, whose autotune list is curated (tq4_sdpa.py) to
-    # the profiled-optimal BLOCK_N configs, so HAS_MASK=False does not regress
-    # decode. ``scale=self.scaling`` (= 1.0 for Gemma 4) overrides tq4_sdpa's
-    # 1/sqrt(D) default (Gemma's QK-norm folded that factor into the weights).
-    sdpa_attn_mask = None
     y = torch.ops.triton.tq4_sdpa(
         q,
         k_packed,
@@ -111,7 +105,7 @@ def _turboquant_attention_forward(
         v_norms,
         self.kv_cache.centroids,
         self.kv_cache.rotation,
-        sdpa_attn_mask,
+        None,  # reconstruct attention mask in the kernel to save data transfer
         False,  # is_causal: needs L_q==L_kv; causal comes from mask_is_causal
         self.scaling,
         kv_len,