NVIDIA
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
Lines changed: 7 additions & 0 deletions
diff --git a/modelopt/torch/kernels/triton_fa.py b/modelopt/torch/kernels/triton_fa.py
Lines changed: 8 additions & 4 deletions
diff --git a/tests/gpu/torch/sparsity/attention_sparsity/conftest.py b/tests/gpu/torch/sparsity/attention_sparsity/conftest.py
Lines changed: 72 additions & 0 deletions
@@ -1,6 +1,13 @@
 NVIDIA Model Optimizer Changelog
 ================================
 
+0.44 (2026-04-xx)
+^^^^^^^^^^^^^^^^^
+
+**New Features**
+
+- Add N:M sparse softmax support to the Triton flash attention kernel (``modelopt.torch.kernels.triton_fa``). Within every group of M consecutive key positions (M must be 4 or 8), the top-N attention scores are kept and the rest are set to ``-inf`` before softmax. For M=8, scores tied at the threshold may cause more than N values to be kept.
+
 0.43 (2026-03-xx)
 ^^^^^^^^^^^^^^^^^
 
 
@@ -111,6 +111,10 @@ def _apply_sparse_nm_to_qk_tile(
     For every ``SPARSITY_M`` consecutive elements along the N (key) dimension,
     keeps the top ``SPARSITY_N`` values and sets the rest to ``-inf``.
     ``BLOCK_N`` must be divisible by ``SPARSITY_M``.
+
+    For M=4, exactly N values are retained (ties broken by position).
+    For M=8, a threshold-based approach (``tl.sort``) may retain more
+    than N values when ties straddle the threshold boundary.
     """
     tl.static_assert(SPARSITY_M == 4 or SPARSITY_M == 8, "SPARSITY_M must be 4 or 8")  # noqa: PLR1714
     MASK_VAL: tl.constexpr = float("-inf")
@@ -141,7 +145,7 @@ def _apply_sparse_nm_to_qk_tile(
         sorted_vals = tl.sort(reshaped, dim=2)
         KTH_IDX: tl.constexpr = SPARSITY_M - SPARSITY_N  # index of N-th largest in ascending order
 
-        # Extract the threshold value (one extraction vs eight before)
+        # Extract the threshold value at KTH_IDX via masked sum
         # Use 0.0 as fill (not -inf) so sum equals just the KTH element
         cols = tl.arange(0, 8)[None, None, :]
         threshold = tl.sum(tl.where(cols == KTH_IDX, sorted_vals, 0.0), axis=2)
@@ -272,7 +276,7 @@ def _attn_fwd(
         scores = tl.dot(q, k) * qk_scale
         scores = _apply_mask(scores, q_pos, kv_pos, seq_len_q, seq_len_kv, kv_start, IS_CAUSAL)
 
-        # --- Optional 2:4 structured sparsity ---
+        # --- Optional N:M structured sparsity ---
         if SPARSITY_N > 0:
             # Check if this KV tile should be kept dense
             is_sink = kv_start < NUM_SINK_TOKENS
@@ -473,7 +477,7 @@ def _attn_bwd_dq(
         scores = tl.dot(q, kT) * qk_scale
         scores = _apply_mask(scores, q_pos, kv_pos, seq_len_q, seq_len_kv, kv_start, IS_CAUSAL)
 
-        # Re-apply 2:4 sparsity to match forward pass
+        # Re-apply N:M sparsity to match forward pass
         if SPARSITY_N > 0:
             is_sink = kv_start < NUM_SINK_TOKENS
             causal_offset = seq_len_kv - seq_len_q
@@ -613,7 +617,7 @@ def _attn_bwd_dkdv(
             scores = tl.dot(q_tile, kT) * qk_scale
             scores = _apply_mask(scores, q_pos, kv_pos, seq_len_q, seq_len_kv, kv_start, IS_CAUSAL)
 
-            # Re-apply 2:4 sparsity to match forward pass
+            # Re-apply N:M sparsity to match forward pass
             if SPARSITY_N > 0:
                 is_sink = kv_start < NUM_SINK_TOKENS
                 causal_offset = seq_len_kv - seq_len_q
 
@@ -0,0 +1,72 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Shared fixtures and helpers for Triton flash attention tests."""
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+
+def make_qkv(total, num_heads, num_kv_heads, head_dim, device="cuda", dtype=torch.float16):
+    """Build random packed query/key/value tensors for attention tests.
+
+    Args:
+        total: Size of the leading (packed token) dimension shared by Q, K and V.
+        num_heads: Number of heads in the query tensor.
+        num_kv_heads: Number of heads in the key and value tensors.
+        head_dim: Size of the trailing per-head dimension.
+        device: Device the tensors are allocated on.
+        dtype: Element type of the tensors.
+
+    Returns:
+        Tuple ``(q, k, v)`` sampled with ``torch.randn``; ``q`` has shape
+        ``[total, num_heads, head_dim]`` and ``k``/``v`` have shape
+        ``[total, num_kv_heads, head_dim]``.
+    """
+    tensor_opts = {"device": device, "dtype": dtype}
+    kv_shape = (total, num_kv_heads, head_dim)
+    # Sample strictly in Q, K, V order so seeded runs draw values in a fixed sequence.
+    q = torch.randn((total, num_heads, head_dim), **tensor_opts)
+    k = torch.randn(kv_shape, **tensor_opts)
+    v = torch.randn(kv_shape, **tensor_opts)
+    return q, k, v
+
+
+def make_varlen_meta(seq_lens, device="cuda"):
+    """Build the per-sequence offset and length tensors for a packed batch.
+
+    Args:
+        seq_lens: List with the length of each sequence, in packing order.
+        device: Device the returned tensors are allocated on.
+
+    Returns:
+        Tuple ``(b_start_loc, b_seq_len)`` of ``int32`` tensors with one entry per
+        sequence: ``b_seq_len`` holds the lengths and ``b_start_loc`` holds the sum
+        of all preceding lengths (so its first entry is 0).
+    """
+    int_opts = {"device": device, "dtype": torch.int32}
+    lengths = torch.tensor(seq_lens, **int_opts)
+    offsets = torch.zeros(len(seq_lens), **int_opts)
+    # Exclusive prefix sum: sequence i starts where sequences 0..i-1 end.
+    offsets[1:] = lengths[:-1].cumsum(dim=0)
+    return offsets, lengths
+
+
+def sdpa_reference(q, k, v, b_start_loc, b_seq_len, is_causal=True):
+    """Compute reference attention for a packed variable-length batch with SDPA.
+
+    Each sequence is sliced out of the packed tensors, run through
+    ``F.scaled_dot_product_attention`` on its own, and the per-sequence outputs are
+    concatenated back in order. Supports GQA by repeating K/V heads.
+
+    Args:
+        q: Packed queries, indexed as ``[token, head, dim]``.
+        k: Packed keys, indexed as ``[token, kv_head, dim]``.
+        v: Packed values, same layout as ``k``.
+        b_start_loc: Tensor with the first token index of each sequence.
+        b_seq_len: Tensor with the number of tokens in each sequence.
+        is_causal: Forwarded to ``F.scaled_dot_product_attention``.
+
+    Returns:
+        Tensor of shape ``[total_tokens, num_heads, dim]``.
+    """
+    num_q, num_kv = q.shape[1], k.shape[1]
+    # Pull all offsets/lengths to the host in one transfer each, instead of two
+    # ``.item()`` calls (each a device-to-host copy for GPU tensors) per sequence.
+    starts = b_start_loc.tolist()
+    lengths = b_seq_len.tolist()
+
+    def to_sdpa_layout(packed, rows):
+        # [n, heads, dim] -> [1, heads, n, dim], the layout passed to SDPA.
+        return packed[rows].unsqueeze(0).permute(0, 2, 1, 3)
+
+    parts = []
+    for b, seq_len in enumerate(lengths):
+        s, n = int(starts[b]), int(seq_len)
+        rows = slice(s, s + n)
+        qb = to_sdpa_layout(q, rows)
+        kb = to_sdpa_layout(k, rows)
+        vb = to_sdpa_layout(v, rows)
+        if num_q != num_kv:
+            # GQA: expand each KV head so it lines up with its group of query heads.
+            r = num_q // num_kv
+            kb = kb.repeat_interleave(r, dim=1)
+            vb = vb.repeat_interleave(r, dim=1)
+        ob = F.scaled_dot_product_attention(qb, kb, vb, is_causal=is_causal)
+        # Back to the packed [n, heads, dim] layout.
+        parts.append(ob.permute(0, 2, 1, 3).squeeze(0))
+    return torch.cat(parts, dim=0)
+
+
+@pytest.fixture(scope="module")
+def tiny_llama_dir(tmp_path_factory):
+    """Module-scoped fixture for a tiny Llama model directory.
+
+    Configuration: 2 layers, 64 hidden, 4 q-heads, 2 kv-heads, head_dim=16.
+    Returns whatever ``create_tiny_llama_dir`` returns (presumably the directory
+    path; confirm against the helper).
+    """
+    from _test_utils.torch.transformers_models import create_tiny_llama_dir
+
+    model_config = {
+        "num_hidden_layers": 2,
+        "hidden_size": 64,
+        "num_attention_heads": 4,
+        "num_key_value_heads": 2,
+        "intermediate_size": 64,
+        "max_position_embeddings": 64,
+    }
+    output_dir = tmp_path_factory.mktemp("tiny_llama")
+    return create_tiny_llama_dir(output_dir, with_tokenizer=True, **model_config)