NVIDIA
diff --git a/benchmarks/linear/benchmark_grouped_linear.py b/benchmarks/linear/benchmark_grouped_linear.py
Lines changed: 3 additions & 0 deletions
diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh
Lines changed: 1 addition & 0 deletions
diff --git a/tests/pytorch/attention/test_attention.py b/tests/pytorch/attention/test_attention.py
Lines changed: 26 additions & 6 deletions
@@ -3,6 +3,7 @@
 # See LICENSE for license information.
 
 import argparse
+import os
 import torch
 import torch.utils.benchmark as benchmark
 import pandas as pd
@@ -185,6 +186,8 @@ def run_benchmark_linear(
         x = torch.randn((m, k), dtype=torch.bfloat16, device=device, requires_grad=True)
         ws = [torch.randn((n, k), dtype=torch.bfloat16, device=device) for _ in range(num_gemms)]
         m_splits = [m // num_gemms] * num_gemms if m_splits_provided is None else m_splits_provided
+        if bool(int(os.getenv("NVTE_GROUPED_LINEAR_USE_FUSED_GROUPED_GEMM", "0"))):
+            m_splits = torch.tensor(m_splits, dtype=torch.int64, device=device)
         # Bias is not supported for GroupedLinear benchmark
         bias = None
 
 
@@ -29,6 +29,7 @@ python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_recipe.xml $TE_P
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_custom_recipe.xml $TE_PATH/tests/pytorch/test_custom_recipe.py || test_fail "test_custom_recipe.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_deferred_init.xml $TE_PATH/tests/pytorch/test_deferred_init.py || test_fail "test_deferred_init.py"
 PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/test_numerics.py || test_fail "test_numerics.py"
+PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_grouped_linear.xml $TE_PATH/tests/pytorch/test_grouped_linear.py || test_fail "test_grouped_linear.py"
 PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_cuda_graphs.xml $TE_PATH/tests/pytorch/test_cuda_graphs.py || test_fail "test_cuda_graphs.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_jit.xml $TE_PATH/tests/pytorch/test_jit.py || test_fail "test_jit.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_rope.xml $TE_PATH/tests/pytorch/test_fused_rope.py || test_fail "test_fused_rope.py"
 
@@ -344,12 +344,36 @@ def test_dpa_num_splits(dtype, model_configs, model):
 @pytest.mark.skipif(
     not FlashAttentionUtils.v4_is_installed, reason="Flash-attn v4 (flash-attn-4) is required."
 )
-@pytest.mark.skipif(get_cudnn_version() < (8, 9, 1), reason="cuDNN 8.9.1+ is required.")
 @pytest.mark.parametrize("dtype", param_types_lean)
 @pytest.mark.parametrize("model_configs", [model_configs_fa4_base])
 @pytest.mark.parametrize("model", model_configs_fa4_base.keys())
 def test_dpa_fa4_base(dtype, model_configs, model):
-    """Test DotProductAttention with FA4: base configs, extended head dims, GQA, num_splits"""
+    """Test DotProductAttention with FA4: base configs, GQA, num_splits"""
+    test_dot_product_attention(dtype, model_configs, model, False, True, None, False, False)
+
+
+# head_dim=256 is supported only on SM100/SM103 via FA4's dedicated kernel
+# (flash_attn/cute/sm100_hd256_2cta_fmha_*.py), available in flash-attn-4 > 4.0.0b10.
+# On other architectures, _validate_head_dims rejects (256, 256), FA4 is disabled, and
+# the test would silently fall back to another backend, defeating its purpose. Gate
+# explicitly on compute capability so the CI signal is unambiguous.
+model_configs_fa4_hdim256 = {
+    "fa4_hdim256": ModelConfig(2, 1024, 8, 256, attn_mask_type="causal"),
+}
+
+
+@pytest.mark.skipif(
+    not FlashAttentionUtils.v4_is_installed, reason="Flash-attn v4 (flash-attn-4) is required."
+)
+@pytest.mark.skipif(
+    device_compute_capability not in ((10, 0), (10, 3)),
+    reason="FA4 head_dim=256 dedicated kernel is SM100/103-only.",
+)
+@pytest.mark.parametrize("dtype", param_types_lean)
+@pytest.mark.parametrize("model_configs", [model_configs_fa4_hdim256])
+@pytest.mark.parametrize("model", model_configs_fa4_hdim256.keys())
+def test_dpa_fa4_hdim256(dtype, model_configs, model):
+    """Test DotProductAttention with FA4: head_dim=256 dedicated kernel on SM100/SM103"""
     test_dot_product_attention(dtype, model_configs, model, False, True, None, False, False)
 
 
@@ -369,7 +393,6 @@ def test_dpa_fa4_base(dtype, model_configs, model):
 @pytest.mark.skipif(
     not FlashAttentionUtils.v4_is_installed, reason="Flash-attn v4 (flash-attn-4) is required."
 )
-@pytest.mark.skipif(get_cudnn_version() < (8, 9, 1), reason="cuDNN 8.9.1+ is required.")
 @pytest.mark.parametrize("dtype", param_types_lean)
 @pytest.mark.parametrize("model_configs", [model_configs_fa4_mla])
 @pytest.mark.parametrize("model", model_configs_fa4_mla.keys())
@@ -396,7 +419,6 @@ def test_dpa_fa4_mla(dtype, model_configs, model):
 @pytest.mark.skipif(
     not FlashAttentionUtils.v4_is_installed, reason="Flash-attn v4 (flash-attn-4) is required."
 )
-@pytest.mark.skipif(get_cudnn_version() < (8, 9, 1), reason="cuDNN 8.9.1+ is required.")
 @pytest.mark.parametrize("dtype", param_types_lean)
 @pytest.mark.parametrize("model_configs", [model_configs_fa4_swa])
 @pytest.mark.parametrize("model", model_configs_fa4_swa.keys())
@@ -420,7 +442,6 @@ def test_dpa_fa4_sliding_window(dtype, model_configs, model, qkv_layout):
 @pytest.mark.skipif(
     not FlashAttentionUtils.v4_is_installed, reason="Flash-attn v4 (flash-attn-4) is required."
 )
-@pytest.mark.skipif(get_cudnn_version() < (8, 9, 1), reason="cuDNN 8.9.1+ is required.")
 @pytest.mark.parametrize("dtype", param_types_lean)
 @pytest.mark.parametrize("model_configs", [model_configs_fa4_varlen])
 @pytest.mark.parametrize("model", model_configs_fa4_varlen.keys())
@@ -446,7 +467,6 @@ def test_dpa_fa4_varlen(dtype, model_configs, model, qkv_layout):
 @pytest.mark.skipif(
     not FlashAttentionUtils.v4_is_installed, reason="Flash-attn v4 (flash-attn-4) is required."
 )
-@pytest.mark.skipif(get_cudnn_version() < (8, 9, 1), reason="cuDNN 8.9.1+ is required.")
 @pytest.mark.parametrize("dtype", param_types_lean)
 @pytest.mark.parametrize("model_configs", [model_configs_fa4_mask])
 @pytest.mark.parametrize("model", model_configs_fa4_mask.keys())