Add THD guard for pre-sm90 GPUs (e.g. A100): require cuDNN 9.18.1+ for FusedAttention

francesco-bertolotti · francesco-bertolotti · commit 97557451b29e · 2026-06-05T14:20:54.000+02:00
Signed-off-by: Francesco Bertolotti <francesco.bertolotti@igenius.ai>
diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp
@@ -399,16 +399,22 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend(
         // qkv format
         (qkv_format == NVTE_QKV_Format::NVTE_SBHD || qkv_format == NVTE_QKV_Format::NVTE_BSHD ||
          qkv_format == NVTE_QKV_Format::NVTE_BHSD ||
-         (qkv_format == NVTE_QKV_Format::NVTE_THD && sm_arch_ >= 90 &&
+         (qkv_format == NVTE_QKV_Format::NVTE_THD &&
+          (sm_arch_ >= 90 || cudnn_runtime_version >= 91801) &&
           ((cudnn_runtime_version >= 90100 && num_attn_heads == num_gqa_groups) ||
            cudnn_runtime_version >= 90600)) ||
          ((q_format == NVTE_QKV_Format::NVTE_SBHD || q_format == NVTE_QKV_Format::NVTE_BSHD ||
            q_format == NVTE_QKV_Format::NVTE_BHSD ||
-           (q_format == NVTE_QKV_Format::NVTE_THD && sm_arch_ >= 90) ||
+           (q_format == NVTE_QKV_Format::NVTE_THD &&
+            (sm_arch_ >= 90 || cudnn_runtime_version >= 91801)) ||
            kv_format == NVTE_QKV_Format::NVTE_SBHD || kv_format == NVTE_QKV_Format::NVTE_BSHD ||
            kv_format == NVTE_QKV_Format::NVTE_BHSD ||
-           (kv_format == NVTE_QKV_Format::NVTE_THD && sm_arch_ >= 90)) &&
+           (kv_format == NVTE_QKV_Format::NVTE_THD &&
+            (sm_arch_ >= 90 || cudnn_runtime_version >= 91801))) &&
           cudnn_runtime_version >= 90700)) &&
+        // THD (ragged offset) support: Ampere/Ada (sm80/sm89) only from cuDNN 9.18.1
+        ((q_format != NVTE_QKV_Format::NVTE_THD && kv_format != NVTE_QKV_Format::NVTE_THD) ||
+         sm_arch_ >= 90 || cudnn_runtime_version >= 91801) &&
         // sliding window
         // pre-9.2: full attn, causal
         ((cudnn_runtime_version < 90200 && window_size_left == -1 &&
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/utils.py b/transformer_engine/pytorch/attention/dot_product_attention/utils.py
@@ -996,6 +996,18 @@ def _is_fa3_supported(num_heads, num_gqa_groups, head_dim_qk, head_dim_v, qkv_dt
                         qkv_layout,
                     )
                 use_fused_attention = False
+    # THD support on Ampere/Ada requires cuDNN 9.18.1+ ("SDPA backward with THD layout on
+    # RTX-PRO 6000 and Ampere-architecture GPUs"). Check q_format/kv_format, not just
+    # qkv_format, since KV-cache layouts (e.g. paged_kv_thd_bshd_bshd) have
+    # qkv_format = thd_2bshd.
+    if "thd" in (q_format, kv_format) and device_compute_capability < (9, 0):
+        if cudnn_version < (9, 18, 1):
+            if use_fused_attention:
+                logger.debug(
+                    "Disabling FusedAttention as qkv_format = thd is not supported for"
+                    " compute capability < sm90 and cuDNN version < 9.18.1"
+                )
+            use_fused_attention = False
 
     # Filter: Dropout
     if attention_dropout != 0.0: