Guard max_logit in FusedAttention for cuDNN < 9.21.0

francesco-bertolotti · francesco-bertolotti · commit ae53b5b0ba54 · 2026-06-05T11:49:26.000+02:00
Signed-off-by: Francesco Bertolotti <francesco.bertolotti@igenius.ai>
diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp
@@ -455,6 +455,10 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend(
         (cudnn_runtime_version >= 91301 ||
          (cudnn_runtime_version < 91301 &&
           softmax_type == NVTE_Softmax_Type::NVTE_VANILLA_SOFTMAX)) &&
+        // max_logit
+        // pre-9.21: no (the composite softmax node rejects the Stats + Max output combination)
+        // 9.21+: yes (Stats + Max via the unified softmax node)
+        (!return_max_logit || cudnn_runtime_version >= 92100) &&
         // determinism on Blackwell
         // pre-9.18.1: fwd: deterministic; bwd: non-deterministic
         // 9.18.1+: fwd: deterministic; bwd: non-deterministic/deterministic
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/utils.py b/transformer_engine/pytorch/attention/dot_product_attention/utils.py
@@ -672,6 +672,13 @@ def _disable_all_flash_attention() -> None:
         if use_flash_attention:
             use_flash_attention = False
             logger.debug("Disabling FlashAttention for max_logit")
+        # FusedAttention emits max_logit alongside the softmax stats, which cuDNN only
+        # supports through the unified softmax node introduced in cuDNN 9.21.0. On older
+        # cuDNN the composite softmax node rejects the stats+max combination, so fall back
+        # to UnfusedDotProductAttention.
+        if use_fused_attention and cudnn_version < (9, 21, 0):
+            use_fused_attention = False
+            logger.debug("Disabling FusedAttention for max_logit for cuDNN < 9.21.0")
         if fp8 and fp8_meta["recipe"].fp8_dpa:
             use_flash_attention = False
             use_fused_attention = False