requested changes

francesco-bertolotti · francesco-bertolotti · commit 28608a796a1c · 2026-06-11T06:46:45.000+02:00
Signed-off-by: Francesco Bertolotti <francesco.bertolotti@igenius.ai>
diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu
@@ -86,7 +86,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
   const auto cudnn_runtime_version = cudnnGetVersion();
   const int device_id = cuda::current_device();
   const int sm_arch_ = cuda::sm_arch(device_id);
-  bool use_ragged_stats = is_ragged_q && cudnn_runtime_version >= 90600 && sm_arch_ != 120;
+  bool use_ragged_stats = is_ragged_q && sm_arch_ >= 90 && cudnn_runtime_version >= 90600 && sm_arch_ != 120;
 
   NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout);
   bool is_paged_kv = (layout_group == NVTE_QKV_Layout_Group::NVTE_Paged_KV_HD_HD_HD);
@@ -98,10 +98,8 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
   int64_t actual_b = b;
   if ((is_ragged_q || is_ragged_kv) && cudnn_runtime_version >= 90600) {
     NVTE_CHECK(is_padding, "Ragged QKV input requires padding or padding_causal mask!");
-    // On SM 120, cuDNN support check treats layouts with stride[0] > dim[1]*dim[2]*dim[3]
-    // as interleaved and rejects them. Use BHSD-like dimensions/strides with max_seqlen at plan build
-    // so the check passes; ragged offset still provides variable-length boundaries.
-    if (sm_arch_ != 120) {
+    // Pre-SM90 and SM120: cuDNN requires BHSD-like strides with max_seqlen at plan build.
+    if (sm_arch_ >= 90 && sm_arch_ != 120) {
       // replace batch size and maximum sequence lengths with maximum token counts
       // for query and key/value so the graph is static within each quantization bucket
       b = max_b;
@@ -385,7 +383,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
       }
 
       Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT).set_dim({b, h, s_q, 1});
-      if (is_ragged_q && cudnn_runtime_version >= 90600) {
+      if (use_ragged_stats) {
         Stats->set_stride({h * s_q, 1, h, 1}).set_ragged_offset(offset_stats);
       } else {
         Stats->set_stride({h * s_q, s_q, 1, 1});
@@ -590,7 +588,7 @@ void fused_attn_arbitrary_seqlen_bwd_impl(
   const auto cudnn_runtime_version = cudnnGetVersion();
   const int device_id = cuda::current_device();
   const int sm_arch_ = cuda::sm_arch(device_id);
-  bool use_ragged_stats = is_ragged_q && cudnn_runtime_version >= 90600 && sm_arch_ != 120;
+  bool use_ragged_stats = is_ragged_q && sm_arch_ >= 90 && cudnn_runtime_version >= 90600 && sm_arch_ != 120;
 
   NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout);
   bool is_paged_kv = (layout_group == NVTE_QKV_Layout_Group::NVTE_Paged_KV_HD_HD_HD);
@@ -602,8 +600,8 @@ void fused_attn_arbitrary_seqlen_bwd_impl(
   int64_t actual_b = b;
   if ((is_ragged_q || is_ragged_kv) && cudnn_runtime_version >= 90600) {
     NVTE_CHECK(is_padding, "Ragged QKV input requires padding or padding_causal mask!");
-    // On SM 120, cuDNN support check requires BHSD-like strides with max_seqlen (see fwd).
-    if (sm_arch_ != 120) {
+    // Pre-SM90 and SM120: cuDNN requires BHSD-like strides with max_seqlen at plan build.
+    if (sm_arch_ >= 90 && sm_arch_ != 120) {
       // replace batch size and maximum sequence lengths with maximum token counts
       // for query and key/value so the graph is static within each quantization bucket
       b = max_b;
@@ -805,7 +803,7 @@ void fused_attn_arbitrary_seqlen_bwd_impl(
       if (use_ragged_stats) {
         sdpa_backward_options.set_max_total_seq_len_q(s_q);
       }
-      if (is_ragged_kv && cudnn_runtime_version >= 90600 && sm_arch_ != 120) {
+      if (is_ragged_kv && sm_arch_ >= 90 && cudnn_runtime_version >= 90600 && sm_arch_ != 120) {
         sdpa_backward_options.set_max_total_seq_len_kv(s_kv);
       }
 
@@ -1139,10 +1137,13 @@ void fused_attn_arbitrary_seqlen_fwd(
   size_t i = 0;
   if (Aux_CTX_Tensors->size == 0) {
     const auto cudnn_runtime_version = cudnnGetVersion();
+    bool is_ragged_q = (q_format == NVTE_QKV_Format::NVTE_THD);
+    bool use_ragged_stats =
+        is_ragged_q && sm_arch_ >= 90 && cudnn_runtime_version >= 90600 && sm_arch_ != 120;
 
     Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
     output_S->data.dptr = nullptr;
-    if (q_format == NVTE_QKV_Format::NVTE_THD && cudnn_runtime_version >= 90600) {
+    if (use_ragged_stats) {
       output_S->data.shape = {num_tokens_q, num_attn_heads, 1};
     } else {
       output_S->data.shape = {batch, num_attn_heads, max_seqlen_q, 1};
@@ -1152,8 +1153,7 @@ void fused_attn_arbitrary_seqlen_fwd(
     if (return_max_logit) {
       Tensor *output_Max = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]);
       output_Max->data.dptr = nullptr;
-      if ((q_format == NVTE_QKV_Format::NVTE_THD && cudnn_runtime_version >= 90600) &&
-          (sm_arch_ != 120)) {
+      if (use_ragged_stats) {
         output_Max->data.shape = {num_tokens_q, num_attn_heads, 1};
       } else {
         output_Max->data.shape = {batch, num_attn_heads, max_seqlen_q, 1};
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/utils.py b/transformer_engine/pytorch/attention/dot_product_attention/utils.py
@@ -965,7 +965,7 @@ def _is_fa3_supported(num_heads, num_gqa_groups, head_dim_qk, head_dim_v, qkv_dt
                 use_flash_attention_4 = False
 
     # Filter: QKV layout
-    if qkv_format == "thd":
+    if "thd" in (q_format, kv_format):
         if pad_between_seqs:
             if (  # pylint: disable=too-many-boolean-expressions
                 use_flash_attention_2 and FlashAttentionUtils.is_installed