Commit 79a8168

Use unfused SDPA for short sequences (q_len <= 128 or kv_len <= 128)
ATT

Differential Revision: [D96044308](https://our.internmc.facebook.com/intern/diff/D96044308/)
ghstack-source-id: 361224789
Pull Request resolved: #18651
1 parent c234cd2 commit 79a8168

File tree

1 file changed: +7 −1 lines


extension/llm/custom_ops/op_sdpa.cpp

Lines changed: 7 additions & 1 deletion
```diff
@@ -412,7 +412,13 @@ Tensor& custom_sdpa_out_impl(
       InvalidArgument,
       output);
 
-  bool use_unfused_sdpa = seq_len == 1;
+  // Quantized GEMM kernels may not handle non-contiguous per-head strides
+  // correctly when seq_dim=ONE and seq_len > 1, so keep the conservative
+  // condition for quantized inputs.
+  bool is_quantized = q.scalar_type() == ScalarType::Char;
+  bool use_unfused_sdpa = is_quantized
+      ? (seq_len == 1)
+      : (seq_len <= 128 || num_keys_for_causal_attention <= 128);
   if (use_unfused_sdpa) {
     ET_SWITCH_FLOAT_TYPES(
         output.scalar_type(), ctx, "sdpa", CTYPE, [&] {
```
