Fix MHA CrossAttention regression: use num_heads_ for non-GQA paths

feich-ms · claude · feich-ms · commit 83182ea3cd6e · 2026-05-14T15:19:32.000+08:00
The internal present KV buffer shape must use num_heads_ for MHA
(where kv_num_heads_ is 0) and kv_num_heads_ only for GQA. Using
kv_num_heads_ unconditionally produced zero-sized buffers on the MHA
path, which broke the CrossAttention tests.

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
@@ -423,14 +423,15 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co
   // Create present_key and present_value tensors if they are nullptr
   Tensor internal_present_key;
   Tensor internal_present_value;
+  const int present_kv_heads = parameters.is_gqa_ ? parameters.kv_num_heads_ : parameters.num_heads_;
   if (present_key == nullptr) {
-    TensorShapeVector present_kv_shape({parameters.batch_size_, parameters.kv_num_heads_,
+    TensorShapeVector present_kv_shape({parameters.batch_size_, present_kv_heads,
                                         parameters.total_sequence_length_, parameters.head_size_});
     internal_present_key = context.CreateGPUTensor(Q->DataType(), TensorShape(present_kv_shape));
     present_key = &internal_present_key;
   }
   if (present_value == nullptr) {
-    TensorShapeVector present_kv_shape({parameters.batch_size_, parameters.kv_num_heads_,
+    TensorShapeVector present_kv_shape({parameters.batch_size_, present_kv_heads,
                                         parameters.total_sequence_length_, parameters.head_size_});
     internal_present_value = context.CreateGPUTensor(Q->DataType(), TensorShape(present_kv_shape));
     present_value = &internal_present_value;