Fix kv_empty layers failing with sliding window on long sequences

feich-ms · claude · feich-ms · commit 607b4e8b0dd1 · 2026-05-26T17:47:13.000+08:00
When total_sequence_length exceeds local_window_size, the sliding window
check was blocking flash attention for kv_empty (shared KV) layers. This
is incorrect because sliding window is irrelevant for these layers — they
have no local KV cache and reuse another layer's already-computed cache.
Add regression test for this case.

Co-Authored-By: Claude Opus 4 &lt;noreply@anthropic.com&gt;
diff --git a/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc
@@ -283,7 +283,10 @@ Status GroupQueryAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext&
   // Use a sliding window if the total sequence exceeds the window's length.
   bool use_sliding_window = (local_window_size_ != -1 && local_window_size_ < parameters.total_sequence_length_);
   bool will_use_flash_attention = false;
-  if (!use_smooth_softmax_ && !use_sliding_window) {
+  // For kv_empty layers (shared KV), sliding window is irrelevant — there's no new KV to window
+  // over, the layer reuses another layer's already-computed KV cache. Flash attention is required
+  // for these layers, so we bypass the sliding window check to allow it.
+  if (!use_smooth_softmax_ && (!use_sliding_window || kv_empty)) {
     // Create a temporary parameters copy with is_packed_qkv_ set to false to check if flash attention can be applied after unpacking
     WebgpuAttentionParameters temp_params = parameters;
     temp_params.is_packed_qkv_ = false;
diff --git a/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc b/onnxruntime/test/contrib_ops/group_query_attention_op_test.cc
@@ -1854,5 +1854,68 @@ TEST(GroupQueryAttentionTest, WebGPU_SharedKV_Rotary) {
   ExpectOutputsMatch(webgpu_output, cpu_output, 0.05f, "SharedKV_Rotary_WebGPU_vs_CPU");
 }
 
+// WebGPU: kv_sequence_length=0 with sliding window active (total_seq > local_window_size).
+// Regression test: sliding window must not block flash attention for kv_empty layers.
+TEST(GroupQueryAttentionTest, WebGPU_SharedKV_SlidingWindow) {
+  auto webgpu_ep = DefaultWebGpuExecutionProvider();
+  if (!webgpu_ep) {
+    GTEST_SKIP() << "WebGPU EP not available";
+  }
+
+  constexpr int batch_size = 1;
+  constexpr int q_seq_len = 4;
+  constexpr int past_seq_len = 32;
+  constexpr int num_heads = 2;
+  constexpr int kv_num_heads = 1;
+  constexpr int head_size = 8;
+  constexpr int hidden_size = num_heads * head_size;
+  constexpr int kv_hidden_size = kv_num_heads * head_size;
+  constexpr int local_window_size = 16;  // < past_seq_len to trigger sliding window
+  constexpr int total_seq_len = past_seq_len;
+
+  OpTester tester("GroupQueryAttention", 1, onnxruntime::kMSDomain);
+  tester.AddAttribute<int64_t>("num_heads", static_cast<int64_t>(num_heads));
+  tester.AddAttribute<int64_t>("kv_num_heads", static_cast<int64_t>(kv_num_heads));
+  tester.AddAttribute<int64_t>("local_window_size", static_cast<int64_t>(local_window_size));
+
+  std::vector<float> query_data(batch_size * q_seq_len * hidden_size);
+  std::vector<float> past_key_data(batch_size * kv_num_heads * past_seq_len * head_size);
+  std::vector<float> past_value_data(batch_size * kv_num_heads * past_seq_len * head_size);
+  for (size_t i = 0; i < query_data.size(); i++) query_data[i] = 0.1f * static_cast<float>(i % 7 + 1);
+  for (size_t i = 0; i < past_key_data.size(); i++) past_key_data[i] = 0.2f * static_cast<float>(i % 5 + 1);
+  for (size_t i = 0; i < past_value_data.size(); i++) past_value_data[i] = 0.3f * static_cast<float>(i % 3 + 1);
+
+  tester.AddInput<float>("query", {batch_size, q_seq_len, hidden_size}, query_data);
+  tester.AddInput<float>("key", {batch_size, 0, kv_hidden_size}, {});
+  tester.AddInput<float>("value", {batch_size, 0, kv_hidden_size}, {});
+  tester.AddInput<float>("past_key", {batch_size, kv_num_heads, past_seq_len, head_size}, past_key_data);
+  tester.AddInput<float>("past_value", {batch_size, kv_num_heads, past_seq_len, head_size}, past_value_data);
+
+  std::vector<int32_t> seqlens_k_data(batch_size, static_cast<int32_t>(total_seq_len - 1));
+  tester.AddInput<int32_t>("seqlens_k", {batch_size}, seqlens_k_data);
+  tester.AddInput<int32_t>("total_sequence_length", {1}, {static_cast<int32_t>(total_seq_len)});
+
+  tester.AddOptionalInputEdge<float>();    // cos_cache
+  tester.AddOptionalInputEdge<float>();    // sin_cache
+  tester.AddOptionalInputEdge<int64_t>();  // position_ids
+  tester.AddOptionalInputEdge<float>();    // attention_bias
+  tester.AddOptionalInputEdge<float>();    // head_sink
+
+  const int output_size = batch_size * q_seq_len * hidden_size;
+  tester.AddOutput<float>("output", {batch_size, q_seq_len, hidden_size},
+                          std::vector<float>(output_size, 0.0f));
+  const int present_size = batch_size * kv_num_heads * past_seq_len * head_size;
+  tester.AddOutput<float>("present_key", {batch_size, kv_num_heads, past_seq_len, head_size},
+                          std::vector<float>(present_size, 0.0f));
+  tester.AddOutput<float>("present_value", {batch_size, kv_num_heads, past_seq_len, head_size},
+                          std::vector<float>(present_size, 0.0f));
+
+  tester.SetOutputTolerance(1e6f);
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultWebGpuExecutionProvider());
+  tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+}
+
 }  // namespace test
 }  // namespace onnxruntime