Remove cudaStreamSynchronize from CUDA LLM ops for CUDA graph capture compatibility (microsoft#27484)

titaiwangms · web-flow · commit 479dd393b254 · 2026-02-27T13:39:16.000-08:00
This pull request refactors validation logic for CUDA attention masks
and tensor scatter operations to move error checking from host-side
(CPU) to device-side (GPU) using CUDA kernel assertions
(`CUDA_KERNEL_ASSERT`). This change eliminates synchronous host-device
memory transfers and stream synchronizations, improving performance and
simplifying code. Corresponding test cases are updated to only expect
validation failures on the CPU, as CUDA errors are now asynchronous.

Key changes:

**Attention mask validation (GQA path):**
- Removes host-side validation and memory copies for boolean attention
masks in `attention.cc`; mask validity (right-padding, contiguous
True/False) is now checked asynchronously via `CUDA_KERNEL_ASSERT` in
the CUDA kernel.
[[1]](diffhunk://#diff-0701e4cc6d4951894ae1a60f35c1e6c0f69ba7595f896a23c8f5ed7265eab4ffL385-L387)
[[2]](diffhunk://#diff-0701e4cc6d4951894ae1a60f35c1e6c0f69ba7595f896a23c8f5ed7265eab4ffL414-L418)
[[3]](diffhunk://#diff-0701e4cc6d4951894ae1a60f35c1e6c0f69ba7595f896a23c8f5ed7265eab4ffL427-L448)
- Updates the CUDA kernel and its interface to drop the
`validation_result` buffer and rely on device assertions for mask
validation. Documentation is updated to reflect this asynchronous error
checking.
[[1]](diffhunk://#diff-00f7d49ccee44f1573357c07633bd03f21b9c2e1b1617c7a6a878a79ee6a6e49L10-R17)
[[2]](diffhunk://#diff-00f7d49ccee44f1573357c07633bd03f21b9c2e1b1617c7a6a878a79ee6a6e49L34)
[[3]](diffhunk://#diff-00f7d49ccee44f1573357c07633bd03f21b9c2e1b1617c7a6a878a79ee6a6e49L81-R76)
[[4]](diffhunk://#diff-00f7d49ccee44f1573357c07633bd03f21b9c2e1b1617c7a6a878a79ee6a6e49L104-R92)
[[5]](diffhunk://#diff-00f7d49ccee44f1573357c07633bd03f21b9c2e1b1617c7a6a878a79ee6a6e49L118)
[[6]](diffhunk://#diff-00f7d49ccee44f1573357c07633bd03f21b9c2e1b1617c7a6a878a79ee6a6e49L137)
[[7]](diffhunk://#diff-8aa9a15a92d7dc138346dce5de055911895d940ba2183b4ba45bd95ac0e5bfc9L37-L45)

**TensorScatter write_indices validation:**
- Removes host-side validation and synchronization for `write_indices`
in `tensorscatter.cc`; index bounds checking is now performed
asynchronously inside the CUDA kernel via `CUDA_KERNEL_ASSERT`.
[[1]](diffhunk://#diff-d69233ff3987fe3093132a31710b6b64cc0a32140e2a5a415a2f1f0907bd22d2L75-R76)
[[2]](diffhunk://#diff-1694a04b8ba9963cc06d651ec6a3be8aa9cb2bcb73c2438dc251ca8cdcb2eb41L31-R37)

**Test updates:**
- Updates negative test cases for `TensorScatter` to run only on CPU,
since CUDA now validates asynchronously and will not synchronously
return errors to the host.
[[1]](diffhunk://#diff-8c90e642cc0cf4e68b2f3d4e4b3f1e21bf6d07f01663d424bc52c75ad0db2dfeR300)
[[2]](diffhunk://#diff-8c90e642cc0cf4e68b2f3d4e4b3f1e21bf6d07f01663d424bc52c75ad0db2dfeL311-R319)
[[3]](diffhunk://#diff-8c90e642cc0cf4e68b2f3d4e4b3f1e21bf6d07f01663d424bc52c75ad0db2dfeL327-R339)
[[4]](diffhunk://#diff-8c90e642cc0cf4e68b2f3d4e4b3f1e21bf6d07f01663d424bc52c75ad0db2dfeL342-R354)
diff --git a/onnxruntime/core/providers/cuda/llm/attention.cc b/onnxruntime/core/providers/cuda/llm/attention.cc
@@ -382,9 +382,6 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
     // the MHA path below, where 2D masks follow ONNX broadcasting: [A, B] → [1, 1, A, B], so
     // 2D = (q_seq_len, total_seq_len) with both batch and heads broadcast.
     if (attn_mask != nullptr && attn_mask->IsDataType<bool>()) {
-      // Allocate validation result buffer on GPU
-      auto validation_buffer = GetScratchBuffer<int>(parameters.batch_size, context->GetComputeStream());
-
       // Get mask dimensions for broadcasting
       // attn_mask can be 2D, 3D, or 4D and broadcasts to (batch_size, num_heads, q_seq_len, total_seq_len)
       const auto& mask_shape = attn_mask->Shape();
@@ -411,11 +408,11 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
                                "Boolean attn_mask must be 2D, 3D, or 4D. Got ", mask_dims, "D.");
       }
 
-      // Launch CUDA kernel to convert mask to seqlens_k and validate
+      // Launch CUDA kernel to convert mask to seqlens_k.
+      // Mask validity (right-padding, contiguous) is checked asynchronously via CUDA_KERNEL_ASSERT.
       ORT_RETURN_IF_ERROR(LaunchConvertMaskToSeqlensK(
           attn_mask->Data<bool>(),
           seqlens_k_buffer.get(),
-          validation_buffer.get(),
           parameters.batch_size,
           parameters.total_sequence_length,
           mask_dims,
@@ -424,28 +421,6 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
           mask_dim2,
           cuda_stream,
           device_prop.maxThreadsPerBlock));
-
-      // Copy validation results to CPU and check for errors
-      std::vector<int> validation_host(parameters.batch_size);
-      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(validation_host.data(), validation_buffer.get(),
-                                           sizeof(int) * parameters.batch_size,
-                                           cudaMemcpyDeviceToHost, cuda_stream));
-      CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(cuda_stream));
-
-      for (int b = 0; b < parameters.batch_size; ++b) {
-        if (validation_host[b] == 1) {
-          return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                                 "Boolean attn_mask for batch ", b,
-                                 " does not start with True. "
-                                 "GQA path only supports right-padding masks where valid tokens come first.");
-        } else if (validation_host[b] == 2) {
-          return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                                 "Boolean attn_mask for batch ", b,
-                                 " is not contiguous. "
-                                 "GQA path only supports right-padding masks with contiguous True values "
-                                 "followed by contiguous False values (no interleaving).");
-        }
-      }
     } else if (attn_mask != nullptr) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED,
                              "Non-boolean attn_mask is not supported yet in GQA path of Attention op (CUDA).");
diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.cu
@@ -7,19 +7,14 @@
 namespace onnxruntime {
 namespace cuda {
 
-// Validation error codes (stored in validation_result buffer)
-constexpr int kValidationOK = 0;
-constexpr int kValidationErrorNotStartWithTrue = 1;
-constexpr int kValidationErrorNotContiguous = 2;
-
 // CUDA kernel to convert boolean attention mask to sequence lengths.
-// Also validates that the mask follows right-padding convention.
+// Also validates that the mask follows right-padding convention via CUDA_KERNEL_ASSERT.
 //
 // The kernel processes one batch per thread.
 // For each batch, it finds the first False in the mask row, which indicates
 // where padding starts. The sequence length is the index of first False.
 //
-// Validation:
+// Validation (via CUDA_KERNEL_ASSERT, reported asynchronously):
 // - The mask must start with True (first element must be True)
 // - After the first False, all remaining elements must be False (contiguous padding)
 //
@@ -31,7 +26,6 @@ constexpr int kValidationErrorNotContiguous = 2;
 __global__ void ConvertMaskToSeqlensKernel(
     const bool* __restrict__ attn_mask,
     int* __restrict__ seqlens_k,
-    int* __restrict__ validation_result,
     const int batch_size,
     const int total_seq_len,
     const int mask_dims,
@@ -78,15 +72,8 @@ __global__ void ConvertMaskToSeqlensKernel(
     mask_row = attn_mask + effective_batch * batch_stride + h_idx * head_stride + q_idx * q_stride;
   }
 
-  // Initialize validation result for this batch
-  validation_result[batch_idx] = kValidationOK;
-
-  // Check that mask starts with True
-  if (!mask_row[0]) {
-    validation_result[batch_idx] = kValidationErrorNotStartWithTrue;
-    seqlens_k[batch_idx] = -1;  // Invalid
-    return;
-  }
+  // Validate that mask starts with True (right-padding convention)
+  CUDA_KERNEL_ASSERT(mask_row[0]);  // mask must start with True
 
   // Find the first False (where padding starts)
   // All elements before this should be True, all after should be False
@@ -101,10 +88,8 @@ __global__ void ConvertMaskToSeqlensKernel(
       seq_len = i;
       found_first_false = true;
     } else if (found_first_false && current) {
-      // Found True after False - this is invalid (not contiguous)
-      validation_result[batch_idx] = kValidationErrorNotContiguous;
-      seqlens_k[batch_idx] = -1;  // Invalid
-      return;
+      // Found True after False - mask is not contiguous (invalid)
+      CUDA_KERNEL_ASSERT(false);  // mask must be contiguous (no True after False)
     }
   }
 
@@ -115,7 +100,6 @@ __global__ void ConvertMaskToSeqlensKernel(
 Status LaunchConvertMaskToSeqlensK(
     const bool* attn_mask_bool,
     int* seqlens_k,
-    int* validation_result,
     int batch_size,
     int total_seq_len,
     int mask_dims,
@@ -134,7 +118,6 @@ Status LaunchConvertMaskToSeqlensK(
   ConvertMaskToSeqlensKernel<<<blocks, threads, 0, stream>>>(
       attn_mask_bool,
       seqlens_k,
-      validation_result,
       batch_size,
       total_seq_len,
       mask_dims,
diff --git a/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h b/onnxruntime/core/providers/cuda/llm/attention_mask_impl.h
@@ -34,15 +34,13 @@ namespace cuda {
 //
 // Returns:
 //   Status::OK() on success
-//   Error status if mask is invalid (not right-padding, doesn't start with True, etc.)
 //
-// Note: This function validates the mask on GPU and will return an error if:
-//   - The mask doesn't start with True for any batch
-//   - The True/False values are not contiguous (e.g., True, False, True pattern)
+// Note: Mask validity (right-padding convention, starts with True, contiguous True/False)
+//   is checked asynchronously via CUDA_KERNEL_ASSERT inside the kernel. Invalid masks will
+//   trigger a device-side assertion failure.
 Status LaunchConvertMaskToSeqlensK(
     const bool* attn_mask_bool,
     int* seqlens_k,
-    int* validation_result,  // GPU buffer for validation, size = batch_size
     int batch_size,
     int total_seq_len,
     int mask_dims,
diff --git a/onnxruntime/core/providers/cuda/llm/tensorscatter.cc b/onnxruntime/core/providers/cuda/llm/tensorscatter.cc
@@ -72,24 +72,8 @@ Status TensorScatter::ComputeInternal(OpKernelContext* context) const {
                     write_indices_tensor->Shape()[0] == batch_size,
                 "TensorScatter: write_indices must have shape [batch_size]");
     write_indices = write_indices_tensor->Data<int64_t>();
-
-    // Copy write_indices to host for validation (batch_size elements, negligible overhead).
-    std::vector<int64_t> host_write_indices(static_cast<size_t>(batch_size));
-    CUDA_RETURN_IF_ERROR(
-        cudaMemcpyAsync(host_write_indices.data(), write_indices,
-                        static_cast<size_t>(batch_size) * sizeof(int64_t),
-                        cudaMemcpyDeviceToHost, Stream(context)));
-    CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(Stream(context)));
-
-    for (int64_t b = 0; b < batch_size; ++b) {
-      int64_t wi = host_write_indices[static_cast<size_t>(b)];
-      ORT_ENFORCE(wi >= 0, "TensorScatter: write_indices[", b, "] = ", wi, " is negative");
-      if (!circular_) {
-        ORT_ENFORCE(wi + sequence_length <= max_sequence_length,
-                    "TensorScatter linear mode: write_indices[", b, "] + sequence_length (",
-                    wi, " + ", sequence_length, ") exceeds max_sequence_length (", max_sequence_length, ")");
-      }
-    }
+    // write_indices values (non-negative, in-bounds) are validated asynchronously
+    // inside the CUDA kernel via CUDA_KERNEL_ASSERT to avoid cudaStreamSynchronize.
   }
 
   // Allocate output with the same shape as past_cache.
diff --git a/onnxruntime/core/providers/cuda/llm/tensorscatter_impl.cu b/onnxruntime/core/providers/cuda/llm/tensorscatter_impl.cu
@@ -28,12 +28,13 @@ __global__ void _TensorScatterKernel(
 
   int64_t batch_idx = prefix_idx / prefix_stride_for_batch;
   int64_t wi = (write_indices != nullptr) ? write_indices[batch_idx] : 0;
-  // write_indices are validated on the host before kernel launch.
+  CUDA_KERNEL_ASSERT(wi >= 0);
   int64_t cache_pos;
   if (circular) {
     cache_pos = (wi + seq_idx) % max_seq_len;
   } else {
     cache_pos = wi + seq_idx;
+    CUDA_KERNEL_ASSERT(cache_pos < max_seq_len);
   }
 
   int64_t out_offset = prefix_idx * (max_seq_len * suffix_count) + cache_pos * suffix_count + suffix_idx;
diff --git a/onnxruntime/test/providers/cpu/llm/tensorscatter_op_test.cc b/onnxruntime/test/providers/cpu/llm/tensorscatter_op_test.cc
@@ -297,6 +297,7 @@ TEST(TensorScatterTest, InPlace_IOBinding) {
 }
 
 // Negative write_indices should fail validation.
+// Run CPU-only: CUDA validates asynchronously via CUDA_KERNEL_ASSERT.
 TEST(TensorScatterTest, Linear_NegativeWriteIndex) {
   OpTester test("TensorScatter", 24);
   test.AddAttribute<std::string>("mode", "linear");
@@ -308,10 +309,14 @@ TEST(TensorScatterTest, Linear_NegativeWriteIndex) {
   test.AddOutput<float>("present_cache", {1, 4, 3},
                         {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
 
-  test.Run(OpTester::ExpectResult::kExpectFailure, "is negative");
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCpuExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectFailure, "is negative",
+           {}, nullptr, &execution_providers);
 }
 
 // Linear mode: write_indices + sequence_length > max_sequence_length should fail.
+// Run CPU-only: CUDA validates asynchronously via CUDA_KERNEL_ASSERT.
 TEST(TensorScatterTest, Linear_OutOfBoundsWriteIndex) {
   OpTester test("TensorScatter", 24);
   test.AddAttribute<std::string>("mode", "linear");
@@ -324,10 +329,14 @@ TEST(TensorScatterTest, Linear_OutOfBoundsWriteIndex) {
   test.AddOutput<float>("present_cache", {1, 4, 3},
                         {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
 
-  test.Run(OpTester::ExpectResult::kExpectFailure, "exceeds max_sequence_length");
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCpuExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectFailure, "exceeds max_sequence_length",
+           {}, nullptr, &execution_providers);
 }
 
 // Circular mode: negative write_indices should still fail.
+// Run CPU-only: CUDA validates asynchronously via CUDA_KERNEL_ASSERT.
 TEST(TensorScatterTest, Circular_NegativeWriteIndex) {
   OpTester test("TensorScatter", 24);
   test.AddAttribute<std::string>("mode", "circular");
@@ -339,7 +348,10 @@ TEST(TensorScatterTest, Circular_NegativeWriteIndex) {
   test.AddOutput<float>("present_cache", {1, 4, 2},
                         {0, 0, 0, 0, 0, 0, 0, 0});
 
-  test.Run(OpTester::ExpectResult::kExpectFailure, "is negative");
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCpuExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectFailure, "is negative",
+           {}, nullptr, &execution_providers);
 }
 
 }  // namespace test