fix race condition in nvexec's when_all implementation

ericniebler · ericniebler · commit 70e2266ec4ac · 2025-03-13T19:24:41.000-07:00
diff --git a/include/nvexec/detail/event.cuh b/include/nvexec/detail/event.cuh
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2025 NVIDIA Corporation
+ *
+ * Licensed under the Apache License Version 2.0 with LLVM Exceptions
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   https://llvm.org/LICENSE.txt
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// clang-format Language: Cpp
+
+#pragma once
+
+#include "config.cuh"
+#include "cuda_fwd.cuh"
+#include "throw_on_cuda_error.cuh"
+
+#include <utility>
+
+namespace nvexec::detail {
+  // Move-only owning wrapper around a cudaEvent_t. The event is created with timing
+  // disabled and is destroyed together with the wrapper that owns it.
+  struct cuda_event {
+    // Creates the event; throws cuda_error if the CUDA runtime reports a failure.
+    cuda_event() {
+      if (auto status =
+            STDEXEC_DBG_ERR(::cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+          status != cudaSuccess) {
+        throw cuda_error(status, "cudaEventCreate");
+      }
+    }
+
+    // Takes over other's event; other is left owning no event.
+    cuda_event(cuda_event&& other) noexcept
+      : event_(std::exchange(other.event_, nullptr)) {
+    }
+
+    // Destroys the owned event, if any (a moved-from wrapper owns none).
+    ~cuda_event() {
+      reset();
+    }
+
+    // Destroys the event currently owned by *this before adopting other's, so the
+    // previously owned event is not leaked. Self-assignment leaves *this untouched.
+    auto operator=(cuda_event&& other) noexcept -> cuda_event& {
+      if (this != &other) {
+        reset();
+        event_ = std::exchange(other.event_, nullptr);
+      }
+      return *this;
+    }
+
+    // Records the event on `stream`; returns the CUDA status instead of throwing.
+    auto try_record(cudaStream_t stream) noexcept -> cudaError_t {
+      return STDEXEC_DBG_ERR(::cudaEventRecord(event_, stream));
+    }
+
+    // Makes `stream` wait on the event; returns the CUDA status instead of throwing.
+    auto try_wait(cudaStream_t stream) noexcept -> cudaError_t {
+      return STDEXEC_DBG_ERR(::cudaStreamWaitEvent(stream, event_, 0));
+    }
+
+   private:
+    // Destroys the owned event (if any) and leaves the wrapper empty.
+    void reset() noexcept {
+      if (event_ != nullptr) {
+        STDEXEC_DBG_ERR(::cudaEventDestroy(event_));
+        event_ = nullptr;
+      }
+    }
+
+    cudaEvent_t event_{};
+  };
+} // namespace nvexec::detail
diff --git a/include/nvexec/detail/throw_on_cuda_error.cuh b/include/nvexec/detail/throw_on_cuda_error.cuh
@@ -18,18 +18,51 @@
 #include "config.cuh"
 
 #include <cstdio>
+#include <stdexcept>
 
 #include <cuda_runtime_api.h>
 
-namespace nvexec {
-  namespace detail {
-    inline cudaError_t debug_cuda_error(
-      cudaError_t error,
-      [[maybe_unused]] char const * file_name,
-      [[maybe_unused]] int line) {
-      // Clear the global CUDA error state which may have been set by the last
-      // call. Otherwise, errors may "leak" to unrelated calls.
-      cudaGetLastError();
+namespace nvexec::detail {
+  // Exception carrying a CUDA status code plus a caller-supplied message; what() reads
+  // "cudaError <status>: <message>".
+  class cuda_error : public ::std::runtime_error {
+   private:
+    // Scratch space for formatting the message. It is taken by value as a defaulted
+    // constructor argument so that it exists while the base class is being initialized.
+    struct __msg_storage {
+      char __buffer[256]; // NOLINT
+    };
+
+    // Formats the message into __msg_buffer and returns that same pointer. The bound comes
+    // from __msg_storage so it cannot drift from the array size; snprintf truncates (and
+    // still NUL-terminates) anything that does not fit.
+    static auto
+      __format_cuda_error(const int __status, const char* __msg, char* __msg_buffer) noexcept
+      -> char* {
+      ::snprintf(
+        __msg_buffer, sizeof(__msg_storage::__buffer), "cudaError %d: %s", __status, __msg);
+      return __msg_buffer;
+    }
+
+   public:
+    // __status is the CUDA status code and __msg a short description (e.g. the failing API).
+    // __msg_buffer is an implementation detail; callers are expected to leave it defaulted.
+    // NOTE(review): this is noexcept, yet std::runtime_error's constructor may throw
+    // std::bad_alloc, which would terminate the program -- confirm that is intended.
+    cuda_error(const int __status, const char* __msg, __msg_storage __msg_buffer = {0}) noexcept
+      : ::std::runtime_error(__format_cuda_error(__status, __msg, __msg_buffer.__buffer)) {
+    }
+  };
+
+  // Clears the CUDA runtime's last-error state and hands `error` back unchanged, so a CUDA
+  // API call can be wrapped inline (see STDEXEC_DBG_ERR).
+  inline auto debug_cuda_error(
+    cudaError_t error,
+    [[maybe_unused]] char const * file_name,
+    [[maybe_unused]] int line) -> cudaError_t {
+    // Clear the global CUDA error state which may have been set by the last
+    // call. Otherwise, errors may "leak" to unrelated calls.
+    cudaGetLastError();
 
 #if defined(STDEXEC_STDERR)
       if (error != cudaSuccess) {
@@ -43,8 +76,7 @@ namespace nvexec {
 #endif
 
       return error;
-    }
-  } // namespace detail
+  }
+} // namespace nvexec::detail
 
 #define STDEXEC_DBG_ERR(E) ::nvexec::detail::debug_cuda_error(E, __FILE__, __LINE__) /**/
-} // namespace nvexec
diff --git a/include/nvexec/stream/when_all.cuh b/include/nvexec/stream/when_all.cuh
@@ -25,6 +25,7 @@
 #include <utility>
 
 #include "common.cuh"
+#include "../detail/event.cuh"
 #include "../detail/throw_on_cuda_error.cuh"
 
 STDEXEC_PRAGMA_PUSH()
@@ -166,9 +167,17 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS {
           using Completions = completion_sigs<env_of_t<Receiver>, CvrefReceiverId>;
 
           template <class Error>
-          void _set_error_impl(Error&& err, _when_all::state_t expected) noexcept {
+          void _set_error_impl(Error&& err) noexcept {
             // TODO: What memory orderings are actually needed here?
-            if (op_state_->state_.compare_exchange_strong(expected, _when_all::error)) {
+            auto old_state = op_state_->state_.exchange(_when_all::error);
+            // If the previous state was error or stopped, then stop has already been
+            // requested on the stop source. Otherwise (still started), request stop.
+            if (old_state == _when_all::started) {
+              op_state_->stop_source_.request_stop();
+            }
+            // If we are the first child to complete with an error, we must save the error.
+            // (Any subsequent errors are ignored.)
+            if (old_state != _when_all::error) {
               op_state_->stop_source_.request_stop();
               // We won the race, free to write the error into the operation
               // state without worry.
@@ -187,12 +196,12 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS {
                 if constexpr (sizeof...(Values)) {
                   _when_all::copy_kernel<Values&&...><<<1, 1, 0, stream>>>(
                     &__tup::get<Index>(*op_state_->values_), static_cast<Values&&>(vals)...);
+                  op_state_->statuses_[Index] = cudaGetLastError();
                 }
 
                 if constexpr (stream_receiver<Receiver>) {
-                  if (op_state_->status_ == cudaSuccess) {
-                    op_state_->status_ =
-                      STDEXEC_DBG_ERR(cudaEventRecord(op_state_->events_[Index], stream));
+                  if (op_state_->statuses_[Index] == cudaSuccess) {
+                    op_state_->statuses_[Index] = op_state_->events_[Index].try_record(stream);
                   }
                 }
               }
@@ -203,7 +212,7 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS {
           template <class Error>
             requires tag_invocable<set_error_t, Receiver, Error>
           void set_error(Error&& err) && noexcept {
-            _set_error_impl(static_cast<Error&&>(err), _when_all::started);
+            _set_error_impl(static_cast<Error&&>(err));
           }
 
           void set_stopped() && noexcept {
@@ -238,8 +247,6 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS {
         using Env = env_of_t<Receiver>;
         using Completions = completion_sigs<Env, CvrefReceiverId>;
 
-        cudaError_t status_{cudaSuccess};
-
         template <class SenderId, std::size_t Index>
         using child_op_state_t = exit_operation_state_t<
           __copy_cvref_t<WhenAll, stdexec::__t<SenderId>>,
@@ -285,6 +292,14 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS {
           // Stop callback is no longer needed. Destroy it.
           on_stop_.reset();
 
+          // See if any child operations completed with an error status:
+          for (auto status: statuses_) {
+            if (status != cudaSuccess) {
+              status_ = status;
+              break;
+            }
+          }
+
           // Synchronize streams
           if (status_ == cudaSuccess) {
             if constexpr (stream_receiver<Receiver>) {
@@ -294,7 +309,7 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS {
 
               for (int i = 0; i < sizeof...(SenderIds); i++) {
                 if (status_ == cudaSuccess) {
-                  status_ = STDEXEC_DBG_ERR(cudaStreamWaitEvent(stream, events_[i], 0));
+                  status_ = events_[i].try_wait(stream);
                 }
               }
             } else {
@@ -354,19 +369,10 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS {
           , child_states_{
               operation_t::connect_children_(this, static_cast<WhenAll&&>(when_all), Indices{})} {
           status_ = STDEXEC_DBG_ERR(cudaMallocManaged(&values_, sizeof(child_values_tuple_t)));
-          for (std::size_t i = 0; i < sizeof...(SenderIds); ++i) {
-            if (status_ == cudaSuccess) {
-              status_ = STDEXEC_DBG_ERR(cudaEventCreate(&events_[i], cudaEventDisableTiming));
-            }
-          }
         }
 
         ~operation_t() {
           STDEXEC_DBG_ERR(cudaFree(values_));
-
-          for (int i = 0; i < sizeof...(SenderIds); i++) {
-            STDEXEC_DBG_ERR(cudaEventDestroy(events_[i]));
-          }
         }
 
         STDEXEC_IMMOVABLE(operation_t);
@@ -388,7 +394,7 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS {
           }
         }
 
-        // tuple<optional<tuple<Vs1...>>, optional<tuple<Vs2...>>, ...>
+        // tuple<tuple<Vs1...>, tuple<Vs2...>, ...>
         using child_values_tuple_t = //
           __if<
             sends_values<Completions>,
@@ -408,9 +414,11 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS {
             __uniqued_variant_for>;
 
         Receiver rcvr_;
+        cudaError_t status_{cudaSuccess};
         std::atomic<std::size_t> count_{sizeof...(SenderIds)};
         std::array<stream_provider_t, sizeof...(SenderIds)> stream_providers_;
-        std::array<cudaEvent_t, sizeof...(SenderIds)> events_;
+        std::array<detail::cuda_event, sizeof...(SenderIds)> events_{};
+        std::array<cudaError_t, sizeof...(SenderIds)> statuses_{}; // all initialized to cudaSuccess
         child_op_states_tuple_t child_states_;
         // Could be non-atomic here and atomic_ref everywhere except __completion_fn
         std::atomic<_when_all::state_t> state_{_when_all::started};
diff --git a/include/stdexec/__detail/__when_all.hpp b/include/stdexec/__detail/__when_all.hpp
@@ -324,10 +324,15 @@ namespace stdexec {
       template <class _State, class _Receiver, class _Error>
       static void __set_error(_State& __state, _Receiver&, _Error&& __err) noexcept {
         // TODO: What memory orderings are actually needed here?
-        if (__error != __state.__state_.exchange(__error)) {
+        auto __old_state = __state.__state_.exchange(__error);
+        // If the previous state was __error or __stopped, then we have already requested
+        // stop on the stop source. Otherwise, request stop.
+        if (__old_state == __started) {
           __state.__stop_source_.request_stop();
-          // We won the race, free to write the error into the operation
-          // state without worry.
+        }
+        // If we are the first child to complete with an error, we must save the error.
+        // (Any subsequent errors are ignored.)
+        if (__old_state != __error) {
           if constexpr (__nothrow_decay_copyable<_Error>) {
             __state.__errors_.template emplace<__decay_t<_Error>>(static_cast<_Error&&>(__err));
           } else {