Add semaphore-aware Sequence evalAsync overloads

cone-forest · cone-forest · commit 4f73bbe18e4a · 2026-05-13T14:23:57.000+03:00
Allow submit-level GPU synchronization by letting Sequence submissions
wait on and signal Vulkan semaphores without CPU-side waits.
This keeps existing evalAsync behavior while adding validation and
coverage for the new overload path.

Signed-off-by: Michael Tsukanov <tsukanovWOW@ya.ru>
diff --git a/src/Sequence.cpp b/src/Sequence.cpp
@@ -110,6 +110,14 @@ Sequence::eval(std::shared_ptr<OpBase> op)
 
 std::shared_ptr<Sequence>
 Sequence::evalAsync()
+{
+    return this->evalAsync({}, {}, {});
+}
+
+std::shared_ptr<Sequence>
+Sequence::evalAsync(const std::vector<vk::Semaphore>& waitSemaphores,
+                    const std::vector<vk::PipelineStageFlags>& waitDstStageMasks,
+                    const std::vector<vk::Semaphore>& signalSemaphores)
 {
     if (this->isRecording()) {
         this->end();
@@ -127,8 +135,35 @@ Sequence::evalAsync()
         this->mOperations[i]->preEval(*this->mCommandBuffer);
     }
 
+    if (!waitDstStageMasks.empty() &&
+        waitSemaphores.size() != waitDstStageMasks.size()) {
+        throw std::runtime_error("Kompute Sequence evalAsync wait semaphore "
+                                 "count must match wait dst stage mask count");
+    }
+
+    std::vector<vk::PipelineStageFlags> resolvedWaitDstStageMasks =
+      waitDstStageMasks;
+    if (resolvedWaitDstStageMasks.empty() && !waitSemaphores.empty()) {
+        resolvedWaitDstStageMasks.resize(waitSemaphores.size(),
+                                         vk::PipelineStageFlagBits::eAllCommands);
+    }
+
+    const vk::Semaphore* waitSemaphoresPtr =
+      waitSemaphores.empty() ? nullptr : waitSemaphores.data();
+    const vk::PipelineStageFlags* waitDstStageMasksPtr =
+      resolvedWaitDstStageMasks.empty() ? nullptr
+                                        : resolvedWaitDstStageMasks.data();
+    const vk::Semaphore* signalSemaphoresPtr =
+      signalSemaphores.empty() ? nullptr : signalSemaphores.data();
+
     vk::SubmitInfo submitInfo(
-      0, nullptr, nullptr, 1, this->mCommandBuffer.get());
+      static_cast<uint32_t>(waitSemaphores.size()),
+      waitSemaphoresPtr,
+      waitDstStageMasksPtr,
+      1,
+      this->mCommandBuffer.get(),
+      static_cast<uint32_t>(signalSemaphores.size()),
+      signalSemaphoresPtr);
 
     KP_LOG_DEBUG(
       "Kompute sequence submitting command buffer into compute queue");
@@ -153,11 +188,20 @@ Sequence::submitCommandBuffer(const vk::SubmitInfo& submitInfo)
 
 std::shared_ptr<Sequence>
 Sequence::evalAsync(std::shared_ptr<OpBase> op)
+{
+    return this->evalAsync(op, {}, {}, {});
+}
+
+std::shared_ptr<Sequence>
+Sequence::evalAsync(std::shared_ptr<OpBase> op,
+                    const std::vector<vk::Semaphore>& waitSemaphores,
+                    const std::vector<vk::PipelineStageFlags>& waitDstStageMasks,
+                    const std::vector<vk::Semaphore>& signalSemaphores)
 {
     this->clear();
     this->record(op);
-    this->evalAsync();
-    return shared_from_this();
+    return this->evalAsync(
+      waitSemaphores, waitDstStageMasks, signalSemaphores);
 }
 
 std::shared_ptr<Sequence>
diff --git a/src/include/kompute/Sequence.hpp b/src/include/kompute/Sequence.hpp
@@ -164,6 +164,25 @@ class Sequence : public std::enable_shared_from_this<Sequence>
      * @return Boolean stating whether execution was successful.
      */
     std::shared_ptr<Sequence> evalAsync();
+    /**
+     * Eval Async sends all recorded operations as a submit job and allows
+     * submit-level GPU synchronization by providing wait and signal semaphores.
+     * EvalAwait() must ALWAYS be called after to ensure the sequence is
+     * terminated correctly.
+     *
+     * @param waitSemaphores Semaphores that must be signaled before this submit
+     * starts executing.
+     * @param waitDstStageMasks Pipeline stages at which to wait for each
+     * semaphore. If empty and waitSemaphores is not empty, defaults to
+     * vk::PipelineStageFlagBits::eAllCommands for each wait semaphore.
+     * @param signalSemaphores Semaphores that this submit will signal when it
+     * completes.
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    std::shared_ptr<Sequence> evalAsync(
+      const std::vector<vk::Semaphore>& waitSemaphores,
+      const std::vector<vk::PipelineStageFlags>& waitDstStageMasks,
+      const std::vector<vk::Semaphore>& signalSemaphores);
     /**
      * Clears currnet operations to record provided one in the vector of
      * operations into the gpu as a submit job without a barrier. EvalAwait()
@@ -173,6 +192,27 @@ class Sequence : public std::enable_shared_from_this<Sequence>
      * @return Boolean stating whether execution was successful.
      */
     std::shared_ptr<Sequence> evalAsync(std::shared_ptr<OpBase> op);
+    /**
+     * Clears current operations, records the provided one and submits with
+     * optional wait/signal semaphores for submit-level GPU synchronization.
+     * EvalAwait() must ALWAYS be called after to ensure the sequence is
+     * terminated correctly.
+     *
+     * @param op Operation to record prior to submit.
+     * @param waitSemaphores Semaphores that must be signaled before this submit
+     * starts executing.
+     * @param waitDstStageMasks Pipeline stages at which to wait for each
+     * semaphore. If empty and waitSemaphores is not empty, defaults to
+     * vk::PipelineStageFlagBits::eAllCommands for each wait semaphore.
+     * @param signalSemaphores Semaphores that this submit will signal when it
+     * completes.
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    std::shared_ptr<Sequence> evalAsync(
+      std::shared_ptr<OpBase> op,
+      const std::vector<vk::Semaphore>& waitSemaphores,
+      const std::vector<vk::PipelineStageFlags>& waitDstStageMasks,
+      const std::vector<vk::Semaphore>& signalSemaphores);
     /**
      * Eval sends all the recorded and stored operations in the vector of
      * operations into the gpu as a submit job with a barrier.
diff --git a/test/TestSequence.cpp b/test/TestSequence.cpp
@@ -243,3 +243,63 @@ TEST(TestSequence, CorrectSequenceRunningError)
 
     EXPECT_EQ(tensorOut->vector(), std::vector<float>({ 2, 4, 6 }));
 }
+
+TEST(TestSequence, EvalAsyncSemaphoreOverloadSupportsEmptySyncLists)
+{
+    kp::Manager mgr;
+
+    std::shared_ptr<kp::Sequence> sq = mgr.sequence();
+
+    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor({ 1, 2, 3 });
+    std::shared_ptr<kp::TensorT<float>> tensorB = mgr.tensor({ 2, 2, 2 });
+    std::shared_ptr<kp::TensorT<float>> tensorOut = mgr.tensor({ 0, 0, 0 });
+
+    sq->eval<kp::OpSyncDevice>({ tensorA, tensorB, tensorOut });
+
+    std::vector<uint32_t> spirv = compileSource(R"(
+        #version 450
+
+        layout (local_size_x = 1) in;
+
+        layout(set = 0, binding = 0) buffer bina { float tina[]; };
+        layout(set = 0, binding = 1) buffer binb { float tinb[]; };
+        layout(set = 0, binding = 2) buffer bout { float tout[]; };
+
+        void main() {
+            uint index = gl_GlobalInvocationID.x;
+            tout[index] = tina[index] * tinb[index];
+        }
+    )");
+
+    std::shared_ptr<kp::Algorithm> algo =
+      mgr.algorithm({ tensorA, tensorB, tensorOut }, spirv);
+
+    sq->record<kp::OpAlgoDispatch>(algo)->record<kp::OpSyncLocal>(
+      { tensorA, tensorB, tensorOut });
+
+    EXPECT_NO_THROW(sq->evalAsync({}, {}, {}));
+    EXPECT_NO_THROW(sq->evalAwait());
+
+    EXPECT_EQ(tensorOut->vector(), std::vector<float>({ 2, 4, 6 }));
+}
+
+TEST(TestSequence, EvalAsyncSemaphoreOverloadValidatesWaitMaskCount)
+{
+    kp::Manager mgr;
+
+    std::shared_ptr<kp::Sequence> sq = mgr.sequence();
+
+    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor({ 1, 2, 3 });
+
+    sq->record<kp::OpSyncDevice>({ tensorA });
+
+    std::vector<vk::Semaphore> waitSemaphores = { vk::Semaphore{} };
+    std::vector<vk::PipelineStageFlags> waitDstStageMasks = {
+        vk::PipelineStageFlagBits::eComputeShader,
+        vk::PipelineStageFlagBits::eTransfer
+    };
+    std::vector<vk::Semaphore> signalSemaphores = {};
+
+    EXPECT_ANY_THROW(
+      sq->evalAsync(waitSemaphores, waitDstStageMasks, signalSemaphores));
+}