@@ -1496,8 +1496,16 @@ Stub::GetCUDAMemoryPoolAddress(std::unique_ptr<IPCMessage>& ipc_message)
14961496 *(ipc_message->ResponseMutex ())};
14971497 cuda_pool_message_ptr->waiting_on_stub = true ;
14981498 ipc_message->ResponseCondition ()->notify_all ();
1499- while (cuda_pool_message_ptr->waiting_on_stub ) {
1500- ipc_message->ResponseCondition ()->wait (lock);
1499+ // This handler runs on the single ParentToStubMQMonitor thread,
1500+ // which is also the only thread that delivers decoupled BLS responses,
1501+ // so it must not block on the success path.
1502+ // It should only wait when an error message has been written to
1503+ // error_string_shm, so the parent can finish reading it before
1504+ // this function returns and frees that shared memory.
1505+ if (has_exception) {
1506+ while (cuda_pool_message_ptr->waiting_on_stub ) {
1507+ ipc_message->ResponseCondition ()->wait (lock);
1508+ }
15011509 }
15021510 }
15031511#endif
@@ -1849,7 +1857,6 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
18491857 auto stub = Stub::GetOrCreateInstance ();
18501858 py::object loop =
18511859 py::module_::import (" asyncio" ).attr (" get_running_loop" )();
1852- // Capture 'stub' by value (it is a shared_ptr).
18531860 py::cpp_function callback = [stub, infer_request, decoupled]() {
18541861 std::shared_ptr<InferResponse> response =
18551862 infer_request->Exec (decoupled);
0 commit comments