Update

pskiran1 · pskiran1 · commit 4b9b1ee11fa8 · 2026-06-12T20:44:10.000+05:30
diff --git a/src/pb_stub.cc b/src/pb_stub.cc
@@ -1496,8 +1496,16 @@ Stub::GetCUDAMemoryPoolAddress(std::unique_ptr<IPCMessage>& ipc_message)
         *(ipc_message->ResponseMutex())};
     cuda_pool_message_ptr->waiting_on_stub = true;
     ipc_message->ResponseCondition()->notify_all();
-    while (cuda_pool_message_ptr->waiting_on_stub) {
-      ipc_message->ResponseCondition()->wait(lock);
+    // This handler runs on the single ParentToStubMQMonitor thread,
+    // which is also the only thread that delivers decoupled BLS responses,
+    // so it must not block on the success path.
+    // It should only wait when an error message has been written to
+    // error_string_shm, so the parent can finish reading it before 
+    // this function returns and frees that shared memory.
+    if (has_exception) {
+      while (cuda_pool_message_ptr->waiting_on_stub) {
+        ipc_message->ResponseCondition()->wait(lock);
+      }
     }
   }
 #endif
@@ -1849,7 +1857,6 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
             auto stub = Stub::GetOrCreateInstance();
             py::object loop =
                 py::module_::import("asyncio").attr("get_running_loop")();
-            // Capture 'stub' by value (it is a shared_ptr).
             py::cpp_function callback = [stub, infer_request, decoupled]() {
               std::shared_ptr<InferResponse> response =
                   infer_request->Exec(decoupled);