NVIDIA
diff --git a/cpp/tests/unit_tests/executor/transferAgentTest.cpp b/cpp/tests/unit_tests/executor/transferAgentTest.cpp
Lines changed: 103 additions & 0 deletions
diff --git a/tensorrt_llm/_torch/disaggregation/transceiver.py b/tensorrt_llm/_torch/disaggregation/transceiver.py
Lines changed: 11 additions & 4 deletions
diff --git a/tensorrt_llm/_torch/pyexecutor/kv_cache_manager_v2.py b/tensorrt_llm/_torch/pyexecutor/kv_cache_manager_v2.py
Lines changed: 6 additions & 0 deletions
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
Lines changed: 56 additions & 74 deletions
@@ -21,7 +21,9 @@
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
+#include <atomic>
 #include <filesystem>
+#include <thread>
 #include <vector>
 
 namespace fs = std::filesystem;
@@ -376,6 +378,107 @@ TEST_P(TransferAgentTest, SyncMessage)
     xferAgent1->invalidateRemoteAgent(agent0);
 }
 
+// Status must survive destruction of its owning agent (#14137 UAF-safety): the
+// status holds a weak_ptr<nixlAgent>; once the agent is reset the weak_ptr expires
+// and orphaned queries must report failure rather than dereference a dangling agent.
+TEST_P(TransferAgentTest, StatusOutlivesAgent)
+{
+    // Two agents: agent0 issues the write, agent1 is the remote target.
+    std::string const agent0{"agent0"}, agent1{"agent1"};
+    BaseAgentConfig config0{agent0, true, false, true}, config1{agent1, true, false, true};
+    auto xferAgent0 = makeTransferAgent(config0);
+    auto xferAgent1 = makeTransferAgent(config1);
+    TLLM_CHECK(xferAgent0);
+    TLLM_CHECK(xferAgent1);
+
+    std::vector<char> memory0(100, 10);
+    std::vector<char> memory1(100, 1);
+
+    // RegisteredHostMemory holds a raw agent pointer and deregisters in its
+    // dtor, so it must NOT outlive its agent. Scope it (and the transfer) so it
+    // deregisters while both agents are alive; only `status` is kept past here.
+    std::unique_ptr<TransferStatus> status;
+    {
+        RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent0.get());
+        RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent1.get());
+
+        auto connectionInfo = xferAgent1->getLocalConnectionInfo();
+        xferAgent0->loadRemoteAgent(agent1, connectionInfo);
+        // Poll until agent0 reports agent1's descriptors as available.
+        while (!xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()))
+        {
+        }
+
+        TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem1.getDescs(), agent1};
+        status = xferAgent0->submitTransferRequests(writeReq);
+        // Fail fast on a null status rather than dereferencing it: every
+        // later step of this test calls through `status`.
+        TLLM_CHECK(status);
+        TLLM_CHECK(status->wait() == TransferState::kSUCCESS);
+    }
+
+    // Destroy the owning agent BEFORE the status. shutdown() resets the
+    // shared_ptr<nixlAgent>, expiring the status's weak_ptr.
+    xferAgent0.reset();
+
+    // Orphaned queries are safe and report failure (no use-after-free):
+    // wait()/isCompleted() see mWeakAgent.lock() == nullptr and return
+    // kFAILURE/false instead of dereferencing the freed agent.
+    EXPECT_FALSE(status->isCompleted());
+    EXPECT_EQ(status->wait(0), TransferState::kFAILURE);
+    // `status` destructor runs at scope exit: weak_ptr.lock() == nullptr ->
+    // early return (no releaseXferReq on a dangling agent).
+}
+
+// Concurrent submitTransferRequests (#14137): submit holds a std::shared_lock and
+// copies reqParams per-request, so many threads can submit at once without racing
+// a shared mExtraParams. All concurrently-submitted transfers must still succeed.
+TEST_P(TransferAgentTest, ConcurrentSubmit)
+{
+    // Two agents: agent0 issues the concurrent writes, agent1 is the remote target.
+    std::string const agent0{"agent0"}, agent1{"agent1"};
+    BaseAgentConfig config0{agent0, true, false, true}, config1{agent1, true, false, true};
+    auto xferAgent0 = makeTransferAgent(config0);
+    auto xferAgent1 = makeTransferAgent(config1);
+    TLLM_CHECK(xferAgent0);
+    TLLM_CHECK(xferAgent1);
+
+    std::vector<char> memory0(100, 10);
+    std::vector<char> memory1(100, 1);
+    RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent0.get());
+    RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent1.get());
+
+    auto connectionInfo = xferAgent1->getLocalConnectionInfo();
+    xferAgent0->loadRemoteAgent(agent1, connectionInfo);
+    // Poll until agent0 reports agent1's descriptors as available.
+    while (!xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()))
+    {
+    }
+
+    constexpr int kNumThreads = 8;
+    std::vector<std::thread> threads;
+    threads.reserve(kNumThreads);
+    // One slot per thread; each worker writes only its own element, so the
+    // vector itself needs no synchronization.
+    std::vector<std::unique_ptr<TransferStatus>> statuses(kNumThreads);
+    std::atomic<int> ready{0};
+    for (int i = 0; i < kNumThreads; ++i)
+    {
+        threads.emplace_back(
+            [&, i]()
+            {
+                TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem1.getDescs(), agent1};
+                ready.fetch_add(1);
+                while (ready.load() < kNumThreads)
+                {
+                    // Align thread starts to maximize submit contention.
+                }
+                // Only the submit is guarded: every worker has already passed
+                // the start barrier, so bailing out here cannot strand peers.
+                try
+                {
+                    statuses[i] = xferAgent0->submitTransferRequests(writeReq);
+                }
+                catch (...)
+                {
+                    // An exception escaping a std::thread entry point calls
+                    // std::terminate and would abort the whole test binary.
+                    // Record a test failure instead; the slot stays null and
+                    // is rejected by the TLLM_CHECK(status) below.
+                    ADD_FAILURE() << "submitTransferRequests threw in worker thread " << i;
+                }
+            });
+    }
+    for (auto& t : threads)
+    {
+        t.join();
+    }
+    for (auto& status : statuses)
+    {
+        TLLM_CHECK(status);
+        EXPECT_EQ(status->wait(), TransferState::kSUCCESS);
+    }
+    // Every write copies memory0 into memory1, so the buffers must now match.
+    TLLM_CHECK(memory0 == memory1);
+    xferAgent0->invalidateRemoteAgent(agent1);
+}
+
 INSTANTIATE_TEST_SUITE_P(AvailableBackends, TransferAgentTest, ::testing::ValuesIn(getAvailableBackends()),
     [](::testing::TestParamInfo<TransferAgentTest::ParamType> const& info) { return info.param; });
 
 
@@ -517,8 +517,12 @@ def request_and_receive_async(self, req: LlmRequest):
     def check_context_transfer_status(
         self, at_least_request_num: Optional[int], mark_complete: bool = False
     ):
-        # Skip the tp_allgather in _ctx_consensus when this transceiver never sends (pure GEN role).
-        if not self._ever_had_send_session:
+        # Skip the consensus collectives when this transceiver never sends (pure GEN role).
+        # Only short-circuit when _ctx_need_pp_sync is false (i.e. pp_size == 1): under PP the
+        # per-rank send marker flips asymmetrically across PP stages, so returning early here
+        # would let some ranks skip the pp_allgather barrier while peers enter it -> deadlock
+        # (e.g. ADP+PP tp4_pp2_dp_both). With PP=1 there is no cross-stage consensus barrier.
+        if not self._ever_had_send_session and not self._ctx_need_pp_sync:
             return [], []
         block_all = at_least_request_num is None
         wait_num = at_least_request_num if not block_all else 0
@@ -573,8 +577,11 @@ def check_context_transfer_status(
         return completed, failed
 
     def check_gen_transfer_status(self, at_least_request_num: Optional[int]):
-        # Skip the allgather in _gen_consensus when this transceiver never receives (pure CTX role).
-        if not self._ever_had_recv_session:
+        # Skip the consensus collectives when this transceiver never receives (pure CTX role).
+        # Only short-circuit when _ctx_need_pp_sync is false (i.e. pp_size == 1), matching
+        # check_context_transfer_status: under PP the per-rank recv marker flips asymmetrically
+        # across stages, so an early return would desync the consensus barrier.
+        if not self._ever_had_recv_session and not self._ctx_need_pp_sync:
             return [], [], []
         block_all = at_least_request_num is None
         wait_num = at_least_request_num if not block_all else 0
 
@@ -1171,6 +1171,12 @@ def trim_to_history(self, req: LlmRequest, history_length: int) -> bool:
         try:
             return kv_cache.resize(target_capacity, history_length=history_length)
         except Exception as e:
+            # Best-effort SWA trim: resize() can raise more than ValueError
+            # under v2 KV-cache + uneven-PP disagg (e.g. internal state
+            # assertions). A failed trim MUST degrade gracefully (return
+            # False) -- letting the exception propagate aborts KV-block
+            # release, leaking storage slots and killing the run. Do not
+            # narrow this except.
             logger.warning(
                 f"trim_to_history failed for req {req.py_request_id} "
                 f"(capacity={kv_cache.capacity}, target_history={history_length}): {e}"
 
@@ -517,8 +517,6 @@ def __init__(
         self.num_scheduled_requests: int = 0
         self.benchmark_req_queues_size = int(
             os.environ.get("TLLM_BENCHMARK_REQ_QUEUES_SIZE", 0))
-        self.benchmark_fill_stall_timeout_s = float(
-            os.environ.get("TLLM_BENCHMARK_FILL_STALL_TIMEOUT_S", 60.0))
 
         # list of requests in each PP micro batch
         self.num_micro_batches = max(self.dist.pp_size,
@@ -588,6 +586,14 @@ def __init__(
         def on_detected():
             logger.error(
                 f"Hang detected on rank {self.global_rank} in PyExecutor.")
+            # Surface a concrete error to local waiters (e.g.
+            # _await_single_response) the same way _event_loop_wrapper does,
+            # without calling _handle_errors here: _handle_errors triggers
+            # tp_gather/allgather collectives, which are unsafe to run from
+            # the hang-detector thread while the worker thread is hung.
+            if self._event_loop_error is None:
+                self._event_loop_error = RuntimeError(
+                    f"Hang detected on rank {self.global_rank} in PyExecutor.")
             self.shutdown_event.set()
             self.is_shutdown = True
 
@@ -2740,67 +2746,30 @@ def _prepare_and_schedule_batch(self):
             # scheduler could not allocate KV for any of them, the benchmark
             # will hang forever because in-progress generation requests won't
             # release their KV cache.
-            #
-            # Only watch during the fill phase: once fill completes the count
-            # stays at its target value through the entire decode, which would
-            # otherwise look like a stall. With ADP, requests are sharded
-            # across TP ranks so the comparison must use the global count
-            # (allgather) against the global target.
-            if (self.is_benchmark_disagg and self._benchmark_fill_phase_active
-                    and not self.is_warmup):
-                # NOTE: keep the gate condition free of any per-rank state
-                # (e.g. `fitting_disagg_gen_init_requests`).  The
-                # `tp_allgather` below is a collective and every ADP rank
-                # must participate together; otherwise ranks desync and a
-                # later allgather mixes payload shapes (list[int] from
-                # gather_all_rank_states vs int from the gate's
-                # _is_benchmark_disagg_fill_complete), producing TypeErrors
-                # like "argument after * must be an iterable, not int" or
-                # "unsupported operand type(s) for +: 'int' and 'list'".
-                # The per-rank "still has fitting requests" hint is folded
-                # into the same allgather so we can suppress the stall
-                # check globally when any rank is still making progress.
-                local_ready_gen = sum(
-                    1 for req in self.active_requests if req.state in (
-                        LlmRequestState.DISAGG_GENERATION_TRANS_COMPLETE,
-                        LlmRequestState.GENERATION_IN_PROGRESS,
-                    ))
-                local_has_fitting = 1 if fitting_disagg_gen_init_requests else 0
-                if self.enable_attention_dp:
-                    responses = self.dist.tp_allgather(
-                        [local_ready_gen, local_has_fitting])
-                    total_ready_gen = sum(r[0] for r in responses)
-                    any_rank_has_fitting = any(r[1] for r in responses)
-                else:
-                    total_ready_gen = local_ready_gen
-                    any_rank_has_fitting = bool(local_has_fitting)
-
-                if not any_rank_has_fitting:
-                    now = time.time()
-                    last_count = getattr(self, "_bench_disagg_last_gen_count",
-                                         None)
-                    last_change_time = getattr(
-                        self, "_bench_disagg_last_gen_count_time", None)
-                    if (last_count != total_ready_gen
-                            or last_change_time is None):
-                        self._bench_disagg_last_gen_count = total_ready_gen
-                        self._bench_disagg_last_gen_count_time = now
-                    elif (now - last_change_time
-                          > self.benchmark_fill_stall_timeout_s
-                          and total_ready_gen < self.benchmark_req_queues_size):
-                        error_msg = (
-                            f"Benchmark gen request count stalled at "
-                            f"{total_ready_gen} "
-                            f"for {now - last_change_time:.0f}s "
-                            f"(target {self.benchmark_req_queues_size}, "
-                            f"fetched={self.num_fetch_requests}). "
-                            f"Likely causes: KV transfer stuck, KV cache pool "
-                            f"too small, or transceiver deadlock. Aborting all "
-                            f"active requests.")
-                        logger.error(error_msg)
-                        self._handle_errors(error_msg,
-                                            requests=self.active_requests)
-                        return None, None
+            if (self.benchmark_req_queues_size > 0 and not self.is_warmup
+                    and not fitting_disagg_gen_init_requests):
+                stuck_init_requests = [
+                    req for req in self.active_requests
+                    if req.is_disagg_generation_init_state
+                ]
+                # Only fail once all benchmark requests have been fetched
+                # so that _handle_errors covers every request and every
+                # client receives an error response.
+                if (stuck_init_requests and self.num_fetch_requests
+                        >= self.benchmark_req_queues_size):
+                    error_msg = (
+                        f"Insufficient KV cache for gen-only benchmark mode: "
+                        f"{len(stuck_init_requests)} request(s) are waiting for "
+                        f"KV cache allocation but the scheduler could not fit "
+                        f"any of them. Increase free_gpu_memory_fraction or "
+                        f"reduce TLLM_BENCHMARK_REQ_QUEUES_SIZE (currently "
+                        f"{self.benchmark_req_queues_size}).")
+                    logger.error(error_msg)
+                    # Fail every active request (including those still waiting
+                    # for KV cache) so no client hangs without an error.
+                    self._handle_errors(error_msg,
+                                        requests=self.active_requests)
+                    return None, None
 
         self.num_scheduled_requests = scheduled_batch.batch_size
         logger.debug(
@@ -4402,13 +4371,17 @@ def _check_disagg_ctx_schedulable_status(self,
     def _count_schedulable_active_requests(self) -> int:
         """Count active requests eligible for scheduling.
 
-        Excludes GENERATION_TO_COMPLETE (V2 scheduler skips state
-        >= GENERATION_TO_COMPLETE) and, in disaggregated mode, requests
-        still awaiting KV cache transfer.
+        Excludes GENERATION_TO_COMPLETE only under the V2 KV-cache manager,
+        whose scheduler skips state >= GENERATION_TO_COMPLETE. The V1
+        scheduler still forwards those requests, so excluding them there
+        would undercount and spuriously insert an ADP dummy on top of a real
+        request -- overflowing a small batch (e.g. max_batch_size=1). In
+        disaggregated mode, also exclude requests still awaiting KV transfer.
         """
 
         def _is_to_complete(req) -> bool:
-            return req.state == LlmRequestState.GENERATION_TO_COMPLETE
+            return (self._is_kv_manager_v2
+                    and req.state == LlmRequestState.GENERATION_TO_COMPLETE)
 
         if self.kv_cache_transceiver is None:
             return sum(1 for req in self.active_requests
@@ -4456,14 +4429,21 @@ def _should_skip_dummy_for_benchmark_disagg(
     def _update_adp_dummy_role(self, candidates: List[LlmRequest]) -> None:
         if not self.enable_attention_dp or self.kv_cache_transceiver is None:
             return
+        has_ctx = False
+        has_gen = False
         for req in candidates:
             rt = getattr(req, "llm_request_type", None)
             if rt == LlmRequestType.LLMREQUEST_TYPE_CONTEXT_ONLY:
-                self._adp_dummy_is_gen = False
-                return
-            if rt == LlmRequestType.LLMREQUEST_TYPE_GENERATION_ONLY:
-                self._adp_dummy_is_gen = True
-                return
+                has_ctx = True
+            elif rt == LlmRequestType.LLMREQUEST_TYPE_GENERATION_ONLY:
+                has_gen = True
+        # Prefer the CTX role when both types are present this iteration: a CTX
+        # dummy is padded to max_num_tokens so idle ranks keep MoE all-to-all
+        # token counts comparable with ranks doing real context work.
+        if has_ctx:
+            self._adp_dummy_is_gen = False
+        elif has_gen:
+            self._adp_dummy_is_gen = True
 
     @nvtx_range("_pad_attention_dp_dummy_request")
     def _pad_attention_dp_dummy_request(self):
@@ -5523,12 +5503,14 @@ def _handle_responses(self, emit_first_iter: bool = True):
                 bool(timed_out_requests)))
             if any_timed_out:
                 self._handle_errors(error_msg="Request timed out (KV transfer)",
-                                    requests=timed_out_requests)
+                                    requests=timed_out_requests,
+                                    charge_budget=False)
         else:
             for req in timed_out_requests:
                 self._handle_errors(
                     error_msg=f"Request {req.py_request_id} timed out",
-                    requests=[req])
+                    requests=[req],
+                    charge_budget=False)
         return requests_to_terminate + requests_finished_by_transfer
 
     def _await_any_response(self,