[None][fix] Restore benchmark-disagg immediate fail-fast in _prepare_and_schedule_batch

Shixiaowei02 · Shixiaowei02 · commit d29ca88d286f · 2026-06-10T19:26:17.000-07:00
The cherry-pick of #14042-era "Fix deepseekv4 stall" replaced main's immediate "Insufficient KV cache for gen-only benchmark mode" guard with a time-based gen-count stall watchdog.

On current main that watchdog is both superseded and incompatible: main already handles the ADP fill-completion case via _is_benchmark_disagg_fill_complete (per-rank allgather), and the time-based watchdog never fires on a single scheduling iteration, so it regressed tests/unittest/_torch/executor/test_benchmark_disagg.py (TestFailFastDuringBenchmarkFill, TestFillPhaseEndToEnd) which assert an immediate fail-fast when all benchmark requests are fetched (or the fill phase is over) and the scheduler can fit no INIT request.

Restore main's immediate guard (fail once all requests are fetched and no INIT request fits, suppressed during warmup) and drop the now-unused benchmark_fill_stall_timeout_s.

Verified on 8xB200: test_benchmark_disagg.py, test_py_executor.py and the tp4_pp2_dp_both transceiver cases all pass (240 passed, 0 failed).

Signed-off-by: Shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -504,8 +504,6 @@ def __init__(
         self.num_scheduled_requests: int = 0
         self.benchmark_req_queues_size = int(
             os.environ.get("TLLM_BENCHMARK_REQ_QUEUES_SIZE", 0))
-        self.benchmark_fill_stall_timeout_s = float(
-            os.environ.get("TLLM_BENCHMARK_FILL_STALL_TIMEOUT_S", 60.0))
 
         # list of requests in each PP micro batch
         self.num_micro_batches = max(self.dist.pp_size,
@@ -2672,67 +2670,30 @@ def _prepare_and_schedule_batch(self):
             # scheduler could not allocate KV for any of them, the benchmark
             # will hang forever because in-progress generation requests won't
             # release their KV cache.
-            #
-            # Only watch during the fill phase: once fill completes the count
-            # stays at its target value through the entire decode, which would
-            # otherwise look like a stall. With ADP, requests are sharded
-            # across TP ranks so the comparison must use the global count
-            # (allgather) against the global target.
-            if (self.is_benchmark_disagg and self._benchmark_fill_phase_active
-                    and not self.is_warmup):
-                # NOTE: keep the gate condition free of any per-rank state
-                # (e.g. `fitting_disagg_gen_init_requests`).  The
-                # `tp_allgather` below is a collective and every ADP rank
-                # must participate together; otherwise ranks desync and a
-                # later allgather mixes payload shapes (list[int] from
-                # gather_all_rank_states vs int from the gate's
-                # _is_benchmark_disagg_fill_complete), producing TypeErrors
-                # like "argument after * must be an iterable, not int" or
-                # "unsupported operand type(s) for +: 'int' and 'list'".
-                # The per-rank "still has fitting requests" hint is folded
-                # into the same allgather so we can suppress the stall
-                # check globally when any rank is still making progress.
-                local_ready_gen = sum(
-                    1 for req in self.active_requests if req.state in (
-                        LlmRequestState.DISAGG_GENERATION_TRANS_COMPLETE,
-                        LlmRequestState.GENERATION_IN_PROGRESS,
-                    ))
-                local_has_fitting = 1 if fitting_disagg_gen_init_requests else 0
-                if self.enable_attention_dp:
-                    responses = self.dist.tp_allgather(
-                        [local_ready_gen, local_has_fitting])
-                    total_ready_gen = sum(r[0] for r in responses)
-                    any_rank_has_fitting = any(r[1] for r in responses)
-                else:
-                    total_ready_gen = local_ready_gen
-                    any_rank_has_fitting = bool(local_has_fitting)
-
-                if not any_rank_has_fitting:
-                    now = time.time()
-                    last_count = getattr(self, "_bench_disagg_last_gen_count",
-                                         None)
-                    last_change_time = getattr(
-                        self, "_bench_disagg_last_gen_count_time", None)
-                    if (last_count != total_ready_gen
-                            or last_change_time is None):
-                        self._bench_disagg_last_gen_count = total_ready_gen
-                        self._bench_disagg_last_gen_count_time = now
-                    elif (now - last_change_time
-                          > self.benchmark_fill_stall_timeout_s
-                          and total_ready_gen < self.benchmark_req_queues_size):
-                        error_msg = (
-                            f"Benchmark gen request count stalled at "
-                            f"{total_ready_gen} "
-                            f"for {now - last_change_time:.0f}s "
-                            f"(target {self.benchmark_req_queues_size}, "
-                            f"fetched={self.num_fetch_requests}). "
-                            f"Likely causes: KV transfer stuck, KV cache pool "
-                            f"too small, or transceiver deadlock. Aborting all "
-                            f"active requests.")
-                        logger.error(error_msg)
-                        self._handle_errors(error_msg,
-                                            requests=self.active_requests)
-                        return None, None
+            if (self.benchmark_req_queues_size > 0 and not self.is_warmup
+                    and not fitting_disagg_gen_init_requests):
+                stuck_init_requests = [
+                    req for req in self.active_requests
+                    if req.is_disagg_generation_init_state
+                ]
+                # Only fail once all benchmark requests have been fetched
+                # so that _handle_errors covers every request and every
+                # client receives an error response.
+                if (stuck_init_requests and self.num_fetch_requests
+                        >= self.benchmark_req_queues_size):
+                    error_msg = (
+                        f"Insufficient KV cache for gen-only benchmark mode: "
+                        f"{len(stuck_init_requests)} request(s) are waiting for "
+                        f"KV cache allocation but the scheduler could not fit "
+                        f"any of them. Increase free_gpu_memory_fraction or "
+                        f"reduce TLLM_BENCHMARK_REQ_QUEUES_SIZE (currently "
+                        f"{self.benchmark_req_queues_size}).")
+                    logger.error(error_msg)
+                    # Fail all active and waiting requests so every
+                    # client receives an error instead of hanging.
+                    self._handle_errors(error_msg,
+                                        requests=self.active_requests)
+                    return None, None
 
         self.num_scheduled_requests = scheduled_batch.batch_size
         logger.debug(