[TRTLLM-12669][fix] refresh is_all_greedy_sample before CUDA graph key selection

zhaoyangwang-nvidia · zhaoyangwang-nvidia · commit 764edb76f74d · 2026-06-12T20:15:33.000-07:00
The one-engine CUDA graph key includes is_all_greedy_sample to dispatch
between the argmax fast-path and the advanced-sampling graph variant. The flag
was only (re)computed inside populate_sampling_params_for_one_model, which runs
in _prepare_inputs AFTER maybe_get_cuda_graph has already built the key. The key
therefore used the previous iteration's stale flag, and warmup left it False
(from the advanced-sampling capture pass). On the first real decode iteration a
greedy batch would then replay the advanced-sampling graph while populate skips
filling the sampling/draft_probs buffers, reading uninitialized slot-indexed
data. For MTP with num_nextn>=2 this hung the executor (Hang detected on rank 0).

Fix:
- Extract the greediness detection into _scan_one_model_sampling (single source
  of truth) and add update_is_all_greedy_sample, called before the graph key is
  built so the key matches the buffers populate fills. populate now reuses the
  same scan.
- Defensively reset spec_metadata.is_all_greedy_sample to True after CUDA graph
  warmup so the stale capture-only False does not seed the first iteration.

Signed-off-by: ZhaoyangWang <zhaoyangw@nvidia.com>
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -1331,6 +1331,13 @@ def _run_capture_pass(force_non_greedy: bool, label: str) -> None:
             _run_capture_pass(force_non_greedy=True, label="advanced sampling")
         # Set the value back to the original value after cuda graph warmups are complete
         self.enable_spec_decode = self.is_spec_decode
+        # The advanced-sampling capture pass above leaves is_all_greedy_sample
+        # set to False on spec_metadata. Reset it to the default so the first
+        # real iteration's graph-key selection is not seeded with this
+        # capture-only value. (update_is_all_greedy_sample refreshes it every
+        # iteration; this is a defensive guard.)
+        if self.spec_metadata is not None:
+            self.spec_metadata.is_all_greedy_sample = True
 
     def _capture_piecewise_cuda_graphs(self, resource_manager: ResourceManager):
         """Captures piecewise CUDA graphs for context/prefill steps via torch.compile."""
@@ -4584,6 +4591,17 @@ def forward(self,
                 scheduled_requests, resource_manager,
                 self.runtime_draft_len) as padded_requests:
 
+            # Refresh is_all_greedy_sample for the *current* batch BEFORE the
+            # CUDA graph key is built below. The key includes this flag to pick
+            # the argmax vs advanced-sampling graph variant; populate (inside
+            # _prepare_inputs) runs later and fills the matching GPU buffers.
+            # Without this pre-scan the key would use the previous iteration's
+            # stale value and could replay the advanced graph against
+            # unpopulated (greedy) buffers, hanging the run (e.g. MTP nextn>=2).
+            if spec_metadata is not None:
+                spec_metadata.update_is_all_greedy_sample(
+                    padded_requests.all_requests())
+
             maybe_attn_metadata, maybe_spec_metadata, key = self.cuda_graph_runner.maybe_get_cuda_graph(
                 padded_requests,
                 enable_spec_decode=self.enable_spec_decode,
diff --git a/tensorrt_llm/_torch/speculative/interface.py b/tensorrt_llm/_torch/speculative/interface.py
@@ -36,6 +36,7 @@
 
 if TYPE_CHECKING:
     from ..pyexecutor.guided_decoder import CapturableGuidedDecoder
+    from ..pyexecutor.llm_request import LlmRequest
 
 if IS_FLASHINFER_AVAILABLE:
     import flashinfer
@@ -574,25 +575,20 @@ def maybe_capture_hidden_states(self, layer_id: int,
         model. Use this method to record them. By default, does nothing.
         """
 
-    def populate_sampling_params_for_one_model(
-            self, requests: list["LlmRequest"]) -> None:
-        """
-        Set up topp/topk/temperatures for 1-model sampler.
+    def _scan_one_model_sampling(
+        self, requests: list["LlmRequest"]
+    ) -> tuple[list[tuple[float, int, float, int]], list[int]]:
+        """Single source of truth for one-engine sampling-param detection.
 
-        Scans sampling configs to set skip_*/is_all_greedy_sample flags. When
-        any request needs sampling, also builds per-token/per-request lists
-        and copies them to GPU buffers; all-greedy batches skip this entirely.
+        Scans the batch's sampling configs and sets skip_*/has_greedy_requests/
+        is_all_greedy_sample (honoring the warmup capture override). Returns
+        ``(per_request_normalized, per_request_slot_ids)`` for buffer
+        population. Does NOT allocate or fill GPU buffers, so it is safe to call
+        before the CUDA graph key is built.
         """
         from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequestState
         from tensorrt_llm.sampling_params import SamplingParams
 
-        if not self.spec_dec_mode.use_one_engine():
-            return
-
-        if self.temperatures is None:
-            # Ensures determinism across ranks.
-            torch.manual_seed(0)
-
         # Need to use a very small value for temperature when disabled to avoid division by 0
         DISABLE_TEMP_VAL = 1e-5
         # Very large values disable topk.
@@ -708,6 +704,44 @@ def _normalize_request_sampling_params(
                 for (_, _, _, num_tokens) in per_request_normalized
             ]
 
+        return per_request_normalized, per_request_slot_ids
+
+    def update_is_all_greedy_sample(self, requests: list["LlmRequest"]) -> None:
+        """Refresh ``is_all_greedy_sample`` for the *current* batch.
+
+        Must be called BEFORE the CUDA graph key is built (the key includes
+        ``is_all_greedy_sample`` to choose the argmax vs advanced-sampling graph
+        variant). ``populate_sampling_params_for_one_model`` runs later, inside
+        ``_prepare_inputs``, and re-derives the same flag while filling the GPU
+        sampling buffers. Computing the flag here first keeps the selected graph
+        consistent with the buffers ``populate`` fills; otherwise the key would
+        use the previous iteration's stale value and could replay the advanced
+        graph against unpopulated (greedy) buffers, which can hang/corrupt the
+        run (notably for MTP with num_nextn>=2).
+        """
+        if not self.spec_dec_mode.use_one_engine():
+            return
+        self._scan_one_model_sampling(requests)
+
+    def populate_sampling_params_for_one_model(
+            self, requests: list["LlmRequest"]) -> None:
+        """
+        Set up topp/topk/temperatures for 1-model sampler.
+
+        Scans sampling configs to set skip_*/is_all_greedy_sample flags. When
+        any request needs sampling, also builds per-token/per-request lists
+        and copies them to GPU buffers; all-greedy batches skip this entirely.
+        """
+        if not self.spec_dec_mode.use_one_engine():
+            return
+
+        if self.temperatures is None:
+            # Ensures determinism across ranks.
+            torch.manual_seed(0)
+
+        per_request_normalized, per_request_slot_ids = (
+            self._scan_one_model_sampling(requests))
+
         tokens_per_request = (self.max_total_draft_tokens + 1 if
                               self.is_spec_dec_tree else self.max_draft_len + 1)
         # Warmup batches may exceed max_num_requests * tokens_per_request (e.g.