[TRTLLM-12669][fix] Pre-capture both greedy and advanced sampling CUDA graphs during warmup

zhaoyangwang-nvidia · zhaoyangwang-nvidia · commit 485792d08782 · 2026-06-03T00:44:19.000-07:00
On-the-fly CUDA graph capture is disabled outside the warmup window
(allow_capture context manager) because it can resize the shared
cuda_graph_workspace tensor and invalidate addresses baked into previously
captured graphs. As a result, the (is_all_greedy_sample=False) graph key
introduced for one-engine spec dec was never captured: warmup only ran
dummy requests with greedy sampling params, so inference batches with
temperature / top_k / top_p fell back to eager.

Fix: run the warmup capture loop twice for one-engine spec dec. The first
pass captures the greedy fast-path (existing behavior). The second pass
flips spec_metadata.is_all_greedy_sample to False before forward so
maybe_get_cuda_graph computes the non-greedy key, and sets a runtime
attribute that populate_sampling_params_for_one_model honors to override
the dummy-request-derived greedy detection and substitute synthetic
non-greedy values into the per-request buffers.

Other paths are unaffected: the second pass runs only for one-engine spec
dec. Non-one-engine spec dec and non-spec dec default is_all_greedy_sample
to True and never key into the non-greedy variant, so it is skipped there.

End-to-end (qwen3_8b_eagle3, bs=32, T=0.7/top_k=50/top_p=0.9; percentages
are relative to the rej_off baseline):
  rej_off baseline:        TPS=3713.73
  rej_on (before fix):     TPS=3854.01 (+3.8%; non-greedy ran eager)
  rej_on (after fix):      TPS=6013.58 (+61.9%; non-greedy uses graph)

Signed-off-by: ZhaoyangWang <zhaoyangw@nvidia.com>
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -1071,31 +1071,61 @@ def _capture_generation_cuda_graphs(self,
         else:
             max_seq_len_list = [effective_max_seq_len]
 
-        for bs, draft_len in graphs_to_capture:
-            if bs > self.batch_size:
-                continue
-
-            for max_seq_len in max_seq_len_list:
-                warmup_request = self._create_cuda_graph_warmup_request(
-                    resource_manager, bs, draft_len, max_seq_len)
-                with self._release_batch_context(warmup_request,
-                                                 resource_manager) as batch:
-                    if batch is None:
-                        # No KV cache space, cannot continue capturing graphs
+        def _run_capture_pass(force_non_greedy: bool, label: str) -> None:
+            spec_metadata = getattr(self, 'spec_metadata', None)
+            if force_non_greedy and spec_metadata is not None:
+                spec_metadata._force_non_greedy_for_capture = True
+                # maybe_get_cuda_graph reads spec_metadata.is_all_greedy_sample
+                # to build the graph cache key BEFORE populate runs inside
+                # _prepare_inputs. Pre-flip it here so the very first capture
+                # in this pass uses the non-greedy key; the warmup override in
+                # populate then keeps it False on every subsequent iteration.
+                spec_metadata.is_all_greedy_sample = False
+            try:
+                for bs, draft_len in graphs_to_capture:
+                    if bs > self.batch_size:
                         continue
-                    logger.info(
-                        f"Run generation-only CUDA graph warmup for batch size={bs}, draft_len={draft_len}, max_seq_len={max_seq_len}"
-                    )
-                    self.enable_spec_decode = draft_len > 0 or self.is_draft_model or (
-                        self.spec_config is not None
-                        and self.spec_config.spec_dec_mode.use_one_engine())
-                    self._update_draft_inference_state_for_warmup(
-                        batch, draft_len > 0, resource_manager)
-                    self.runtime_draft_len = draft_len
-                    self.forward(batch,
-                                 new_tensors_device=None,
-                                 resource_manager=resource_manager)
-                    torch.cuda.synchronize()
+
+                    for max_seq_len in max_seq_len_list:
+                        warmup_request = self._create_cuda_graph_warmup_request(
+                            resource_manager, bs, draft_len, max_seq_len)
+                        with self._release_batch_context(
+                                warmup_request, resource_manager) as batch:
+                            if batch is None:
+                                # No KV cache space, cannot continue capturing graphs
+                                continue
+                            logger.info(
+                                f"Run generation-only CUDA graph warmup ({label}) "
+                                f"for batch size={bs}, draft_len={draft_len}, "
+                                f"max_seq_len={max_seq_len}")
+                            self.enable_spec_decode = draft_len > 0 or self.is_draft_model or (
+                                self.spec_config is not None and
+                                self.spec_config.spec_dec_mode.use_one_engine())
+                            self._update_draft_inference_state_for_warmup(
+                                batch, draft_len > 0, resource_manager)
+                            self.runtime_draft_len = draft_len
+                            self.forward(batch,
+                                         new_tensors_device=None,
+                                         resource_manager=resource_manager)
+                            torch.cuda.synchronize()
+            finally:
+                if force_non_greedy and spec_metadata is not None:
+                    spec_metadata._force_non_greedy_for_capture = False
+
+        # Pass 1: greedy fast-path (dummy requests carry no sampling params,
+        # so is_all_greedy_sample is naturally True).
+        _run_capture_pass(force_non_greedy=False, label="greedy")
+        # Pass 2: advanced sampling variant. Required because on-the-fly capture
+        # is disabled outside warmup, so any inference batch that contains a
+        # non-greedy request would otherwise fall back to eager. Only meaningful
+        # for one-engine spec dec (where is_all_greedy_sample participates in
+        # the graph key); other paths default to True and would never key into
+        # this variant.
+        needs_non_greedy_capture = (
+            self.spec_config is not None
+            and self.spec_config.spec_dec_mode.use_one_engine())
+        if needs_non_greedy_capture:
+            _run_capture_pass(force_non_greedy=True, label="advanced sampling")
         # Set the value back to the original value after cuda graph warmups are complete
         self.enable_spec_decode = self.is_spec_decode
 
diff --git a/tensorrt_llm/_torch/speculative/interface.py b/tensorrt_llm/_torch/speculative/interface.py
@@ -647,6 +647,23 @@ def _normalize_request_sampling_params(
         self.is_all_greedy_sample = (self.skip_temperature and self.skip_top_k
                                      and self.skip_top_p)
 
+        # Warmup-time override (set via runtime attribute by the model engine):
+        # force the advanced-sampling code path so the CUDA graph for the
+        # (is_all_greedy_sample=False) key gets captured. Dummy warmup requests
+        # carry no sampling params, so the natural detection above always
+        # returns True; this branch substitutes synthetic non-greedy scalars
+        # into the per-request data and lets Phase 2 run normally to populate
+        # the GPU buffers used by the captured kernels.
+        if getattr(self, '_force_non_greedy_for_capture', False):
+            self.skip_temperature = False
+            self.skip_top_k = False
+            self.skip_top_p = False
+            self.is_all_greedy_sample = False
+            per_request_normalized = [
+                (0.7, 50, 0.9, num_tokens)
+                for (_, _, _, num_tokens) in per_request_normalized
+            ]
+
         tokens_per_request = (self.max_total_draft_tokens + 1 if
                               self.is_spec_dec_tree else self.max_draft_len + 1)
         required_flat_size = tokens_per_request * self.max_num_requests