[TRTLLM-12669][chore] Address review feedback

zhaoyangwang-nvidia · zhaoyangwang-nvidia · commit d6fd85256b9b · 2026-06-03T19:52:01.000-07:00
- llm_args: keep allow_advanced_sampling as a deprecated no-op field
  with a logger warning when explicitly set, so removing it isn't an
  abrupt API break
- llm_args: add TODO above the Eagle3-only rejection-sampling whitelist
  to track extending support to MTP / DraftTarget / PARD / DFlash /
  SaveHiddenStates / SA and unifying the dispatch in SpecMetadata
- cuda_graph_runner: type spec_metadata as Optional[SpecMetadata]
  instead of Optional[Any]
- model_engine: always initialize self.spec_metadata = None so the
  capture-pass can access it directly without a getattr() fallback
- eagle3.draft_decoder: drop the dead Optional/_draft_sampler_greedy
  fallback; spec_metadata and batch_size are always passed by the
  sole caller

Signed-off-by: ZhaoyangWang <zhaoyangw@nvidia.com>
diff --git a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
@@ -19,6 +19,7 @@
 from ..memory_buffer_utils import get_memory_buffers
 from ..modules.multi_stream_utils import with_multi_stream
 from ..speculative.eagle3 import Eagle3ResourceManager
+from ..speculative.interface import SpecMetadata
 from ..speculative.spec_sampler_base import SampleStateTensorsSpec
 from ..speculative.utils import get_draft_kv_cache_manager
 from ..utils import make_weak_ref, piecewise_cuda_graph
@@ -206,7 +207,7 @@ def get_graph_key(
             batch: ScheduledRequests,
             new_tensors_device: Optional[SampleStateTensors] = None,
             spec_resource_manager: Optional[BaseResourceManager] = None,
-            spec_metadata: Optional[Any] = None):
+            spec_metadata: Optional[SpecMetadata] = None):
         batch_size = batch.batch_size
 
         # Get the sequence length mode.
@@ -248,7 +249,7 @@ def maybe_get_cuda_graph(
         batch: ScheduledRequests,
         enable_spec_decode: bool,
         attn_metadata: Any,
-        spec_metadata: Optional[Any] = None,
+        spec_metadata: Optional[SpecMetadata] = None,
         draft_tokens_cuda: Optional[torch.Tensor] = None,
         new_tensors_device: Optional[SampleStateTensors] = None,
         spec_resource_manager: Optional[BaseResourceManager] = None,
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -484,7 +484,6 @@ def __init__(
             sparse_attn_config=self.sparse_attention_config)
 
         if self.is_spec_decode:
-            self.spec_metadata = None
             update_spec_config_from_model_config(self.spec_config,
                                                  self.model.config)
             max_num_draft_tokens = self.max_draft_loop_tokens * self.batch_size
@@ -538,6 +537,7 @@ def __init__(
         # the model engine.
         self.attn_metadata = None
         self.encoder_attn_metadata = None
+        self.spec_metadata = None
         self.iter_states = {}
         self._cuda_graph_mem_pool = self._torch_compile_backend._graph_pool_handle if self._torch_compile_enabled else None
 
@@ -1214,7 +1214,7 @@ def _capture_generation_cuda_graphs(self,
             max_seq_len_list = [effective_max_seq_len]
 
         def _run_capture_pass(force_non_greedy: bool, label: str) -> None:
-            spec_metadata = getattr(self, 'spec_metadata', None)
+            spec_metadata = self.spec_metadata
             if force_non_greedy and spec_metadata is not None:
                 spec_metadata._force_non_greedy_for_capture = True
                 # maybe_get_cuda_graph reads spec_metadata.is_all_greedy_sample
diff --git a/tensorrt_llm/_torch/speculative/eagle3.py b/tensorrt_llm/_torch/speculative/eagle3.py
@@ -819,14 +819,13 @@ def draft_decoder(
         self,
         logits: torch.Tensor,
         draft_model: nn.Module,
-        spec_metadata: Optional[Eagle3OneModelSpecMetadata] = None,
-        batch_size: Optional[int] = None,
+        spec_metadata: Eagle3OneModelSpecMetadata,
+        batch_size: int,
         draft_step: Optional[int] = None,
     ):
         '''
-        Sample draft tokens. When spec_metadata + batch_size are provided, use
-        the target's per-request sampling params (temperature/top_k/top_p);
-        otherwise fall back to argmax.
+        Sample draft tokens using the target's per-request sampling params
+        (temperature/top_k/top_p).
 
         When rejection sampling is enabled and draft_step is provided, take the
         single-pass path that also scatters the draft prob distribution into the
@@ -835,23 +834,20 @@ def draft_decoder(
         Args:
             logits: [batch_size, vocab_size] - Draft model logits.
             draft_model: The draft model.
-            spec_metadata: Carries per-request sampling param tensors. When
-                None, sampling is forced greedy.
+            spec_metadata: Carries per-request sampling param tensors.
             batch_size: Active requests, used to slice per-request tensors.
             draft_step: Current draft step index (0..max_draft_len-1). Required
                 for the rejection-sampling code path so probs are written to
                 the correct slice of spec_metadata.draft_probs.
         '''
 
         d2t = getattr(draft_model.model, "d2t", None)
-        if spec_metadata is not None and batch_size is not None:
-            if (spec_metadata.use_rejection_sampling and draft_step is not None
-                    and not spec_metadata.is_all_greedy_sample):
-                return self._draft_sampler_advanced_for_rejection(
-                    logits, spec_metadata, batch_size, d2t, draft_step)
-            return self._draft_sampler_advanced(logits, spec_metadata,
-                                                batch_size, d2t)
-        return self._draft_sampler_greedy(logits, d2t)
+        if (spec_metadata.use_rejection_sampling and draft_step is not None
+                and not spec_metadata.is_all_greedy_sample):
+            return self._draft_sampler_advanced_for_rejection(
+                logits, spec_metadata, batch_size, d2t, draft_step)
+        return self._draft_sampler_advanced(logits, spec_metadata, batch_size,
+                                            d2t)
 
     def prepare_1st_drafter_inputs(
         self,
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
@@ -1066,6 +1066,14 @@ class DecodingBaseConfig(StrictBaseModel):
         "false to fall back to exact-match verification on non-greedy batches. "
         "The non-dynamic-tree one-model path requires FlashInfer.")
 
+    allow_advanced_sampling: bool = Field(
+        default=False,
+        status="deprecated",
+        description=
+        "DEPRECATED: no-op kept for backward compatibility. Will be removed "
+        "in a future release. Non-greedy sampling is now auto-detected per "
+        "request; this flag no longer has any effect.")
+
     # If set, drafting is allowed to use chain drafter.
     _allow_chain_drafter: bool = PrivateAttr(True)
     # If set, drafting uses greedy sampling, irrespective of sampling parameters.
@@ -1130,6 +1138,23 @@ def validate_rejection_sampling_config(self):
             self.use_rejection_sampling = False
         return self
 
+    @model_validator(mode='before')
+    @classmethod
+    def _warn_deprecated_allow_advanced_sampling(cls, data):
+        """Warn when users set the deprecated allow_advanced_sampling flag.
+
+        Non-greedy sampling is now auto-detected per request and always
+        available, so the flag is a no-op; warn loudly so callers update
+        their configs before the flag is removed.
+        """
+        if isinstance(data, dict) and 'allow_advanced_sampling' in data:
+            logger.warning(
+                "DecodingBaseConfig: 'allow_advanced_sampling' is deprecated "
+                "and will be removed in a future release. The flag has no "
+                "effect — non-greedy sampling is now auto-detected per "
+                "request.")
+        return data
+
     @model_validator(mode='after')
     # 1. Validate that max_concurrency and draft_len_schedule are mutually exclusive.
     # 2. If max_concurrency is set, translate it to the corresponding draft_len_schedule.
@@ -4423,6 +4448,11 @@ def validate_speculative_config(self):
                 # Rejection sampling is only wired up for Eagle3 one-model paths.
                 # Silently fall back for other spec types so the new default
                 # (True) does not break them.
+                # TODO: extend rejection sampling to the remaining speculative
+                # decoding paths (MTP / DraftTarget / PARD / DFlash /
+                # SaveHiddenStates / SA) and unify the dispatch in SpecMetadata
+                # so new spec algorithms get rejection sampling for free; once
+                # all paths are covered this whitelist guard can be removed.
                 self.speculative_config.use_rejection_sampling = False
 
             if isinstance(self.speculative_config, PARDDecodingConfig):