[TRTLLM-12669][perf] Reuse draft probs to drop redundant softmax + cut rejection-path overhead

zhaoyangwang-nvidia · zhaoyangwang-nvidia · commit e173cbfb89a2 · 2026-06-03T06:06:43.000-07:00
This commit refactors the rejection-sampling draft path to compute the
filtered + normalized prob distribution exactly once per draft step, and
folds three independent optimizations into one coherent change:

1. Single-pass compute_probs + sample on draft side
   _draft_sampler_advanced_for_rejection now calls a new
   sampling_batch_spec_dec_one_model_for_rejection which returns both the
   sampled token AND the probs in one go. The probs are scattered into the
   slot-indexed draft_probs buffer immediately, so the previous separate
   _compute_and_store_draft_probs path (which redundantly re-ran
   temperature + top_k + top_p + softmax on the cloned logits) is gone.

2. Faster compute_probs_from_logits via flashinfer fast path
   compute_probs_from_logits now composes flashinfer's radix-based O(N)
   kernels (top_k_mask_logits → fused softmax+temp → top_p_renorm_probs)
   when CUDA + flashinfer are available. The previous C++ op path triggered
   torch.sort fallback (O(N log N) per row) due to a hard-coded kMax=0,
   which severely under-utilized SMs at small batch sizes. C++ op and
   PyTorch CPU paths are retained as fallbacks.

3. Pre-allocated full_draft_probs buffer
   The (max_num_requests, max_draft_len, vocab_size) scratch used to pad
   draft probs to target vocab is now zero-filled once at prepare() and
   reused across iters, saving ~25 us/iter of 64 MB zero-fill. Only
   allocated when use_rejection_sampling=True.

The eagle3 draft loop is simplified accordingly: it no longer accumulates
a draft_logits_list or invokes _compute_and_store_draft_probs after the
loop; per-step scatter happens inside _draft_sampler_advanced_for_rejection
keyed on the (already-required) draft_step index.

Net effect on llama70b bs=32 (T=0.7/top_k=50/top_p=0.9, MT-bench 2000):
  ΔTPS recovered from -32% (post-refactor with sort fallback) and
  -12% (pre-refactor with double softmax) to ~-5% (flashinfer fast path).
The remaining gap is fundamental: llama70b's Eagle3 draft already tracks
the target closely (AR uplift only +2%), so the inherent rejection
sampling overhead (chain_speculative_sampling kernel + target_probs +
d2t padding ≈ 340 us/iter ≈ 1.5%) is not fully offset by the small AR
gain. qwen8b/qwen235b, with ΔAR of +9% to +14%, remain solidly net positive.

Signed-off-by: ZhaoyangWang <zhaoyangw@nvidia.com>
diff --git a/tensorrt_llm/_torch/speculative/eagle3.py b/tensorrt_llm/_torch/speculative/eagle3.py
@@ -661,7 +661,6 @@ def _forward_linear_draft_loop(self, inputs, attn_metadata, spec_metadata,
         """Original linear draft loop (1 token per layer)."""
         runtime_draft_len = spec_metadata.runtime_draft_len
         next_draft_tokens = []
-        draft_logits_list = []
         position_ids = inputs["position_ids"]
 
         with self.draft_kv_cache_context(attn_metadata, draft_kv_cache_manager):
@@ -714,11 +713,11 @@ def _forward_linear_draft_loop(self, inputs, attn_metadata, spec_metadata,
                                                             d2t,
                                                             draft_step=i)
 
-                if spec_metadata.use_rejection_sampling:
-                    draft_logits_list.append(logits.clone())
-
-                new_draft_token = self.draft_decoder(logits, draft_model,
-                                                     spec_metadata, batch_size)
+                new_draft_token = self.draft_decoder(logits,
+                                                     draft_model,
+                                                     spec_metadata,
+                                                     batch_size,
+                                                     draft_step=i)
                 next_draft_tokens.append(new_draft_token)
                 # update inputs
                 hidden_states = hidden_states_to_save[gather_ids]
@@ -759,19 +758,18 @@ def _forward_linear_draft_loop(self, inputs, attn_metadata, spec_metadata,
                 gen_draft_tokens)
             next_draft_tokens[num_contexts:] = gen_draft_tokens
 
-        # Skip when the whole batch is greedy: _can_use_rejection_sampling will
-        # bypass the rejection path anyway, so computing draft probs is wasted.
-        if (spec_metadata.use_rejection_sampling and draft_logits_list
-                and not spec_metadata.is_all_greedy_sample):
-            d2t_param = getattr(draft_model.model, "d2t", None)
-            spec_metadata.d2t = d2t_param.data if d2t_param is not None else None
-            self._compute_and_store_draft_probs(draft_logits_list,
-                                                spec_metadata, batch_size)
-        elif spec_metadata.use_rejection_sampling:
-            # No draft probs were written this iter (all-greedy or empty draft
-            # loop). Invalidate the buffer so the next iter does not read stale
-            # data if it transitions back to a non-greedy mix.
-            spec_metadata.draft_probs_valid = False
+        # Probs were already scattered into the slot-indexed buffer by
+        # _draft_sampler_advanced_for_rejection on each draft step (non-greedy
+        # batches only). All-greedy batches skip storage — rejection sampling
+        # will be bypassed by _can_use_rejection_sampling. Finalize the validity
+        # flag and d2t for next-iter target-side verification.
+        if spec_metadata.use_rejection_sampling:
+            if not spec_metadata.is_all_greedy_sample:
+                d2t_param = getattr(draft_model.model, "d2t", None)
+                spec_metadata.d2t = d2t_param.data if d2t_param is not None else None
+                spec_metadata.draft_probs_valid = True
+            else:
+                spec_metadata.draft_probs_valid = False
 
         return next_draft_tokens
 
@@ -802,22 +800,34 @@ def draft_decoder(
         draft_model: nn.Module,
         spec_metadata: Optional[Eagle3OneModelSpecMetadata] = None,
         batch_size: Optional[int] = None,
+        draft_step: Optional[int] = None,
     ):
         '''
         Sample draft tokens. When spec_metadata + batch_size are provided, use
         the target's per-request sampling params (temperature/top_k/top_p);
         otherwise fall back to argmax.
 
+        When rejection sampling is enabled and draft_step is provided, take the
+        single-pass path that also scatters the draft prob distribution into the
+        slot-indexed buffer (avoids a redundant softmax later).
+
         Args:
             logits: [batch_size, vocab_size] - Draft model logits.
             draft_model: The draft model.
             spec_metadata: Carries per-request sampling param tensors. When
                 None, sampling is forced greedy.
             batch_size: Active requests, used to slice per-request tensors.
+            draft_step: Current draft step index (0..max_draft_len-1). Required
+                for the rejection-sampling code path so probs are written to
+                the correct slice of spec_metadata.draft_probs.
         '''
 
         d2t = getattr(draft_model.model, "d2t", None)
         if spec_metadata is not None and batch_size is not None:
+            if (spec_metadata.use_rejection_sampling and draft_step is not None
+                    and not spec_metadata.is_all_greedy_sample):
+                return self._draft_sampler_advanced_for_rejection(
+                    logits, spec_metadata, batch_size, d2t, draft_step)
             return self._draft_sampler_advanced(logits, spec_metadata,
                                                 batch_size, d2t)
         return self._draft_sampler_greedy(logits, d2t)
diff --git a/tensorrt_llm/_torch/speculative/interface.py b/tensorrt_llm/_torch/speculative/interface.py
@@ -27,7 +27,8 @@
 
 from .one_model_sampler import (compute_probs_from_logits,
                                 rejection_sampling_one_model,
-                                sampling_batch_spec_dec_one_model)
+                                sampling_batch_spec_dec_one_model,
+                                sampling_batch_spec_dec_one_model_for_rejection)
 
 # Environment variable name for forcing the number of accepted tokens in speculative decoding
 FORCE_NUM_ACCEPTED_TOKENS_ENV_VAR = "TLLM_SPEC_DECODE_FORCE_NUM_ACCEPTED_TOKENS"
@@ -479,6 +480,13 @@ class SpecMetadata:
     batch_slot_ids: Optional[torch.Tensor] = None
     # Draft-to-target vocab offset tensor.
     d2t: Optional[torch.Tensor] = None
+    # Pre-allocated scratch for draft probs expanded to the target vocab size.
+    # Filled with zeros once at prepare(); each rejection iter only overwrites
+    # the positions selected by d2t (or [:draft_vocab] when there is no d2t),
+    # so the zeros outside those positions persist across iterations and we
+    # avoid a per-iter 64 MB zero-fill on the (max_num_requests, max_draft_len,
+    # vocab_size) tensor. Shape: [max_num_requests, max_draft_len, vocab_size].
+    full_draft_probs: Optional[torch.Tensor] = None
 
     def __post_init__(self):
         pass
@@ -501,6 +509,16 @@ def prepare(self):
             self.batch_slot_ids = torch.empty((self.max_num_requests, ),
                                               dtype=torch.long,
                                               device='cuda')
+        if (self.use_rejection_sampling and self.full_draft_probs is None
+                and self.vocab_size > 0):
+            # Zero-fill once. Subsequent iters only overwrite the d2t-mapped
+            # positions (constant across iters since d2t is model-static), so
+            # untouched positions stay 0 forever — saves the per-iter 64 MB
+            # zero-fill in _sample_and_accept_draft_tokens_rejection.
+            self.full_draft_probs = torch.zeros(
+                (self.max_num_requests, self.max_draft_len, self.vocab_size),
+                dtype=torch.float32,
+                device='cuda')
 
     def create_cuda_graph_metadata(self, max_batch_size: int):
         """
@@ -692,7 +710,7 @@ def _normalize_request_sampling_params(
 
         # Always-populate the per-request slot id table when rejection sampling
         # is configured: it's tiny (max_num_requests longs) and needed at
-        # _compute_and_store_draft_probs time to scatter draft probs by slot.
+        # draft-sampler time to scatter draft probs by slot.
         if self.use_rejection_sampling and self.batch_slot_ids is not None:
             self.batch_slot_ids[:len(per_request_slot_ids)].copy_(
                 torch.tensor(per_request_slot_ids,
@@ -1157,7 +1175,7 @@ def _sample_and_accept_draft_tokens_rejection(
                       spec_metadata.top_ps[gen_start:gen_end])
 
             target_probs_flat = compute_probs_from_logits(
-                gen_logits.clone(), temperatures, top_ks, top_ps)
+                gen_logits, temperatures, top_ks, top_ps)
             target_probs = target_probs_flat.reshape(num_gens,
                                                      runtime_draft_len + 1,
                                                      vocab_size)
@@ -1171,10 +1189,17 @@ def _sample_and_accept_draft_tokens_rejection(
                 f"{runtime_draft_len}")
             d2t = getattr(spec_metadata, "d2t", None)
             if draft_vocab_size != vocab_size:
-                full_draft_probs = torch.zeros(
-                    (num_gens, runtime_draft_len, vocab_size),
-                    dtype=torch.float32,
-                    device=device)
+                # Use the pre-allocated buffer from spec_metadata.prepare()
+                # (zero-filled once at init; untouched positions stay 0). Falls
+                # back to per-iter allocation if the buffer is not configured,
+                # e.g. when use_rejection_sampling was off at prepare() time.
+                if spec_metadata.full_draft_probs is not None:
+                    full_draft_probs = spec_metadata.full_draft_probs[:num_gens]
+                else:
+                    full_draft_probs = torch.zeros(
+                        (num_gens, runtime_draft_len, vocab_size),
+                        dtype=torch.float32,
+                        device=device)
                 if d2t is not None:
                     assert d2t.numel() == draft_vocab_size, (
                         f"d2t size mismatch: {d2t.numel()} != {draft_vocab_size}"
@@ -1295,62 +1320,71 @@ def _draft_sampler_advanced(
 
         return draft_tokens.type(torch.int32)
 
-    def _compute_and_store_draft_probs(
+    def _draft_sampler_advanced_for_rejection(
         self,
-        draft_logits_list: List[torch.Tensor],
-        spec_metadata: SpecMetadata,
+        logits: torch.Tensor,
+        spec_metadata: "SpecMetadata",
         batch_size: int,
+        d2t: Optional[torch.Tensor] = None,
+        draft_step: int = 0,
     ):
         """
-        Compute draft probabilities and store them for next-step rejection
-        sampling. The storage is keyed by py_seq_slot, so the data is robust
-        to batch composition shifts across iterations (chunking ctxs, gen
-        completion, new ctxs joining).
+        Rejection-sampling-aware variant of ``_draft_sampler_advanced``.
+
+        Single-pass compute + sample + scatter: computes the per-request prob
+        distribution once via ``compute_probs_from_logits`` (temperature +
+        top_k + top_p + softmax; on CUDA with flashinfer this is a chain of
+        flashinfer kernels), samples the draft token from that distribution,
+        and scatters the same probs into the slot-indexed
+        ``spec_metadata.draft_probs`` buffer for next-iter rejection
+        verification. Replaces the previous two-stage path (flashinfer fused
+        sampling kernel + a redundant softmax pass to store probs).
+
+        All-greedy batches take the cheaper argmax path —
+        ``_can_use_rejection_sampling`` will bypass rejection for those anyway.
         """
-        draft_tokens_per_request = len(draft_logits_list)
-        vocab_size = draft_logits_list[0].shape[-1]
-        device = draft_logits_list[0].device
-
-        draft_logits = torch.stack(draft_logits_list, dim=0)
-        draft_logits_flat = draft_logits.transpose(0, 1).reshape(-1, vocab_size)
-
-        num_draft_tokens = batch_size * draft_tokens_per_request
-        if spec_metadata.request_temperatures is not None:
-            draft_temps = spec_metadata.request_temperatures[:batch_size].repeat_interleave(
-                draft_tokens_per_request)
-            draft_top_ks = (
-                spec_metadata.request_top_ks[:batch_size].repeat_interleave(
-                    draft_tokens_per_request) if not spec_metadata.skip_top_k
-                and spec_metadata.request_top_ks is not None else None)
-            draft_top_ps = (
-                spec_metadata.request_top_ps[:batch_size].repeat_interleave(
-                    draft_tokens_per_request) if not spec_metadata.skip_top_p
-                and spec_metadata.request_top_ps is not None else None)
-        else:
-            draft_temps = torch.ones(num_draft_tokens, device=device)
-            draft_top_ks = None
-            draft_top_ps = None
-
-        draft_probs_flat = compute_probs_from_logits(draft_logits_flat,
-                                                     draft_temps, draft_top_ks,
-                                                     draft_top_ps)
-        # [batch_size, draft_len, draft_vocab]
-        draft_probs_per_request = draft_probs_flat.reshape(
-            batch_size, draft_tokens_per_request, vocab_size)
-
-        # Scatter into draft_probs[slot] for each request in the current batch.
-        # spec_metadata.draft_probs is shaped [max_num_requests, max_draft_len,
-        # vocab_size]. Different iterations may have different batch
-        # compositions, but a given request's data always lives at its
-        # py_seq_slot row, so reads at the next iter pick up the right data.
+        if spec_metadata.is_all_greedy_sample:
+            return self._draft_sampler_greedy(logits, d2t)
+
+        temperatures = spec_metadata.request_temperatures[:batch_size]
+        top_ks = spec_metadata.request_top_ks[:batch_size]
+        top_ps = spec_metadata.request_top_ps[:batch_size]
+
+        if self.seed is None:
+            self.seed = torch.tensor([0],
+                                     dtype=torch.int64,
+                                     device=logits.device)
+            self.offset = torch.tensor([0],
+                                       dtype=torch.int64,
+                                       device=logits.device)
+        self.seed += 1
+        self.seed %= (2**31)
+
+        draft_tokens, probs = sampling_batch_spec_dec_one_model_for_rejection(
+            logits,
+            temperatures,
+            top_ks,
+            top_ps,
+            seed=self.seed,
+            offset=self.offset,
+        )
+
+        # Scatter probs into the slot-indexed buffer (shaped
+        # [max_num_requests, max_draft_len, vocab_size]). Each request's data
+        # always lands at its stable py_seq_slot row regardless of batch
+        # composition shifts across iterations.
         assert spec_metadata.batch_slot_ids is not None, (
             "batch_slot_ids must be populated by "
             "populate_sampling_params_for_one_model before draft probs storage")
         batch_slots = spec_metadata.batch_slot_ids[:batch_size]
-        spec_metadata.draft_probs[batch_slots, :draft_tokens_per_request, :
-                                  vocab_size] = draft_probs_per_request
-        spec_metadata.draft_probs_last_dim = vocab_size
-        spec_metadata.draft_probs_valid = True
+        vocab = probs.shape[-1]
+        spec_metadata.draft_probs[batch_slots, draft_step, :vocab] = probs
+        spec_metadata.draft_probs_last_dim = vocab
+
+        if d2t is not None:
+            draft_tokens = d2t[draft_tokens] + draft_tokens
+
+        return draft_tokens.type(torch.int32)
 
     def _execute_guided_decoder_if_present(self, logits):
         """Execute guided decoder on target model logits if available."""
diff --git a/tensorrt_llm/_torch/speculative/one_model_sampler.py b/tensorrt_llm/_torch/speculative/one_model_sampler.py
@@ -5,9 +5,20 @@
 from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE
 
 if IS_FLASHINFER_AVAILABLE:
-    from flashinfer.sampling import chain_speculative_sampling, top_k_top_p_sampling_from_logits
+    from flashinfer.sampling import (
+        chain_speculative_sampling,
+        sampling_from_probs,
+        top_k_mask_logits,
+        top_k_top_p_sampling_from_logits,
+        top_p_renorm_probs,
+    )
+    from flashinfer.sampling import softmax as flashinfer_softmax
 else:
     chain_speculative_sampling = None
+    sampling_from_probs = None
+    flashinfer_softmax = None
+    top_k_mask_logits = None
+    top_p_renorm_probs = None
     top_k_top_p_sampling_from_logits = None
 
 
@@ -114,9 +125,29 @@ def compute_probs_from_logits(
     skip_temperature: bool = False,
 ) -> torch.Tensor:
     """
-    Compute probabilities from logits with temperature, top-k, and top-p applied.
+    Compute filtered + normalized probs from logits (temperature + top_k +
+    top_p + softmax). Picks the fastest path for the input device:
+
+    1. CUDA + flashinfer: ``top_k_mask_logits`` → fused ``softmax+temp`` →
+       ``top_p_renorm_probs`` (all O(N) radix). ``skip_temperature`` ignored.
+    2. CUDA, no flashinfer: ``compute_probs_from_logits_op`` (sort-based,
+       O(N log N)).
+    3. CPU: manual PyTorch fallback.
     """
+    if logits.is_cuda and IS_FLASHINFER_AVAILABLE:
+        # Fast path: flashinfer composition (O(N) per row, friendly to small
+        # batch sizes). skip_temperature is ignored — flashinfer's softmax
+        # always applies the temperature tensor.
+        if top_k is not None:
+            logits = top_k_mask_logits(logits, top_k)
+        probs = flashinfer_softmax(logits, temperatures)
+        if top_p is not None:
+            probs = top_p_renorm_probs(probs, top_p)
+        return probs
+
     if logits.is_cuda:
+        # CUDA without flashinfer: fall back to the C++ op (slower sort-based
+        # top-k path, but works without flashinfer).
         return torch.ops.trtllm.compute_probs_from_logits_op(
             logits, temperatures, top_k, top_p, skip_temperature
         )
@@ -125,7 +156,6 @@ def compute_probs_from_logits(
         logits = apply_temperature(logits, temperatures)
     logits = apply_top_k_top_p(logits, top_k, top_p)
     probs = logits.softmax(dim=-1, dtype=torch.float32)
-
     # Greedy rows should remain exactly one-hot so rejection sampling does not
     # spuriously reject numerically-near argmax tokens.
     greedy_temp_threshold = 1e-4
@@ -135,6 +165,29 @@ def compute_probs_from_logits(
     return torch.where(is_greedy.unsqueeze(1), one_hot, probs)
 
 
+def sampling_batch_spec_dec_one_model_for_rejection(
+    logits: torch.Tensor,
+    temperatures: torch.Tensor,
+    top_k: torch.Tensor,
+    top_p: torch.Tensor,
+    seed: Optional[torch.Tensor] = None,
+    offset: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Rejection-sampling-aware draft sampler: returns BOTH the sampled tokens
+    AND the prob distribution they were sampled from, so the downstream
+    rejection-sampling path can reuse the probs without a second softmax +
+    temp/top_k/top_p pass.
+    """
+    if sampling_from_probs is None:
+        raise RuntimeError(
+            "Rejection sampling for one-model speculative decoding requires flashinfer"
+        )
+    probs = compute_probs_from_logits(logits, temperatures, top_k, top_p)
+    tokens = sampling_from_probs(probs, deterministic=True, seed=seed, offset=offset)
+    return tokens, probs
+
+
 def rejection_sampling_one_model(
     draft_probs: torch.Tensor,
     draft_token_ids: torch.Tensor,