[TRTLLM-12669][perf] Cache d2t target indices in spec metadata

zhaoyangwang-nvidia · zhaoyangwang-nvidia · commit beb4b32bb979 · 2026-06-03T18:42:20.000-07:00
The d2t-projected target vocab indices computed inside the rejection-path
d2t padding step (source = arange(draft_vocab_size); target_indices =
(source + d2t.to(device)) % vocab_size) were being rebuilt every iteration
even though the d2t tensor is model-static. Cache the result on
SpecMetadata.d2t_target_indices on first use and reuse it on subsequent
iterations.

Profile breakdown (llama70b bs=32, CUDA graph off) showed
accept_draft.rejection.d2t_padding at 88 us/iter — the second-largest
rejection-path step after compute target_probs (127 us). The index sequence
costs ~10-20 us of that (3-4 kernels: arange + d2t H2D copy + add + mod);
the rest is the slot-indexed scatter into full_draft_probs which is
already pre-allocated.

Verified on llama70b bs=32 over 3 rounds (mean ± stdev):
  Before: rej_on vs rej_off gap ≈ -10.0% (single-run baseline)
  After : rej_on vs rej_off gap = -8.71% ± 0.9% (3-round mean)
Net within-run improvement ≈ +1.3 percentage points (gap narrows from
-10.0% to -8.71%). qwen235b unchanged (already positive).
Output accuracy verified across 22 (model, bs, mode) configurations: all
1760 outputs terminate normally (EOT or max_tokens), no regressions.

Signed-off-by: ZhaoyangWang <zhaoyangw@nvidia.com>
diff --git a/tensorrt_llm/_torch/speculative/interface.py b/tensorrt_llm/_torch/speculative/interface.py
@@ -487,6 +487,11 @@ class SpecMetadata:
     # avoid a per-iter 64 MB zero-fill on the (max_num_requests, max_draft_len,
     # vocab_size) tensor. Shape: [max_num_requests, max_draft_len, vocab_size].
     full_draft_probs: Optional[torch.Tensor] = None
+    # Cached d2t-projected target vocab indices, computed once on first use
+    # (d2t is a model-static tensor). Replaces the per-iter
+    # arange + (source + d2t) % vocab_size kernel sequence inside the d2t
+    # padding step. Shape: [draft_vocab_size], dtype long.
+    d2t_target_indices: Optional[torch.Tensor] = None
 
     def __post_init__(self):
         pass
@@ -1052,8 +1057,8 @@ def _sample_and_accept_draft_tokens_base(
                                          device=logits.device)
 
         # Sample tokens using per-request sampling parameters
-        target_tokens = self._sample_tokens_for_batch(logits, spec_metadata,
-                                                      num_contexts, batch_size)
+        target_tokens = self._sample_tokens_for_batch(
+            logits, spec_metadata, num_contexts, batch_size)
 
         # Context requests: only accept the sampled token (no draft tokens yet)
         accepted_tokens[:num_contexts, 0] = target_tokens[:num_contexts]
@@ -1067,7 +1072,8 @@ def _sample_and_accept_draft_tokens_base(
         # Compare draft tokens with target tokens using cumulative product
         # Counts consecutive matches from the start
         num_accepted_tokens[num_contexts:] += torch.cumprod(
-            (draft_tokens == gen_target_tokens[:, :runtime_draft_len]).int(),
+            (draft_tokens
+             == gen_target_tokens[:, :runtime_draft_len]).int(),
             dim=-1).sum(1)
 
         # Apply force override if set
@@ -1176,9 +1182,8 @@ def _sample_and_accept_draft_tokens_rejection(
 
             target_probs_flat = compute_probs_from_logits(
                 gen_logits, temperatures, top_ks, top_ps)
-            target_probs = target_probs_flat.reshape(num_gens,
-                                                     runtime_draft_len + 1,
-                                                     vocab_size)
+            target_probs = target_probs_flat.reshape(
+                num_gens, runtime_draft_len + 1, vocab_size)
 
             draft_vocab_size = draft_probs.shape[-1]
             assert draft_probs.shape[0] == num_gens, (
@@ -1190,11 +1195,13 @@ def _sample_and_accept_draft_tokens_rejection(
             d2t = getattr(spec_metadata, "d2t", None)
             if draft_vocab_size != vocab_size:
                 # Use the pre-allocated buffer from spec_metadata.prepare()
-                # (zero-filled once at init; untouched positions stay 0). Falls
-                # back to per-iter allocation if the buffer is not configured,
-                # e.g. when use_rejection_sampling was off at prepare() time.
+                # (zero-filled once at init; untouched positions stay 0).
+                # Falls back to per-iter allocation if the buffer is not
+                # configured, e.g. when use_rejection_sampling was off at
+                # prepare() time.
                 if spec_metadata.full_draft_probs is not None:
-                    full_draft_probs = spec_metadata.full_draft_probs[:num_gens]
+                    full_draft_probs = spec_metadata.full_draft_probs[:
+                                                                      num_gens]
                 else:
                     full_draft_probs = torch.zeros(
                         (num_gens, runtime_draft_len, vocab_size),
@@ -1204,11 +1211,17 @@ def _sample_and_accept_draft_tokens_rejection(
                     assert d2t.numel() == draft_vocab_size, (
                         f"d2t size mismatch: {d2t.numel()} != {draft_vocab_size}"
                     )
-                    d2t = d2t.to(device=device)
-                    source_indices = torch.arange(draft_vocab_size,
-                                                  device=device,
-                                                  dtype=torch.long)
-                    target_indices = (source_indices + d2t) % vocab_size
+                    # d2t is model-static; compute target_indices once and
+                    # cache on spec_metadata to skip the arange + add + mod
+                    # kernel sequence on every iter.
+                    target_indices = spec_metadata.d2t_target_indices
+                    if target_indices is None:
+                        source_indices = torch.arange(draft_vocab_size,
+                                                      device=device,
+                                                      dtype=torch.long)
+                        target_indices = (source_indices +
+                                          d2t.to(device=device)) % vocab_size
+                        spec_metadata.d2t_target_indices = target_indices
                     full_draft_probs[:, :runtime_draft_len,
                                      target_indices] = draft_probs
                 else: