[#15022][fix] Guided decoding (xgrammar) + EAGLE-3 + draft_len_schedule reaching 0 crashes during CUDA graph capture, "bitmask must have the same batch size as logits" (#15023)

chungen04 · web-flow · commit 00ed78ca0f79 · 2026-06-11T16:10:47.000-07:00
Signed-off-by: chungen04 <b09901027@ntu.edu.tw>
diff --git a/tensorrt_llm/_torch/pyexecutor/guided_decoder.py b/tensorrt_llm/_torch/pyexecutor/guided_decoder.py
@@ -259,8 +259,12 @@ def _build(self, requests: GuidedRequests) -> List[Tuple[int, str]]:
                     matcher.fill_next_token_bitmask(self.bitmask_host, offset)
                     self.token_mask_host[offset] = 1
                     self.num_guided_tokens[slot] += 1
-                    # Process draft tokens
-                    for i, tid in enumerate(req.draft_tokens, 1):
+                    # Process draft tokens. Bound by the layout's draft length:
+                    # the new_tokens buffer always holds the static max, but only
+                    # `max_num_draft_tokens` slots are reserved this iteration.
+                    for i, tid in enumerate(
+                            req.draft_tokens[:requests.max_num_draft_tokens],
+                            1):
                         accepted = matcher.accept_token(tid)
                         if not accepted:
                             break
@@ -332,9 +336,13 @@ def _apply_bitmask(self,
             d2t=d2t)
 
     @nvtx_range("GuidedDecoder.add_batch")
-    def add_batch(self, scheduled_requests: ScheduledRequests) -> None:
+    def add_batch(self,
+                  scheduled_requests: ScheduledRequests,
+                  runtime_draft_len: Optional[int] = None) -> None:
+        num_draft_tokens = (self.max_num_draft_tokens
+                            if runtime_draft_len is None else runtime_draft_len)
         self.requests = GuidedRequests.from_scheduled_requests(
-            scheduled_requests, self.max_num_draft_tokens)
+            scheduled_requests, num_draft_tokens)
 
     @nvtx_range("GuideDecoder.build")
     def build(self) -> List[Tuple[int, str]]:
@@ -470,9 +478,14 @@ def __init__(self,
     @nvtx_range("GuidedDecoder.add_batch")
     def add_batch(self,
                   scheduled_requests: ScheduledRequests,
-                  new_tokens: Optional[torch.Tensor] = None) -> None:
+                  new_tokens: Optional[torch.Tensor] = None,
+                  runtime_draft_len: Optional[int] = None) -> None:
+        # See GuidedDecoder.add_batch: the layout must follow the runtime draft
+        # length so the captured graph's bitmask matches the target logits.
+        num_draft_tokens = (self.max_num_draft_tokens
+                            if runtime_draft_len is None else runtime_draft_len)
         self.requests = GuidedRequests.from_scheduled_requests(
-            scheduled_requests, self.max_num_draft_tokens)
+            scheduled_requests, num_draft_tokens)
         if new_tokens is not None:
             self.new_tokens.copy_(new_tokens.squeeze(-1), non_blocking=True)
         self.queue.put((self.requests, new_tokens is not None))
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -2639,8 +2639,10 @@ def _prepare_tp_inputs(
 
         # Must be before the update of py_batch_idx
         if self.guided_decoder is not None:
-            self.guided_decoder.add_batch(scheduled_requests,
-                                          new_tokens=new_tokens_device)
+            self.guided_decoder.add_batch(
+                scheduled_requests,
+                new_tokens=new_tokens_device,
+                runtime_draft_len=self.runtime_draft_len)
 
         if self._can_use_incremental_update(scheduled_requests,
                                             new_tokens_device,