fix: correct misleading docstrings and rename _apply_guided_bitmask → _prepare_guided_bitmask

windreamer · windreamer · commit ecc143b09398 · 2026-06-05T09:52:29.000+08:00
- _apply_guided_bitmask: rename to _prepare_guided_bitmask and update
  the docstring to clarify that it only allocates and fills the bitmask;
  callers are responsible for applying it to the logits (Eagle3 first
  translates it to its draft vocabulary).
- _accept_guided_tokens: replace 'original grammar matchers' with
  'provided grammar matchers' since they are typically forked copies
  created by SpecModelAgent.
diff --git a/lmdeploy/pytorch/spec_decode/proposers/base.py b/lmdeploy/pytorch/spec_decode/proposers/base.py
@@ -68,12 +68,14 @@ def __init__(self, specdecode_config: SpecDecodeConfig, device: torch.device = N
         # Set by SpecModelAgent after construction
         self.guided_decoding_manager = None
 
-    async def _apply_guided_bitmask(self, logits: torch.Tensor,
+    async def _prepare_guided_bitmask(self, logits: torch.Tensor,
                                     guided_processors: dict | None) -> torch.Tensor | None:
-        """Apply guided-decoding bitmask to draft logits and return the
-        allocated bitmask (caller may need it for e.g. d2t translation).
+        """Allocate and fill a guided-decoding bitmask for draft logits.
 
-        If no guided processors are active, returns None.
+        Returns the filled bitmask tensor (or None if no guided processors are
+        active).  The caller is responsible for actually applying the bitmask to
+        logits — some proposers (e.g. Eagle3) may need to translate the bitmask
+        to their draft vocabulary first.
 
         CPU-bound xgrammar ``fill_bitmap`` calls are offloaded to a thread
         so they don't block the asyncio event loop.
@@ -92,7 +94,11 @@ def _fill():
 
     async def _accept_guided_tokens(self, draft_token_ids: torch.Tensor,
                                     guided_processors: dict | None):
-        """Accept draft tokens on the original grammar matchers.
+        """Accept draft tokens on the provided grammar matchers.
+
+        In speculative decoding the matchers are typically forks (created in
+        ``SpecModelAgent._async_model_forward``) of the originals, so this
+        method accepts on whichever matchers are passed in.
 
         CPU-bound xgrammar ``accept_token`` calls are offloaded to a thread
         so they don't block the asyncio event loop.
diff --git a/lmdeploy/pytorch/spec_decode/proposers/deepseek_mtp.py b/lmdeploy/pytorch/spec_decode/proposers/deepseek_mtp.py
@@ -32,7 +32,7 @@ async def get_outputs(self,
 
         logits = self.get_logits(hidden_states)[0]
 
-        guided_bitmask = await self._apply_guided_bitmask(logits, guided_processors)
+        guided_bitmask = await self._prepare_guided_bitmask(logits, guided_processors)
         if guided_bitmask is not None:
             self.guided_decoding_manager.apply_batched_bitmap(logits, guided_bitmask)
 
diff --git a/lmdeploy/pytorch/spec_decode/proposers/eagle3.py b/lmdeploy/pytorch/spec_decode/proposers/eagle3.py
@@ -113,7 +113,7 @@ async def get_outputs(self,
 
         logits = self.get_logits(hidden_states)[0]
 
-        guided_bitmask = await self._apply_guided_bitmask(logits, guided_processors)
+        guided_bitmask = await self._prepare_guided_bitmask(logits, guided_processors)
         if guided_bitmask is not None:
             draft_bitmask = self._translate_bitmask(guided_bitmask)
             self.guided_decoding_manager.apply_batched_bitmap(logits, draft_bitmask)