fix(distributed): use apply_chat_template(tokenize=True, return_dict=False) for cross-version token ids

cursoragent · FluffyAIcode · cursoragent · commit 372c30f84cd1 · 2026-06-19T11:00:56.000Z
transformers 5.x returns a dict {input_ids, attention_mask} from
apply_chat_template by default -> verifier.prefill got dict keys and raised
"'str' object cannot be interpreted as an integer" on the Mac (5.x). Adopt the
proven kv_cache_proposer.proposer.encode_chat convention (return_dict=False) in
the distributed integration test fixture AND the demo, replacing the demo's
ad-hoc coercion. Both distributed integration tests pass locally (2 passed).

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
diff --git a/scripts/demo_distributed_spec_decode.py b/scripts/demo_distributed_spec_decode.py
@@ -174,24 +174,17 @@ async def _run_verifier_node(args: argparse.Namespace) -> int:
     # Echo-style answers are where the n-gram proposer shines; Qwen3's thinking
     # preamble is novel text the lookup cannot draft. Templates without the
     # variable ignore it harmlessly.
-    # transformers 4.x returns token ids from apply_chat_template(tokenize=True);
-    # 5.x can return a string (or a BatchEncoding) — coerce to a flat List[int]
-    # so this works across both (the Mac engine needs transformers 5.x).
-    _templated = verifier.tokenizer.apply_chat_template(
+    # transformers 5.x returns a dict by default with tokenize=True; request the
+    # legacy flat list-of-ids shape (return_dict=False) so this works on both
+    # 4.x and 5.x — same convention as kv_cache_proposer.proposer.encode_chat
+    # (the Mac engine runs transformers 5.x).
+    prompt_ids = verifier.tokenizer.apply_chat_template(
         [{"role": "user", "content": args.prompt}],
-        add_generation_prompt=True, tokenize=True,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=False,
         enable_thinking=args.enable_thinking,
     )
-    if isinstance(_templated, str):
-        prompt_ids = list(verifier.tokenizer.encode(_templated))
-    else:
-        if hasattr(_templated, "input_ids"):
-            _templated = _templated.input_ids
-        if hasattr(_templated, "tolist"):
-            _templated = _templated.tolist()
-        if _templated and isinstance(_templated[0], (list, tuple)):
-            _templated = _templated[0]
-        prompt_ids = [int(x) for x in _templated]
 
     # --- 4. Greedy baseline (same verifier, local only) --------------
     t0 = time.perf_counter()
diff --git a/tests/integration/test_distributed_spec_decode_real.py b/tests/integration/test_distributed_spec_decode_real.py
@@ -71,9 +71,14 @@ def verifier() -> SinkWindowVerifier:
 
 @pytest.fixture(scope="module")
 def prompt_ids(verifier) -> List[int]:
+    # transformers 5.x returns a dict by default with tokenize=True; request the
+    # legacy flat list-of-ids shape so it matches on 4.x and 5.x (same convention
+    # as kv_cache_proposer.proposer.encode_chat).
     return verifier.tokenizer.apply_chat_template(
         [{"role": "user", "content": PROMPT}],
         add_generation_prompt=True,
+        tokenize=True,
+        return_dict=False,
     )