Luce-Org
diff --git a/dflash/scripts/server.py b/dflash/scripts/server.py
Lines changed: 75 additions & 11 deletions
diff --git a/dflash/scripts/test_server.py b/dflash/scripts/test_server.py
Lines changed: 58 additions & 0 deletions
@@ -696,7 +696,11 @@ def build_app(target: Path, draft: Path | None, bin_path: Path, budget: int, max
               verify_mode: str = "ddtree",
               extra_daemon_args: list[str] | None = None,
               lazy_draft: bool = False,
-              verbose_daemon: bool = False) -> FastAPI:
+              verbose_daemon: bool = False,
+              mtp_gguf: Path | None = None,
+              mtp_gamma: int = 3,
+              mtp_draft_source: str = "chain",
+              mtp_draft_topk: int = 1) -> FastAPI:
     import asyncio
     if _extra_daemon_has_target_sharding(extra_daemon_args):
         if prefix_cache_slots > 0 or prefill_cache_slots > 0:
@@ -753,6 +757,19 @@ async def _openai_compat_error_handler(_request: Request, exc: OpenAICompatError
         cmd = [bin_abs, str(target), "--daemon",
                f"--max-ctx={max_ctx}",
                f"--stream-fd={stream_fd_val}"]
+    elif mtp_gguf is not None:
+        # MTP mode: no --draft (MTP head lives inside target or mtp_gguf),
+        # no DFlash flags. Daemon dispatches to MTP code path via --mtp-gguf.
+        cmd = [bin_abs, str(target), "--daemon",
+               f"--max-ctx={max_ctx}",
+               f"--stream-fd={stream_fd_val}",
+               f"--mtp-gguf={mtp_gguf}",
+               f"--gamma={mtp_gamma}",
+               "--draft-source", mtp_draft_source]
+        if mtp_draft_source == "mtp_topk":
+            cmd.append(f"--draft-topk={mtp_draft_topk}")
+        if extra_daemon_args:
+            cmd.extend(extra_daemon_args)
     else:
         if draft is None:
             raise SystemExit("qwen35 arch requires --draft <draft.gguf|model.safetensors>")
@@ -999,6 +1016,8 @@ def _maybe_compress(msgs: list[dict], prompt_bin: Path, prompt_ids: list[int],
             pass
         return new_bin, new_ids
 
+    # HF vocab_size excludes added/special tokens; len(tokenizer) includes them, so use the larger bound.
+    _vocab_size: int = max(getattr(tokenizer, "vocab_size", 0) or 0, len(tokenizer) if hasattr(tokenizer, "__len__") else 0)
     def _token_stream(r, n_gen, timing=None):
         generated = 0
         hit_stop = False
@@ -1011,6 +1030,8 @@ def _token_stream(r, n_gen, timing=None):
                 if timing is not None:
                     timing["daemon_done"] = True
                 break
+            if _vocab_size and not (0 <= tok_id < _vocab_size):
+                continue
             if timing and timing.get("t_first_tok") is None:
                 timing["t_first_tok"] = time.monotonic()
             if hit_stop:
@@ -1048,6 +1069,8 @@ async def _astream_tokens(r, n_gen, timing=None):
                 if timing is not None:
                     timing["daemon_done"] = True
                 break
+            if _vocab_size and not (0 <= tok_id < _vocab_size):
+                continue
             if timing and timing.get("t_first_tok") is None:
                 timing["t_first_tok"] = time.monotonic()
             if hit_stop:
@@ -1413,9 +1436,15 @@ def emit_delta(text, kind):
                                         accumulated_content += pre
                                         out = emit_delta(pre, "content")
                                         if out: yield out
-                                        if which == "think":
+                                        if which == "think" and _thinking_enabled(req.chat_template_kwargs):
                                             window = window[idx + len(THINK_OPEN_TAG):]
                                             mode = "reasoning"
+                                        elif which == "think":
+                                            # thinking disabled — keep tag in content
+                                            accumulated_content += THINK_OPEN_TAG
+                                            out = emit_delta(THINK_OPEN_TAG, "content")
+                                            if out: yield out
+                                            window = window[idx + len(THINK_OPEN_TAG):]
                                         elif which == "think_close":
                                             window = window[idx + len(THINK_CLOSE_TAG):]
                                         else:
@@ -1594,10 +1623,11 @@ def emit_delta(text, kind):
             i = first_stop_match(text, stops)
             if i != -1:
                 text = text[:i]
-        # Parse reasoning and tool calls
-        thinking_enabled = True
-        if req.chat_template_kwargs:
-            thinking_enabled = req.chat_template_kwargs.get("enable_thinking", True)
+        # Parse reasoning and tool calls. Match the prompt-rendering default
+        # (enable_thinking=False) so that spontaneous <think> tags from Qwen3.6
+        # are kept in content instead of stripped into an empty message when
+        # the model runs out of tokens before emitting </think>.
+        thinking_enabled = _thinking_enabled(req.chat_template_kwargs)
         cleaned, tool_calls = parse_tool_calls(text, tools=req.tools)
         _remember_tool_call_text(text, tool_calls)
         cleaned, reasoning = parse_reasoning(
@@ -2230,9 +2260,7 @@ async def _responses_non_stream(
             except Exception: pass
 
         text = tokenizer.decode(tokens, skip_special_tokens=True)
-        thinking_enabled = True
-        if chat_req.chat_template_kwargs:
-            thinking_enabled = chat_req.chat_template_kwargs.get("enable_thinking", True)
+        thinking_enabled = _thinking_enabled(chat_req.chat_template_kwargs)
         cleaned, tool_calls = parse_tool_calls(text, tools=chat_req.tools)
         _remember_tool_call_text(text, tool_calls)
         cleaned, reasoning = parse_reasoning(
@@ -2420,9 +2448,16 @@ async def sse() -> AsyncIterator[str]:
                                         yield _resp_sse("response.output_text.delta", {
                                             "item_id": msg_item_id, "output_index": 0,
                                             "content_index": 0, "delta": pre})
-                                    if which == "think":
+                                    if which == "think" and _thinking_enabled(chat_req.chat_template_kwargs):
                                         window = window[idx + len(THINK_OPEN_TAG):]
                                         mode = "reasoning"
+                                    elif which == "think":
+                                        # thinking disabled — keep tag in content
+                                        accumulated_text += THINK_OPEN_TAG
+                                        yield _resp_sse("response.output_text.delta", {
+                                            "item_id": msg_item_id, "output_index": 0,
+                                            "content_index": 0, "delta": THINK_OPEN_TAG})
+                                        window = window[idx + len(THINK_OPEN_TAG):]
                                     elif which == "think_close":
                                         window = window[idx + len(THINK_CLOSE_TAG):]
                                     else:
@@ -2650,6 +2685,20 @@ def main():
                     help="Pass --draft-feature-mirror to test_dflash (safe cross-GPU feature path)")
     ap.add_argument("--peer-access", action="store_true",
                     help="Pass --peer-access to test_dflash (prefer P2P memcpy when available)")
+    # ── MTP (Multi-Token Prediction) speculator ──────────────────────────────
+    # When --mtp-gguf is set, the daemon runs MTP-head speculation instead of
+    # DFlash+DDTree. --draft is ignored (the MTP head is in the same GGUF as
+    # target, or a separate fused GGUF). Prefix-cache slots are auto-disabled
+    # in MTP mode because RESTORE does not snapshot MTP head KV yet.
+    ap.add_argument("--mtp-gguf", type=Path, default=None,
+                    help="Path to MTP-fused GGUF. When set, daemon runs MTP "
+                         "speculation; --draft and DFlash flags are ignored.")
+    ap.add_argument("--mtp-gamma", type=int, default=3,
+                    help="MTP chain depth (default 3; recommended D=3 per matrix bench)")
+    ap.add_argument("--mtp-draft-source", choices=["chain", "mtp_topk"], default="chain",
+                    help="MTP draft generation strategy (default chain)")
+    ap.add_argument("--mtp-draft-topk", type=int, default=1,
+                    help="Top-K for mtp_topk draft source (default 1, ignored for chain)")
     add_cli_flags(ap)
     args = ap.parse_args()
     prefill_cfg = config_from_args(args)
@@ -2695,6 +2744,17 @@ def main():
         # through the laguna daemon now, so --prefill-compression and
         # --prefix-cache-slots behave the same as on the qwen35 path.
         draft = None
+    elif args.mtp_gguf is not None:
+        # MTP mode: --draft is ignored; MTP head lives in the target (or in --mtp-gguf
+        # if separate). Force prefix/prefill cache off — RESTORE doesn't snapshot
+        # MTP head KV yet (planned for a follow-up PR).
+        if not args.mtp_gguf.is_file():
+            raise SystemExit(f"--mtp-gguf not found at {args.mtp_gguf}")
+        draft = None
+        if args.prefix_cache_slots > 0 or args.prefill_cache_slots > 0:
+            print("  [cfg] MTP mode: disabling prefix/prefill cache (MTP head KV snapshot not implemented)")
+            args.prefix_cache_slots = 0
+            args.prefill_cache_slots = 0
     else:
         draft = resolve_draft(args.draft) if args.draft.is_dir() else args.draft
         if not draft.is_file():
@@ -2726,7 +2786,11 @@ def main():
                     verify_mode=args.verify_mode,
                     extra_daemon_args=placement.daemon_args or None,
                     lazy_draft=args.lazy_draft,
-                    verbose_daemon=args.verbose_daemon)
+                    verbose_daemon=args.verbose_daemon,
+                    mtp_gguf=args.mtp_gguf,
+                    mtp_gamma=args.mtp_gamma,
+                    mtp_draft_source=args.mtp_draft_source,
+                    mtp_draft_topk=args.mtp_draft_topk)
 
     import uvicorn
     logging.basicConfig(
 
@@ -23,6 +23,7 @@ def mock_tokenizer():
     tokenizer.encode.return_value = [1]
     tokenizer.decode.return_value = "hello"
     tokenizer.apply_chat_template.return_value = "prompt"
+    tokenizer.vocab_size = 151936
     return tokenizer
 
 
@@ -939,3 +940,60 @@ def test_responses_instructions_and_developer_merged(mock_os_read, mock_pipe,
     assert len(system_msgs) == 1
     assert "Top-level instructions." in system_msgs[0]["content"]
     assert "Developer context." in system_msgs[0]["content"]
+
+
+# ─── out-of-range token filtering (OverflowError regression) ───────
+
+@patch("server.os.pipe")
+@patch("server.os.read")
+def test_out_of_range_token_non_streaming_returns_200(
+        mock_os_read, mock_pipe, mock_tokenizer, app):
+    """Daemon emits an out-of-range token id (-2) that is not the EOS
+    sentinel (-1).  Without filtering, tokenizer.decode([-2]) raises
+    OverflowError → 500.  After the fix the token is silently dropped and
+    the endpoint returns 200 with finish_reason "stop" rather than crashing."""
+    mock_pipe.return_value = (1, 2)
+    # Make decode raise for any out-of-range id (negative or >= 151936) to mirror HF tokenizer behaviour
+    def _decode(ids, **_kw):
+        if any(t < 0 or t >= 151936 for t in ids):
+            raise OverflowError("out of range integral type conversion attempted")
+        return "hello"
+    mock_tokenizer.decode.side_effect = _decode
+    # Daemon stream: bogus token (-2) then EOS sentinel (-1)
+    mock_os_read.side_effect = [struct.pack("<i", -2), struct.pack("<i", -1)]
+
+    client = TestClient(app)
+    response = client.post("/v1/chat/completions", json={
+        "model": MODEL_NAME,
+        "messages": [{"role": "user", "content": "hi"}],
+        "stream": False,
+    })
+
+    assert response.status_code == 200
+    data = response.json()
+    assert "choices" in data
+    assert data["choices"][0]["finish_reason"] == "stop"
+
+@patch("server.os.pipe")
+@patch("server.os.read")
+def test_out_of_range_token_streaming_returns_200(
+        mock_os_read, mock_pipe, mock_tokenizer, app):
+    """Streaming path: the out-of-range token (-2) is dropped; the response is 200 and still emits the [DONE] marker."""
+    mock_pipe.return_value = (1, 2)
+    def _decode(ids, **_kw):
+        if any(t < 0 or t >= 151936 for t in ids):
+            raise OverflowError("out of range integral type conversion attempted")
+        return ""
+    mock_tokenizer.decode.side_effect = _decode
+    mock_os_read.side_effect = [struct.pack("<i", -2), struct.pack("<i", -1)]
+
+    client = TestClient(app)
+    response = client.post("/v1/chat/completions", json={
+        "model": MODEL_NAME,
+        "messages": [{"role": "user", "content": "hi"}],
+        "stream": True,
+    })
+
+    assert response.status_code == 200
+    assert "data: [DONE]" in response.text