diff --git a/src/runtime/rkllm_runtime.py b/src/runtime/rkllm_runtime.py
index f3fbe35..7c440e7 100644
--- a/src/runtime/rkllm_runtime.py
+++ b/src/runtime/rkllm_runtime.py
@@ -13,10 +13,24 @@
- try_load(): factory; returns None on any failure so app.py's
lifespan falls back to MockBackend cleanly.
-The chat template + tool-call grammar is Qwen 2.5's native
-function-calling format. Qwen emits tool calls wrapped in
-{"name":"...","arguments":{...}}; we append
-results back as {...}. The runbook +
+Chat template: Qwen 2.5 + Qwen 3 share the same ChatML envelope
+(<|im_start|>{role}\\n{content}<|im_end|>). Qwen 3 adds hybrid
+thinking mode — the assistant turn is prefixed with `<think>\\n`,
+the model emits chain-of-thought, then `</think>`, then the
+structured answer. We detect Qwen 3 from the model filename and:
+ 1. Inject the `<think>\\n` prefix into the assistant turn so the
+ model always reasons before answering (highest-intelligence mode).
+ 2. Strip the `<think>...</think>` content from history BEFORE the
+ next turn — keeps KV cache bounded as the tool-call loop runs.
+ This matches the Qwen 3 model-card guidance: "historical model
+ output should only include the final output, not the thinking".
+ 3. Keep the think content OUT of the SSE `thought` event as well:
+ only the post-think prose is streamed, and a short synthetic
+ marker is emitted when that prose is empty.
+
+Tool-call grammar (Qwen-family-agnostic): tool calls wrapped in
+<tool_call>{"name":"...","arguments":{...}}</tool_call>; results
+appended back as <tool_response>{...}</tool_response>. Runbook +
tool definitions come in via system prompt at session start.
"""
from __future__ import annotations
@@ -44,7 +58,7 @@
DEFAULT_SO_PATH = "/lib/librkllmrt.so"
DEFAULT_FALLBACK_SO_PATH = "/app/vendor/rkllm/librkllmrt.so"
DEFAULT_MODEL_DIR = "/uniondrive/blox-ai/model"
-DEFAULT_MODEL_FILENAME = "qwen2.5-3b-instruct-rk3588-w8a8.rkllm"
+DEFAULT_MODEL_FILENAME = "qwen3-1.7b-rk3588-w8a8.rkllm"
# ---------------------------------------------------------------------------
@@ -272,7 +286,13 @@ def _on_token(self, result_ptr, userdata, state):
def init_model(
self,
max_context_len: int = 8192,
- max_new_tokens: int = 2048,
+ # max_new_tokens raised to 3072 with the Qwen 3 1.7B + thinking-
+ # mode swap (advisor catch). Thinking blocks empirically run
+ # 500-1500 tokens; the structured response adds another 200-500.
+ # The prior 2048 was tight enough to truncate mid-verdict on
+ # hard prompts, which manifests as no `</think>` in the output
+ # → strip_think returns empty → user sees nothing useful.
+ max_new_tokens: int = 3072,
temperature: float = 0.6,
top_k: int = 20,
top_p: float = 0.8,
@@ -500,18 +520,99 @@ def _build_system_prompt(runbook_text: str = "", max_runbook_chars: int = 2000)
return SYSTEM_PROMPT_TEMPLATE.format(tool_list=tool_list, runbook_excerpt=excerpt)
-# Qwen 2.5 chat template tokens
+# Qwen 2.5 + Qwen 3 chat template tokens (identical ChatML envelope)
_QWEN_IM_START = "<|im_start|>"
_QWEN_IM_END = "<|im_end|>"
-
-def _build_chat_prompt(system: str, history: list[dict]) -> str:
- """Format using Qwen 2.5's ChatML template. history is a list of
- {role: user|assistant|tool, content: str} dicts."""
+# Qwen 3 hybrid-thinking sentinels — literal text the model emits when
+# thinking mode is on. Per Qwen 3 model card / tokenizer_config.json,
+# `<think>` / `</think>` are NOT special tokens; they pass through
+# verbatim even with skip_special_token=True. Verify at first lab
+# inference (log the raw generate() output once on the new .rkllm to
+# confirm the tags arrive verbatim, since _strip_think keys on them).
+_THINK_OPEN = "<think>"
+_THINK_CLOSE = "</think>"
+_THINK_RE = re.compile(r"<think>.*?</think>", re.DOTALL)
+
+
+def _is_qwen3_model(model_path: Optional[str]) -> bool:
+    """Return True when the model file's basename names a Qwen 3 build.
+
+    The check is a case-insensitive substring match for `qwen3` or
+    `qwen-3` on the basename only, so directory names never influence
+    the result. A missing or empty path yields False, which keeps a
+    device that still resolves to an older cached model on the legacy
+    non-thinking prompt format.
+
+    The decision is made from the resolved model path alone; no
+    tokenizer metadata is consulted here."""
+    basename = os.path.basename(model_path).lower() if model_path else ""
+    markers = ("qwen3", "qwen-3")
+    return any(marker in basename for marker in markers)
+
+
+def _strip_think(text: str) -> str:
+    """Return the post-think part of a Qwen 3 thinking-mode output.
+
+    Only call this when the prompt ended with the `<think>\\n`
+    assistant prefix: `text` then begins INSIDE an already-open think
+    block, runs through chain-of-thought prose, reaches `</think>`,
+    and only afterwards carries the structured answer. Everything
+    after the FIRST `</think>` is returned.
+
+    Edge cases:
+    - TRUNCATED (no `</think>` at all): generation stopped
+      mid-thought, so the whole text is internal reasoning. The
+      empty string is returned.
+    - SELF-WRAPPED (a further `<think>...</think>` pair appears
+      after the first close): every such pair is removed from the
+      returned tail.
+    - TRAILING-OPEN (a `<think>` with no matching close remains in
+      the tail): the tail is cut at that orphan open tag so partial
+      reasoning does not bleed into history.
+
+    Splitting on the first close tag (rather than the last) keeps
+    any answer content that sits between several closing tags."""
+    # partition() leaves the separator slot empty when the close tag
+    # never occurred, i.e. the truncated case.
+    _reasoning, close_tag, answer = text.partition(_THINK_CLOSE)
+    if not close_tag:
+        return ""
+    # Remove complete think pairs the model opened after the close.
+    answer = _THINK_RE.sub("", answer)
+    # Cut at an orphan open tag, if one is left over.
+    orphan_at = answer.find(_THINK_OPEN)
+    if orphan_at == -1:
+        return answer
+    return answer[:orphan_at]
+
+
+def _build_chat_prompt(
+ system: str,
+ history: list[dict],
+ enable_thinking: bool = False,
+) -> str:
+ """Format using Qwen 2.5 / Qwen 3 ChatML template. history is a
+ list of {role: user|assistant|tool, content: str} dicts.
+
+ enable_thinking=True (Qwen 3 path): inject the `<think>\\n` prefix
+ into the assistant turn so the model starts inside the think block.
+ Matches `apply_chat_template(enable_thinking=True)` from the
+ Hugging Face tokenizer config — required for highest-intelligence
+ mode on Qwen 3."""
parts = [f"{_QWEN_IM_START}system\n{system}{_QWEN_IM_END}"]
for msg in history:
parts.append(f"{_QWEN_IM_START}{msg['role']}\n{msg['content']}{_QWEN_IM_END}")
- parts.append(f"{_QWEN_IM_START}assistant\n")
+ assistant_prefix = f"{_QWEN_IM_START}assistant\n"
+ if enable_thinking:
+ # Tokenizer template emits exactly `<think>\n` (with trailing
+ # newline) immediately after the role marker's newline. Match
+ # that byte-for-byte. If this drifts from the HF template by a
+ # whitespace character the model is mildly confused but still
+ # functional — verify on lab by inspecting one full prompt
+ # before generate().
+ assistant_prefix += f"{_THINK_OPEN}\n"
+ parts.append(assistant_prefix)
return "\n".join(parts)
@@ -754,13 +855,22 @@ def strip_blocks(raw_text: str) -> str:
@dataclass
class RKLLMBackend:
- """Production backend: real Qwen 2.5 3B + tool-call loop.
+ """Production backend: real Qwen 2.5 / Qwen 3 + tool-call loop.
Construct via try_load() which injects the executor + signer hooks
so the backend can run diag tools inline and mint real HMAC tokens
for recommended_action events. The bridge in tool_call_loop.py
detects `consumes_tool_results=True` and skips its own tool_call
- interception (the backend handles it end-to-end)."""
+ interception (the backend handles it end-to-end).
+
+ Qwen 3 thinking mode is wired automatically based on the model's
+ filename (see _is_qwen3_model). When ON:
+ - assistant prefix gets `<think>\\n` injected (model thinks first)
+ - SSE thought event receives only the post-think prose; think
+ content is hidden from the stream as well as from history
+ - history entries strip <think>...</think> before next turn
+ (KV cache stays bounded across the tool-call loop)
+ """
name: str = "rkllm"
loaded: bool = False
@@ -770,6 +880,9 @@ class RKLLMBackend:
_tool_executor: Optional[Callable[[str, dict], Awaitable[dict]]] = None
_action_signer: Optional[Callable[[str], str]] = None
_runbook_loader: Optional[Any] = None
+ # Set by try_load() from the resolved model_path. Controls assistant-
+ # prefix injection + per-turn history rewriting.
+ _enable_thinking: bool = False
# Tells the bridge: don't intercept tool_call events; we handle them.
consumes_tool_results: bool = True
@@ -851,7 +964,9 @@ async def run_troubleshoot(
history.append({"role": "user", "content": FORCE_VERDICT_DIRECTIVE})
force_verdict_attempted = True
- full_prompt = _build_chat_prompt(system_prompt, history)
+ full_prompt = _build_chat_prompt(
+ system_prompt, history, enable_thinking=self._enable_thinking,
+ )
try:
output = await loop.run_in_executor(
@@ -876,19 +991,46 @@ async def run_troubleshoot(
}
return
- # Track this turn's assistant output in conversation history.
- history.append({"role": "assistant", "content": output})
+ # Qwen 3 thinking mode: the raw output starts INSIDE a <think>
+ # block (because we prepended `<think>\n` to the prefix). The
+ # structured response (tool calls, verdict, recommendations)
+ # only exists AFTER `</think>`. Pre-strip so:
+ # - parsers can't be tripped by stray XML mentions inside
+ # the model's reasoning prose ("I should call <tool_call>")
+ # - history stores the bounded post-think form (Qwen 3 model-
+ # card guidance: "historical output should not include the
+ # thinking")
+ # The SSE thought event below is built from the same stripped
+ # text, so chain-of-thought is not streamed to the user either.
+ output_for_parsing = (
+ _strip_think(output) if self._enable_thinking else output
+ )
- # Surface the model's prose as a thought event
- thought_text = strip_blocks(output)
+ # Track this turn's assistant output in conversation history.
+ # KV cache only sees the stripped form on subsequent turns.
+ history.append({"role": "assistant", "content": output_for_parsing})
+
+ # Surface the model's POST-THINK prose as a thought event.
+ # User preference (literal): hide <think> content from UI
+ # too, not just from KV. We feed strip_blocks the already-
+ # de-thinked text so chain-of-thought reasoning never reaches
+ # the SSE stream. If the post-think prose is empty (turn
+ # consisted only of structured blocks), emit a short synthetic
+ # marker so the stream isn't silent on slow BLE transports.
+ thought_text = strip_blocks(output_for_parsing)
if thought_text:
# SSE thought schema: minLength 1, maxLength 4000
yield {"type": "thought", "payload": thought_text[:4000]}
+ elif self._enable_thinking:
+ # Qwen 3 turn was 100% structured output after </think>;
+ # avoid a silent stretch by emitting a tiny marker.
+ yield {"type": "thought", "payload": "Analyzing diagnostics..."}
- # Parse blocks
- verdict = parse_verdict(output)
- recommendations = parse_recommendations(output)
- tool_calls = parse_tool_calls(output)
+ # Parse blocks from the post-think text so XML mentions inside
+ # reasoning prose can't pollute the parse results.
+ verdict = parse_verdict(output_for_parsing)
+ recommendations = parse_recommendations(output_for_parsing)
+ tool_calls = parse_tool_calls(output_for_parsing)
# Run each tool call inline + feed result back as tool_response
if tool_calls and self._tool_executor is not None:
@@ -1043,6 +1185,19 @@ def try_load(model_path_override: Optional[str] = None) -> Optional[RKLLMBackend
except (RKLLMLoadError, OSError) as e:
logger.warning("RKLLM init failed: %s; MockBackend stays wired", e)
return None
- backend = RKLLMBackend(loaded=True, _runtime=runtime)
- logger.info("RKLLMBackend loaded (so=%s, model=%s)", so_path, model_path)
+ # Qwen 3 thinking mode is filename-gated so devices that still have
+ # an old Qwen 2.5 cached (Qwen 3 download not yet completed) continue
+ # using the legacy non-thinking format. The download_model.sh cleanup
+ # logic removes the stale 1.5B AFTER the new Qwen 3 SHA verifies, so
+ # this detection flips on automatically once the new file lands.
+ enable_thinking = _is_qwen3_model(model_path)
+ backend = RKLLMBackend(
+ loaded=True,
+ _runtime=runtime,
+ _enable_thinking=enable_thinking,
+ )
+ logger.info(
+ "RKLLMBackend loaded (so=%s, model=%s, thinking=%s)",
+ so_path, model_path, enable_thinking,
+ )
return backend
diff --git a/tests/test_rkllm_runtime.py b/tests/test_rkllm_runtime.py
index 1b7931d..4c3007d 100644
--- a/tests/test_rkllm_runtime.py
+++ b/tests/test_rkllm_runtime.py
@@ -626,6 +626,309 @@ async def collect():
assert any(FORCE_VERDICT_DIRECTIVE in p for p in prompts_seen)
+# ---------------------------------------------------------------------------
+# Qwen 3 thinking-mode swap (2026-05-26)
+# ---------------------------------------------------------------------------
+
+def test_default_model_filename_is_qwen3():
+    """Pin the in-container default model filename to the Qwen 3 file.
+
+    Guards against an accidental revert to a non-Qwen-3 default."""
+    from src.runtime.rkllm_runtime import DEFAULT_MODEL_FILENAME
+    assert DEFAULT_MODEL_FILENAME == "qwen3-1.7b-rk3588-w8a8.rkllm"
+
+
+def test_is_qwen3_model_detects_canonical_filename():
+    """Canonical, hyphenated and mixed-case Qwen 3 names all match."""
+    from src.runtime.rkllm_runtime import _is_qwen3_model as is_qwen3
+    canonical = "/uniondrive/model/qwen3-1.7b-rk3588-w8a8.rkllm"
+    hyphenated = "/uniondrive/model/qwen-3-1.7b-rk3588-w8a8.rkllm"
+    for path in (canonical, hyphenated, "/path/Qwen3-1.7B-rk3588.rkllm"):
+        assert is_qwen3(path) is True
+
+
+def test_is_qwen3_model_rejects_qwen_2_5():
+    """Rollout safety: non-Qwen-3 filenames must leave thinking mode off.
+
+    A device may still resolve to a cached Qwen 2.5 file while the
+    Qwen 3 download is pending; such a model must not be given the
+    `<think>` prefix. None and "" are rejected as well."""
+    from src.runtime.rkllm_runtime import _is_qwen3_model
+    rejected = (
+        "/path/qwen2.5-1.5b-instruct-rk3588-w8a8.rkllm",
+        "/path/qwen2.5-3b-instruct-rk3588-w8a8.rkllm",
+        "/path/deepseek-llm-7b-chat.rkllm", None, "",
+    )
+    assert all(_is_qwen3_model(p) is False for p in rejected)
+
+
+def test_strip_think_drops_full_block():
+    """Standard shape: the assistant prefix injected `<think>\\n` so the
+    raw output starts inside the think block. After the first `</think>`
+    is the structured response."""
+    from src.runtime.rkllm_runtime import _strip_think
+    raw = (
+        "Let me reason about this for a moment. The user said disconnected.\n"
+        "First I should check heartbeat.\n</think>\n\n"
+        '<tool_call>{"name":"diag/summary","arguments":{}}</tool_call>'
+    )
+    out = _strip_think(raw)
+    assert "<tool_call>" in out
+    assert "Let me reason about this" not in out
+    assert "</think>" not in out
+
+
+def test_strip_think_returns_empty_when_truncated_mid_think():
+    """Truncated generation: no `</think>` was ever emitted.
+
+    The whole output is internal reasoning with nothing structured to
+    surface, so _strip_think must return the empty string."""
+    from src.runtime.rkllm_runtime import _strip_think
+    cut_off_reasoning = "".join((
+        "Let me think step by step. The user reports a slow connection.\n",
+        "Possible causes include kubo, ipfs_cluster, or wireguard. I should",
+    ))
+    assert _strip_think(cut_off_reasoning) == ""
+
+
+def test_strip_think_handles_self_wrapped_pair_after_main_close():
+    """Defensive: model emits its own `<think>X</think>` block in the
+    middle of the structured response (e.g., changes its mind). Must
+    sub out the inner pair too."""
+    from src.runtime.rkllm_runtime import _strip_think
+    raw = (
+        "reasoning prose\n</think>\n"
+        '<tool_call>{"name":"diag/internet","arguments":{}}</tool_call>\n'
+        "<think>actually, let me also check time</think>\n"
+        '<tool_call>{"name":"diag/time","arguments":{}}</tool_call>'
+    )
+    out = _strip_think(raw)
+    assert "<think>" not in out
+    assert "</think>" not in out
+    assert "diag/internet" in out
+    assert "diag/time" in out
+
+
+def test_strip_think_drops_trailing_unclosed_open():
+    """Model started a new think block near the end of its turn but ran
+    out of tokens before closing. Drop from the orphan `<think>` to end
+    of string so the partial reasoning doesn't bleed into history."""
+    from src.runtime.rkllm_runtime import _strip_think
+    raw = (
+        "reasoning\n</think>\n"
+        '<tool_call>{"name":"diag/summary","arguments":{}}</tool_call>\n'
+        "<think>wait, let me also"
+    )
+    out = _strip_think(raw)
+    assert "<tool_call>" in out
+    assert "<think>" not in out
+    assert "wait, let me also" not in out
+
+
+def test_build_chat_prompt_enable_thinking_injects_think_prefix():
+    """With enable_thinking=True the assistant prefix gets `<think>\\n`
+    appended so the model starts inside the think block. This matches
+    `apply_chat_template(enable_thinking=True)` from the HF tokenizer."""
+    from src.runtime.rkllm_runtime import _build_chat_prompt
+    p = _build_chat_prompt(
+        system="SYS",
+        history=[{"role": "user", "content": "diagnose"}],
+        enable_thinking=True,
+    )
+    assert p.endswith("<|im_start|>assistant\n<think>\n")
+
+
+def test_build_chat_prompt_default_no_think_prefix():
+    """Default (Qwen 2.5 legacy path): no `<think>` prefix injected.
+    Critical for rollout safety — devices on old cached models must
+    not get the prefix because their tokenizer doesn't know about it."""
+    from src.runtime.rkllm_runtime import _build_chat_prompt
+    p = _build_chat_prompt(system="SYS", history=[{"role": "user", "content": "x"}])
+    assert p.endswith("<|im_start|>assistant\n")
+    assert "<think>" not in p
+
+
+def test_try_load_sets_enable_thinking_for_qwen3_model():
+    """try_load() on a Qwen 3 filename yields a thinking-mode backend.
+
+    Covers the wiring from the resolved model path to the backend's
+    `_enable_thinking` flag, which run_troubleshoot reads per turn."""
+    from src.runtime import rkllm_runtime as mod
+    stub_lib = MagicMock()
+    stub_lib.rkllm_init = MagicMock(return_value=0)
+    stub_lib.rkllm_destroy = MagicMock(return_value=0)
+    model = "/uniondrive/model/qwen3-1.7b-rk3588-w8a8.rkllm"
+    with patch.object(mod, "find_so_path", return_value="/fake/lib.so"), \
+            patch.object(mod, "find_model_path", return_value=model), \
+            patch("ctypes.CDLL", return_value=stub_lib):
+        loaded = mod.try_load()
+    assert loaded is not None
+    assert loaded._enable_thinking is True
+
+
+def test_try_load_leaves_thinking_off_for_qwen_2_5_model():
+    """Rollout-safety regression: a Qwen 2.5 filename keeps thinking OFF.
+
+    A device whose resolved model is still the old cached file must
+    not have the `<think>\\n` prefix injected into its prompts."""
+    from src.runtime import rkllm_runtime as mod
+    stub_lib = MagicMock()
+    stub_lib.rkllm_init = MagicMock(return_value=0)
+    stub_lib.rkllm_destroy = MagicMock(return_value=0)
+    model = "/uniondrive/model/qwen2.5-1.5b-instruct-rk3588-w8a8.rkllm"
+    with patch.object(mod, "find_so_path", return_value="/fake/lib.so"), \
+            patch.object(mod, "find_model_path", return_value=model), \
+            patch("ctypes.CDLL", return_value=stub_lib):
+        loaded = mod.try_load()
+    assert loaded is not None
+    assert loaded._enable_thinking is False
+
+
+def test_run_troubleshoot_strips_think_from_history_in_qwen3_mode():
+    """The whole point of the Qwen 3 swap: per-turn `<think>` content
+    must NOT accumulate in KV cache across the tool-call loop. Verify
+    by inspecting the prompt sent on turn N+1 — it should NOT contain
+    turn N's chain-of-thought, only the post-`</think>` structured
+    output (tool calls + prose)."""
+    import asyncio
+    from src.runtime.rkllm_runtime import RKLLMBackend
+
+    turn_outputs = [
+        # Turn 0: think + tool call. Output starts INSIDE think block
+        # (prefix already injected via prompt).
+        ("CoT-TURN-0-INTERNAL: I should check overall system state.\n</think>\n"
+         '<tool_call>{"name":"diag/summary","arguments":{}}</tool_call>'),
+        # Turn 1: think + verdict (<verdict> tag name assumed — TODO
+        # confirm against parse_verdict). CoT-TURN-0 must be GONE.
+        ("CoT-TURN-1-INTERNAL: Everything looks fine.\n</think>\n"
+         '<verdict>{"summary":"healthy","severity":"green","root_cause":"n_a"}</verdict>'),
+    ]
+    turn_idx = {"i": 0}
+    prompts_seen = []
+
+    class FakeRuntime:
+        def generate(self, prompt, timeout_s=90.0):
+            prompts_seen.append(prompt)
+            i = turn_idx["i"]
+            turn_idx["i"] = i + 1
+            return turn_outputs[i] if i < len(turn_outputs) else "(end)"
+
+    async def fake_executor(tool, args):
+        return {"overall": "green", "subsystems": {}}
+
+    backend = RKLLMBackend(
+        loaded=True, _runtime=FakeRuntime(), _enable_thinking=True,
+    )
+    backend.wire_runtime_deps(
+        tool_executor=fake_executor, action_signer=lambda x: "f" * 64,
+    )
+
+    async def collect():
+        return [ev async for ev in backend.run_troubleshoot("x", session_id="sid")]
+
+    asyncio.run(collect())
+
+    # Turn 1's prompt MUST NOT contain turn 0's internal CoT — that's
+    # the entire point of the KV-bloat-safe history rewrite.
+    assert len(prompts_seen) >= 2, "Expected at least two turns"
+    turn_1_prompt = prompts_seen[1]
+    assert "CoT-TURN-0-INTERNAL" not in turn_1_prompt, (
+        "Per-turn <think> content leaked into next turn's prompt — "
+        "history rewrite is broken; KV cache will bloat across the "
+        "tool-call loop. This is the regression Qwen 3 model-card "
+        "guidance explicitly warns against."
+    )
+    # The structured tool_call from turn 0 MUST still be in turn 1's
+    # prompt (otherwise the model loses context of what it called).
+    assert "diag/summary" in turn_1_prompt
+
+
+def test_run_troubleshoot_hides_think_content_from_thought_event():
+    """User preference (literal): 'if <think> process can be hidden ...
+    it is preferred to hide it'. Chain-of-thought must NOT reach the SSE
+    stream — only the post-think prose (and structured tool/verdict
+    events) are user-visible.
+
+    Verifies the strip happens at the SSE boundary AND that when the
+    post-think prose is empty (turn was 100% structured output), a
+    short synthetic marker is emitted so the stream isn't silent."""
+    import asyncio
+    from src.runtime.rkllm_runtime import RKLLMBackend
+
+    class FakeRuntime:
+        def generate(self, prompt, timeout_s=90.0):
+            # Turn output: CoT inside the think block, then ONLY a verdict
+            # block (<verdict> tag assumed — TODO confirm vs parse_verdict).
+            # No post-think prose → synthetic "Analyzing..." marker expected.
+            return (
+                "CHAIN_OF_THOUGHT_CONTENT_SHOULD_NEVER_REACH_SSE\n</think>\n"
+                '<verdict>{"summary":"x","severity":"green","root_cause":"y"}</verdict>'
+            )
+
+    backend = RKLLMBackend(
+        loaded=True, _runtime=FakeRuntime(), _enable_thinking=True,
+    )
+    backend.wire_runtime_deps(
+        tool_executor=None, action_signer=lambda x: "f" * 64,
+    )
+
+    async def collect():
+        return [ev async for ev in backend.run_troubleshoot("x")]
+
+    events = asyncio.run(collect())
+    thoughts = [e for e in events if e["type"] == "thought"]
+    # Hard requirement: think content never appears in ANY thought event
+    for t in thoughts:
+        assert "CHAIN_OF_THOUGHT_CONTENT_SHOULD_NEVER_REACH_SSE" not in t["payload"], (
+            f"Think content leaked into SSE thought event — user-visible: "
+            f"{t['payload']!r}"
+        )
+    # Non-silent stream: a synthetic marker should fill the gap when
+    # the post-think prose is empty
+    assert any("Analyzing" in t["payload"] for t in thoughts), (
+        "Expected synthetic 'Analyzing diagnostics...' marker when "
+        "post-think prose is empty"
+    )
+
+
+def test_run_troubleshoot_surfaces_post_think_prose_when_present():
+    """Sibling test: when the model DOES emit non-block prose after
+    </think>, that prose reaches the SSE thought event verbatim.
+    Confirms we strip think content but not the user-visible prose the
+    model offers after. (<verdict> tag name assumed — TODO confirm.)"""
+    import asyncio
+    from src.runtime.rkllm_runtime import RKLLMBackend
+
+    class FakeRuntime:
+        def generate(self, prompt, timeout_s=90.0):
+            return (
+                "HIDDEN_COT_CONTENT\n</think>\n"
+                "VISIBLE_PROSE_AFTER_THINK that explains what's happening.\n"
+                '<verdict>{"summary":"x","severity":"green","root_cause":"y"}</verdict>'
+            )
+
+    backend = RKLLMBackend(
+        loaded=True, _runtime=FakeRuntime(), _enable_thinking=True,
+    )
+    backend.wire_runtime_deps(
+        tool_executor=None, action_signer=lambda x: "f" * 64,
+    )
+
+    async def collect():
+        return [ev async for ev in backend.run_troubleshoot("x")]
+
+    events = asyncio.run(collect())
+    thoughts = [e for e in events if e["type"] == "thought"]
+    assert any("VISIBLE_PROSE_AFTER_THINK" in t["payload"] for t in thoughts), (
+        "Post-think prose should reach the SSE thought event"
+    )
+    for t in thoughts:
+        assert "HIDDEN_COT_CONTENT" not in t["payload"], (
+            "Think content must NEVER leak into SSE, even when post-think "
+            "prose is also present"
+        )
+
+
def test_run_troubleshoot_force_verdict_at_max_turns_minus_one():
"""Even when the model is happily calling tools but never finalizes,
the backend injects the force-verdict directive at MAX_TURNS-1 and