feat: add streaming output for /search

lzwjava · lzwjava · commit 0b9f96ae858e · 2026-06-05T22:44:44.000+08:00
Responses are now printed token-by-token as they arrive instead of after
the full response has been received. Both the Copilot and OpenRouter
providers parse the server-sent events (SSE) stream to yield content chunks.
diff --git a/iclaw/github_api.py b/iclaw/github_api.py
@@ -47,8 +47,31 @@ class UnsupportedModelError(Exception):
     pass
 
 
-def chat(messages, copilot_token, model="gpt-4o", tools=None):
-    payload = {"model": model, "messages": messages, "stream": False}
+def _parse_sse(resp):
+    """Parse SSE stream from chat completions API, yielding content chunks."""
+    for line in resp.iter_lines():
+        if not line:
+            continue
+        line = line.decode("utf-8") if isinstance(line, bytes) else line
+        if not line.startswith("data: "):
+            continue
+        data = line[6:]
+        if data == "[DONE]":
+            break
+        try:
+            import json
+
+            chunk = json.loads(data)
+            delta = chunk.get("choices", [{}])[0].get("delta", {})
+            content = delta.get("content")
+            if content:
+                yield content
+        except (ValueError, KeyError, IndexError):
+            continue
+
+
+def chat(messages, copilot_token, model="gpt-4o", tools=None, stream=False):
+    payload = {"model": model, "messages": messages, "stream": stream}
     if tools:
         payload["tools"] = tools
         payload["tool_choice"] = "auto"
@@ -57,6 +80,7 @@ def chat(messages, copilot_token, model="gpt-4o", tools=None):
         f"{COPILOT_API_BASE}/chat/completions",
         headers={"Authorization": f"Bearer {copilot_token}", **COPILOT_HEADERS},
         json=payload,
+        stream=stream,
     )
     if not resp.ok:
         if resp.status_code == 400 and "unsupported_api_for_model" in resp.text:
@@ -66,4 +90,6 @@ def chat(messages, copilot_token, model="gpt-4o", tools=None):
         raise RuntimeError(
             f"Chat API error: {resp.status_code} {resp.reason}\n{resp.text}"
         )
+    if stream:
+        return _parse_sse(resp)
     return resp.json()["choices"][0]["message"]
diff --git a/iclaw/main.py b/iclaw/main.py
@@ -53,10 +53,10 @@
 ]
 
 
-def _chat(provider, token, messages, model, tools=None):
+def _chat(provider, token, messages, model, tools=None, stream=False):
     if provider == "openrouter":
-        return openrouter.chat(messages, token, model, tools=tools)
-    return chat(messages, token, model, tools=tools)
+        return openrouter.chat(messages, token, model, tools=tools, stream=stream)
+    return chat(messages, token, model, tools=tools, stream=stream)
 
 
 def main():
@@ -208,13 +208,22 @@ def main():
                 ):
                     provider_token = get_copilot_token(github_token)
                     token_expiry = time.monotonic() + TOKEN_REFRESH_INTERVAL
-                response_message = _chat(
-                    model_provider, provider_token, messages, current_model, tools=TOOLS
+                chunks = _chat(
+                    model_provider,
+                    provider_token,
+                    messages,
+                    current_model,
+                    tools=TOOLS,
+                    stream=True,
                 )
-                reply = response_message.get("content", "")
+                print()
+                reply = ""
+                for chunk in chunks:
+                    print(chunk, end="", flush=True)
+                    reply += chunk
+                print("\n")
                 messages.append({"role": "assistant", "content": reply})
                 last_reply = reply
-                log.log_info(f"\n{reply}\n")
             except UnsupportedModelError as e:
                 print(f"Error: {e}", file=sys.stderr)
                 print("Please select a different model with /model", file=sys.stderr)
diff --git a/iclaw/providers/openrouter.py b/iclaw/providers/openrouter.py
@@ -28,8 +28,31 @@ def get_models(api_key):
     return resp.json().get("data", [])
 
 
-def chat(messages, api_key, model, tools=None):
-    payload = {"model": model, "messages": messages, "stream": False}
+def _parse_sse(resp):
+    """Parse SSE stream from chat completions API, yielding content chunks."""
+    for line in resp.iter_lines():
+        if not line:
+            continue
+        line = line.decode("utf-8") if isinstance(line, bytes) else line
+        if not line.startswith("data: "):
+            continue
+        data = line[6:]
+        if data == "[DONE]":
+            break
+        try:
+            import json
+
+            chunk = json.loads(data)
+            delta = chunk.get("choices", [{}])[0].get("delta", {})
+            content = delta.get("content")
+            if content:
+                yield content
+        except (ValueError, KeyError, IndexError):
+            continue
+
+
+def chat(messages, api_key, model, tools=None, stream=False):
+    payload = {"model": model, "messages": messages, "stream": stream}
     if tools:
         payload["tools"] = tools
         payload["tool_choice"] = "auto"
@@ -38,11 +61,14 @@ def chat(messages, api_key, model, tools=None):
         f"{OPENROUTER_API_BASE}/chat/completions",
         headers=_auth_headers(api_key),
         json=payload,
+        stream=stream,
     )
     if not resp.ok:
         if resp.status_code == 404:
             raise UnsupportedModelError(f'Model "{model}" not found on OpenRouter')
         raise RuntimeError(
             f"Chat API error: {resp.status_code} {resp.reason}\n{resp.text}"
         )
+    if stream:
+        return _parse_sse(resp)
     return resp.json()["choices"][0]["message"]