feat: 引入多 LLM Provider 抽象并优化前端交互防抖与调试日志

shijiashuai · shijiashuai · commit ada0904b3dcd · 2025-11-24T14:34:33.000+08:00
在 `DialogueService` 中新增 `LLM_PROVIDER` 和 `LLM_BASE_URL` 环境变量，添加 `_call_llm` 私有方法统一封装 HTTP 请求逻辑，当前实现为 OpenAI Chat Completions 接口，为后续对接其他 Provider 预留扩展点。在 `_append_session_messages` 中添加历史截断调试日志，记录 session_id 和最终长度。

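在 `AdvancedDigitalHumanPage` 中优化 Chat Dock 交互体验：输入框回车发送在 `isChatLoading` 或 `isRecording` 时不再触发 `handleChatSend`，占位文案随录音/加载状态切换，发送按钮与录音按钮在加载中禁用；同时为 LLM 返回的 `emotion`/`action` 以及录音状态切换添加 `console.debug` 调试日志。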
diff --git a/changelog/2025-11-24-voice-and-audio-integration.md b/changelog/2025-11-24-voice-and-audio-integration.md
@@ -82,3 +82,36 @@
     - 一般说话但希望有一定口型/动态时用 `speak`；
     - 只有在没有合适动作或需要静止时才用 `idle`。
   - 强调：**无论何种情况严禁输出 JSON 以外的任何文字、注释或解释**，确保前端解析稳定。
+
+## 多 LLM Provider 抽象（预留扩展点）
+
+- 在 `DialogueService` 中引入轻量级 Provider 抽象：
+  - 新增环境变量：
+    - `LLM_PROVIDER`：当前使用的 LLM 提供方标识，默认 `openai`；
+    - `LLM_BASE_URL`：可选，覆盖默认的 OpenAI Chat Completions URL，方便对接 OpenAI 兼容网关；该值会被原样用作请求地址（代码不会再拼接路径），因此需填写完整的接口 URL（例如以 `/v1/chat/completions` 结尾），而不是仅填写网关的域名前缀。
+  - 新增私有方法 `_call_llm(messages)`：
+    - 统一封装 HTTP 请求逻辑，当前实现为调用 OpenAI Chat Completions 接口；
+    - 记录调试日志：`provider`、`model`、`messages` 数量等；
+    - 当 `LLM_PROVIDER` 不是 `openai` 时，会输出告警日志并暂时回退到 OpenAI，实现“先有接口，再慢慢接其他 Provider”的策略。
+
+## 前端交互与调试体验微调
+
+- 高级页面 Chat Dock：
+  - 输入框回车发送逻辑增加防重复触发保护（即标题中的“防抖”，实现为状态守卫而非基于定时器的 debounce）：在 `isChatLoading` 或 `isRecording` 时禁止再次触发 `handleChatSend`，避免重复请求。
+  - 输入框占位文案根据状态切换：
+    - 录音中：显示 `Listening... press mic again to stop`；
+    - 加载中：显示 `Thinking...`；
+    - 其他情况：保持原有 `Type a message to interact...`。
+  - 发送按钮：
+    - 在 `isChatLoading` 为 `true` 时禁用按钮，防止重复发送；
+    - 同时保留加载态的圆形 spinner。
+  - 录音按钮：
+    - 在 `isChatLoading` 时禁用，避免在模型回复过程中开启新的录音；
+    - 增加 `disabled` 的视觉反馈（透明度和光标样式）。
+- 调试日志：
+  - 在前端 `AdvancedDigitalHumanPage` 中：
+    - 对每次 LLM 返回的 `emotion`/`action` 输出 `console.debug`，便于在 DevTools 中观察映射效果；
+    - 在切换录音状态时输出 `console.debug`，方便排查麦克风交互问题。
+  - 在后端 `DialogueService` 中：
+    - 每次调用 LLM 时输出 provider、model 与消息数量；
+    - 每次追加会话历史后输出包含 `session_id` 和当前长度的调试日志，发生截断时在日志末尾附带 ` (truncated)` 标记，便于观察内存行为。
diff --git a/server/app/services/dialogue.py b/server/app/services/dialogue.py
@@ -13,6 +13,8 @@ class DialogueService:
   def __init__(self) -> None:
     self.api_key = os.getenv("OPENAI_API_KEY")
     self.model = os.getenv("OPENAI_MODEL", "gpt-3.5-turbo")
+    self.provider = os.getenv("LLM_PROVIDER", "openai").lower()
+    self.base_url = os.getenv("LLM_BASE_URL")
     self._session_messages: dict[str, list[dict[str, str]]] = {}
     try:
       self.max_session_messages = int(os.getenv("DIALOGUE_MAX_SESSION_MESSAGES", "10"))
@@ -87,21 +89,7 @@ async def generate_reply(
       )
 
     try:
-      async with httpx.AsyncClient(timeout=20.0) as client:
-        resp = await client.post(
-          "https://api.openai.com/v1/chat/completions",
-          headers={
-            "Authorization": f"Bearer {self.api_key}",
-            "Content-Type": "application/json",
-          },
-          json={
-            "model": self.model,
-            "messages": messages,
-            "temperature": 0.7,
-          },
-        )
-      resp.raise_for_status()
-      data = resp.json()
+      data = await self._call_llm(messages)
       content = data["choices"][0]["message"]["content"]
 
       try:
@@ -146,6 +134,29 @@ async def generate_reply(
         "action": "idle",
       }
 
+  async def _call_llm(self, messages: list[dict[str, str]]) -> Dict[str, Any]:
+    provider = (self.provider or "openai").lower()
+    logger.debug("Calling LLM provider=%s model=%s messages=%d", provider, self.model, len(messages))
+
+    if provider != "openai":
+      logger.warning("LLM_PROVIDER=%s 未实现，暂时使用 openai 作为回退", provider)
+
+    url = self.base_url or "https://api.openai.com/v1/chat/completions"
+    headers = {
+      "Authorization": f"Bearer {self.api_key}",
+      "Content-Type": "application/json",
+    }
+    payload = {
+      "model": self.model,
+      "messages": messages,
+      "temperature": 0.7,
+    }
+
+    async with httpx.AsyncClient(timeout=20.0) as client:
+      resp = await client.post(url, headers=headers, json=payload)
+    resp.raise_for_status()
+    return resp.json()
+
   def _get_session_messages(self, session_id: str) -> list[dict[str, str]]:
     return self._session_messages.get(session_id, [])
 
@@ -158,9 +169,17 @@ def _append_session_messages(
       return
     history = self._session_messages.get(session_id, [])
     history.extend(new_messages)
+    truncated = False
     if len(history) > self.max_session_messages:
       history = history[-self.max_session_messages :]
+      truncated = True
     self._session_messages[session_id] = history
+    logger.debug(
+      "Session %s history size=%d%s",
+      session_id,
+      len(history),
+      " (truncated)" if truncated else "",
+    )
 
 
 dialogue_service = DialogueService()
diff --git a/src/pages/AdvancedDigitalHumanPage.tsx b/src/pages/AdvancedDigitalHumanPage.tsx
@@ -72,6 +72,7 @@ export default function AdvancedDigitalHumanPage() {
     setIsChatLoading(true);
     try {
       const res = await sendUserInput({ userText: content, sessionId: 'demo-session' });
+      console.debug('LLM response', { emotion: res.emotion, action: res.action });
       const assistantMessage = { id: Date.now() + 1, role: 'assistant' as const, text: res.replyText };
       setChatMessages((prev) => [...prev, assistantMessage]);
 
@@ -94,6 +95,7 @@ export default function AdvancedDigitalHumanPage() {
   };
 
   const handleToggleRecording = () => {
+    console.debug('Toggle recording', { from: isRecording });
     if (isRecording) {
       asrService.stop();
       setRecording(false);
@@ -282,15 +284,22 @@ export default function AdvancedDigitalHumanPage() {
             type="text"
             value={chatInput}
             onChange={(e) => setChatInput(e.target.value)}
-            onKeyDown={(e) => e.key === 'Enter' && handleChatSend()}
-            placeholder="Type a message to interact..."
+            onKeyDown={(e) => e.key === 'Enter' && !isChatLoading && !isRecording && handleChatSend()}
+            placeholder={
+              isRecording
+                ? 'Listening... press mic again to stop'
+                : isChatLoading
+                  ? 'Thinking...'
+                  : 'Type a message to interact...'
+            }
             className="flex-1 bg-transparent border-none outline-none text-white placeholder-white/30 text-sm h-10"
           />
 
           <div className="flex items-center gap-2 pr-1">
             <button
               onClick={handleToggleRecording}
-              className={`p-3 rounded-xl transition-all duration-300 ${
+              disabled={isChatLoading}
+              className={`p-3 rounded-xl transition-all duration-300 disabled:opacity-50 disabled:cursor-not-allowed ${
                 isRecording 
                   ? 'bg-red-500 text-white shadow-[0_0_15px_rgba(239,68,68,0.5)]' 
                   : 'hover:bg-white/10 text-white/70 hover:text-white'
@@ -301,7 +310,7 @@ export default function AdvancedDigitalHumanPage() {
             
             <button
               onClick={() => handleChatSend()}
-              disabled={!chatInput.trim() && !isChatLoading}
+              disabled={isChatLoading || !chatInput.trim()}
               className="p-3 bg-white/10 hover:bg-white/20 disabled:opacity-50 disabled:cursor-not-allowed rounded-xl text-white transition-colors"
             >
               {isChatLoading ? (