Skip to content

Commit 22e2c8b

Browse files
committed
fix(websearch): 统一前后端网页搜索引用提取逻辑,增加前端 refs 降级获取
- 重构 web_search_utils.py 为分层结构,新增 build_web_search_refs() 和 _extract_ref_indices() 支持从 <ref> 标签提取引用索引 - 简化 chat.py/live_chat.py 中 ref 提取为调用 build_web_search_refs() - MessageList.vue 新增 getMessageRefs() 在后端未返回 refs 时前端自行降级提取 - 修复 chat.py 中消息保存条件判断逻辑
1 parent 479c58e commit 22e2c8b

5 files changed

Lines changed: 282 additions & 103 deletions

File tree

astrbot/core/utils/web_search_utils.py

Lines changed: 77 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import json
2+
import re
23
from typing import Any
34
from urllib.parse import urlparse
45

@@ -29,9 +30,9 @@ def normalize_web_search_base_url(
2930
return normalized
3031

3132

32-
def collect_web_search_results(accumulated_parts: list[dict[str, Any]]) -> dict:
33-
web_search_results = {}
34-
33+
def _iter_web_search_result_items(
34+
accumulated_parts: list[dict[str, Any]],
35+
):
3536
for part in accumulated_parts:
3637
if part.get("type") != "tool_call" or not part.get("tool_calls"):
3738
continue
@@ -52,13 +53,78 @@ def collect_web_search_results(accumulated_parts: list[dict[str, Any]]) -> dict:
5253
continue
5354

5455
for item in result_data.get("results", []):
55-
if not isinstance(item, dict):
56-
continue
57-
if idx := item.get("index"):
58-
web_search_results[idx] = {
59-
"url": item.get("url"),
60-
"title": item.get("title"),
61-
"snippet": item.get("snippet"),
62-
}
56+
if isinstance(item, dict):
57+
yield item
58+
59+
60+
def _extract_ref_indices(accumulated_text: str) -> list[str]:
61+
ref_indices: list[str] = []
62+
seen_indices: set[str] = set()
63+
64+
for match in re.finditer(r"<ref>(.*?)</ref>", accumulated_text):
65+
ref_index = match.group(1).strip()
66+
if not ref_index or ref_index in seen_indices:
67+
continue
68+
ref_indices.append(ref_index)
69+
seen_indices.add(ref_index)
70+
71+
return ref_indices
72+
73+
74+
def collect_web_search_ref_items(
    accumulated_parts: list[dict[str, Any]],
    favicon_cache: dict[str, str] | None = None,
) -> list[dict[str, Any]]:
    """Collect de-duplicated web-search reference entries in result order.

    Each entry carries ``index``, ``url``, ``title`` and ``snippet``; when the
    entry's URL is present in *favicon_cache*, a ``favicon`` field is attached
    as well. Results without an index, and repeated indices, are skipped.

    Args:
        accumulated_parts: Accumulated message parts containing tool calls.
        favicon_cache: Optional mapping of URL -> favicon data.

    Returns:
        Ordered list of reference payload dicts.
    """
    collected: list[dict[str, Any]] = []
    seen: set[str] = set()

    for result in _iter_web_search_result_items(accumulated_parts):
        idx = result.get("index")
        if not idx or idx in seen:
            continue
        seen.add(idx)

        entry: dict[str, Any] = {
            "index": idx,
            "url": result.get("url"),
            "title": result.get("title"),
            "snippet": result.get("snippet"),
        }
        if favicon_cache and entry["url"] in favicon_cache:
            entry["favicon"] = favicon_cache[entry["url"]]
        collected.append(entry)

    return collected
99+
100+
101+
def build_web_search_refs(
    accumulated_text: str,
    accumulated_parts: list[dict[str, Any]],
    favicon_cache: dict[str, str] | None = None,
) -> dict:
    """Build the ``{"used": [...]}`` refs payload for a message.

    Prefers the references actually cited via ``<ref>`` tags in the text;
    when the text cites none of the collected results, every collected
    result is used as a fallback.

    Args:
        accumulated_text: Full accumulated message text (may contain refs).
        accumulated_parts: Accumulated message parts containing tool calls.
        favicon_cache: Optional mapping of URL -> favicon data.

    Returns:
        ``{"used": [...]}`` when any search results exist, otherwise ``{}``.
    """
    all_refs = collect_web_search_ref_items(accumulated_parts, favicon_cache)
    if not all_refs:
        return {}

    by_index = {entry["index"]: entry for entry in all_refs}
    cited = [
        by_index[idx]
        for idx in _extract_ref_indices(accumulated_text)
        if idx in by_index
    ]
    # Fall back to the full result list when nothing was explicitly cited.
    return {"used": cited or all_refs}
118+
119+
120+
def collect_web_search_results(accumulated_parts: list[dict[str, Any]]) -> dict:
    """Map each reference index to its ``{url, title, snippet}`` payload.

    Args:
        accumulated_parts: Accumulated message parts containing tool calls.

    Returns:
        Dict keyed by reference index; indices are already de-duplicated by
        ``collect_web_search_ref_items``.
    """
    return {
        entry["index"]: {
            "url": entry.get("url"),
            "title": entry.get("title"),
            "snippet": entry.get("snippet"),
        }
        for entry in collect_web_search_ref_items(accumulated_parts)
    }

astrbot/dashboard/routes/chat.py

Lines changed: 25 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import asyncio
22
import json
33
import os
4-
import re
54
import uuid
65
from contextlib import asynccontextmanager
76
from typing import cast
@@ -23,7 +22,7 @@
2322
from astrbot.core.utils.active_event_registry import active_event_registry
2423
from astrbot.core.utils.astrbot_path import get_astrbot_data_path
2524
from astrbot.core.utils.datetime_utils import to_utc_isoformat
26-
from astrbot.core.utils.web_search_utils import collect_web_search_results
25+
from astrbot.core.utils.web_search_utils import build_web_search_refs
2726

2827
from .route import Response, Route, RouteContext
2928

@@ -216,35 +215,13 @@ async def _create_attachment_from_file(
216215
def _extract_web_search_refs(
    self, accumulated_text: str, accumulated_parts: list
) -> dict:
    """Extract web-search references cited in a message.

    Delegates to the shared ``build_web_search_refs`` helper, supplying the
    process-wide favicon cache so cached icons are attached to each ref.
    """
    return build_web_search_refs(
        accumulated_text,
        accumulated_parts,
        sp.temporary_cache.get("_ws_favicon", {}),
    )
248225

249226
async def _save_bot_message(
250227
self,
@@ -446,19 +423,27 @@ async def stream():
446423
accumulated_parts.append(part)
447424

448425
# 消息结束处理
426+
should_save = False
449427
if msg_type == "end":
450-
break
428+
should_save = bool(
429+
accumulated_parts
430+
or accumulated_text
431+
or accumulated_reasoning
432+
or refs
433+
or agent_stats
434+
)
451435
elif (
452436
(streaming and msg_type == "complete") or not streaming
453437
# or msg_type == "break"
454438
):
455-
if (
456-
chain_type == "tool_call"
457-
or chain_type == "tool_call_result"
439+
if chain_type not in (
440+
"tool_call",
441+
"tool_call_result",
442+
"agent_stats",
458443
):
459-
continue
444+
should_save = True
460445

461-
# 提取 web_search_tavily 引用
446+
if should_save:
462447
try:
463448
refs = self._extract_web_search_refs(
464449
accumulated_text,
@@ -499,6 +484,9 @@ async def stream():
499484
# tool_calls = {}
500485
agent_stats = {}
501486
refs = {}
487+
488+
if msg_type == "end":
489+
break
502490
except BaseException as e:
503491
logger.exception(f"WebChat stream unexpected error: {e}", exc_info=True)
504492
finally:

astrbot/dashboard/routes/live_chat.py

Lines changed: 7 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import asyncio
22
import json
33
import os
4-
import re
54
import time
65
import uuid
76
import wave
@@ -22,7 +21,7 @@
2221
from astrbot.core.platform.sources.webchat.webchat_queue_mgr import webchat_queue_mgr
2322
from astrbot.core.utils.astrbot_path import get_astrbot_data_path, get_astrbot_temp_path
2423
from astrbot.core.utils.datetime_utils import to_utc_isoformat
25-
from astrbot.core.utils.web_search_utils import collect_web_search_results
24+
from astrbot.core.utils.web_search_utils import build_web_search_refs
2625

2726
from .route import Route, RouteContext
2827

def _extract_web_search_refs(
    self, accumulated_text: str, accumulated_parts: list
) -> dict:
    """Extract web_search references cited in a message.

    Delegates to the shared ``build_web_search_refs`` helper, supplying the
    process-wide favicon cache so cached icons are attached to each ref.
    """
    return build_web_search_refs(
        accumulated_text,
        accumulated_parts,
        sp.temporary_cache.get("_ws_favicon", {}),
    )
220207

221208
async def _save_bot_message(
222209
self,

dashboard/src/components/chat/MessageList.vue

Lines changed: 83 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@
149149
@click="$emit('replyMessage', msg, index)" :title="tm('actions.reply')" />
150150

151151
<!-- Refs Visualization -->
152-
<ActionRef :refs="msg.content.refs" @open-refs="openRefsSidebar" />
152+
<ActionRef :refs="getMessageRefs(msg.content)" @open-refs="openRefsSidebar" />
153153
</div>
154154
</div>
155155
</div>
@@ -294,47 +294,95 @@ export default {
294294
this.extractWebSearchResults();
295295
},
296296
methods: {
297-
// 从消息中提取 web_search_tavily 的搜索结果
297+
extractRefsFromToolCall(toolCall) {
298+
if (!WEB_SEARCH_REFERENCE_TOOLS.includes(toolCall?.name) || !toolCall.result) {
299+
return [];
300+
}
301+
302+
try {
303+
const resultData = typeof toolCall.result === 'string'
304+
? JSON.parse(toolCall.result)
305+
: toolCall.result;
306+
307+
if (!resultData?.results || !Array.isArray(resultData.results)) {
308+
return [];
309+
}
310+
311+
const refs = [];
312+
const seenIndices = new Set();
313+
314+
resultData.results.forEach(item => {
315+
if (!item?.index || seenIndices.has(item.index)) {
316+
return;
317+
}
318+
319+
refs.push({
320+
index: item.index,
321+
url: item.url,
322+
title: item.title,
323+
snippet: item.snippet
324+
});
325+
seenIndices.add(item.index);
326+
});
327+
328+
return refs;
329+
} catch (e) {
330+
console.error('Failed to parse web search result:', e);
331+
return [];
332+
}
333+
},
334+
335+
collectMessageWebSearchRefs(messageParts) {
336+
if (!Array.isArray(messageParts)) {
337+
return [];
338+
}
339+
340+
const refs = [];
341+
const seenIndices = new Set();
342+
343+
messageParts.forEach(part => {
344+
if (part.type !== 'tool_call' || !Array.isArray(part.tool_calls)) {
345+
return;
346+
}
347+
348+
part.tool_calls.forEach(toolCall => {
349+
this.extractRefsFromToolCall(toolCall).forEach(ref => {
350+
if (seenIndices.has(ref.index)) {
351+
return;
352+
}
353+
refs.push(ref);
354+
seenIndices.add(ref.index);
355+
});
356+
});
357+
});
358+
359+
return refs;
360+
},
361+
362+
getMessageRefs(content) {
363+
if (content?.refs?.used?.length) {
364+
return content.refs;
365+
}
366+
367+
const fallbackRefs = this.collectMessageWebSearchRefs(content?.message);
368+
return fallbackRefs.length ? { used: fallbackRefs } : null;
369+
},
370+
371+
// 从消息中提取网页搜索结果映射
298372
extractWebSearchResults() {
299373
const results = {};
300374
301375
this.messages.forEach(msg => {
302376
if (msg.content.type !== 'bot' || !Array.isArray(msg.content.message)) {
303377
return;
304378
}
305-
306-
msg.content.message.forEach(part => {
307-
if (part.type !== 'tool_call' || !Array.isArray(part.tool_calls)) {
308-
return;
309-
}
310-
311-
part.tool_calls.forEach(toolCall => {
312-
// 检查是否是网页搜索工具调用
313-
if (!WEB_SEARCH_REFERENCE_TOOLS.includes(toolCall.name) || !toolCall.result) {
314-
return;
315-
}
316-
317-
try {
318-
// 解析工具调用结果
319-
const resultData = typeof toolCall.result === 'string'
320-
? JSON.parse(toolCall.result)
321-
: toolCall.result;
322-
323-
if (resultData.results && Array.isArray(resultData.results)) {
324-
resultData.results.forEach(item => {
325-
if (item.index) {
326-
results[item.index] = {
327-
url: item.url,
328-
title: item.title,
329-
snippet: item.snippet
330-
};
331-
}
332-
});
333-
}
334-
} catch (e) {
335-
console.error('Failed to parse web search result:', e);
336-
}
337-
});
379+
380+
this.collectMessageWebSearchRefs(msg.content.message).forEach(ref => {
381+
results[ref.index] = {
382+
url: ref.url,
383+
title: ref.title,
384+
snippet: ref.snippet
385+
};
338386
});
339387
});
340388

0 commit comments

Comments (0)