From 9f8f419b8ad0243855506ff4e68f1d648cea49fa Mon Sep 17 00:00:00 2001 From: piexian <64474352+piexian@users.noreply.github.com> Date: Sun, 5 Apr 2026 00:12:03 +0800 Subject: [PATCH 1/6] =?UTF-8?q?feat(websearch):=20=E6=96=B0=E5=A2=9E=20Exa?= =?UTF-8?q?=20=E6=90=9C=E7=B4=A2=E6=8F=90=E4=BE=9B=E5=95=86=EF=BC=8C?= =?UTF-8?q?=E6=94=AF=E6=8C=81=20Tavily/Exa=20API=20Base=20URL=20=E5=8F=AF?= =?UTF-8?q?=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 Exa 搜索提供商,包含三个工具: - web_search_exa:语义搜索,支持 5 种搜索类型和 6 个垂直领域 - exa_extract_web_page:通过 /contents 端点提取网页全文 - exa_find_similar:通过 /findSimilar 端点查找语义相似网页 - Tavily 和 Exa 的 API Base URL 可在 WebUI 中配置,方便代理/自建实例 - 所有联网搜索工具统一添加可配置 timeout 参数(最小 30s) - MessageList.vue 引用解析支持 Exa/BoCha/findSimilar - 更新配置元数据、i18n、路由及 hooks - 更新中英文用户文档,补充 Tavily/BoCha/百度AI搜索的工具参数说明 --- astrbot/builtin_stars/web_searcher/main.py | 402 +++++++++++++++++- astrbot/core/astr_agent_hooks.py | 8 +- astrbot/core/config/default.py | 39 +- astrbot/core/knowledge_base/kb_helper.py | 8 +- .../core/knowledge_base/parsers/url_parser.py | 15 +- astrbot/dashboard/routes/chat.py | 7 +- astrbot/dashboard/routes/live_chat.py | 7 +- dashboard/src/components/chat/MessageList.vue | 5 +- .../en-US/features/config-metadata.json | 12 + .../ru-RU/features/config-metadata.json | 12 + .../zh-CN/features/config-metadata.json | 12 + docs/en/use/websearch.md | 122 +++++- docs/zh/use/websearch.md | 145 ++++++- 13 files changed, 766 insertions(+), 28 deletions(-) diff --git a/astrbot/builtin_stars/web_searcher/main.py b/astrbot/builtin_stars/web_searcher/main.py index cca1b43fb4..14b3e0d90c 100644 --- a/astrbot/builtin_stars/web_searcher/main.py +++ b/astrbot/builtin_stars/web_searcher/main.py @@ -24,6 +24,9 @@ class Main(star.Star): "web_search_tavily", "tavily_extract_web_page", "web_search_bocha", + "web_search_exa", + "exa_extract_web_page", + "exa_find_similar", ] def __init__(self, context: star.Context) -> None: @@ 
-34,6 +37,9 @@ def __init__(self, context: star.Context) -> None: self.bocha_key_index = 0 self.bocha_key_lock = asyncio.Lock() + self.exa_key_index = 0 + self.exa_key_lock = asyncio.Lock() + # 将 str 类型的 key 迁移至 list[str],并保存 cfg = self.context.get_config() provider_settings = cfg.get("provider_settings") @@ -57,6 +63,14 @@ def __init__(self, context: star.Context) -> None: provider_settings["websearch_bocha_key"] = [] cfg.save_config() + exa_key = provider_settings.get("websearch_exa_key") + if isinstance(exa_key, str): + if exa_key: + provider_settings["websearch_exa_key"] = [exa_key] + else: + provider_settings["websearch_exa_key"] = [] + cfg.save_config() + self.bing_search = Bing() self.sogo_search = Sogo() self.baidu_initialized = False @@ -65,12 +79,16 @@ async def _tidy_text(self, text: str) -> str: """清理文本,去除空格、换行符等""" return text.strip().replace("\n", " ").replace("\r", " ").replace(" ", " ") - async def _get_from_url(self, url: str) -> str: + async def _get_from_url(self, url: str, timeout: int = 30) -> str: """获取网页内容""" + if timeout < 30: + timeout = 30 header = HEADERS header.update({"User-Agent": random.choice(USER_AGENTS)}) async with aiohttp.ClientSession(trust_env=True) as session: - async with session.get(url, headers=header) as response: + async with session.get( + url, headers=header, timeout=aiohttp.ClientTimeout(total=timeout) + ) as response: html = await response.text(encoding="utf-8") doc = Document(html) ret = doc.summary(html_partial=True) @@ -138,10 +156,18 @@ async def _web_search_tavily( self, cfg: AstrBotConfig, payload: dict, + timeout: int = 30, ) -> list[SearchResult]: """使用 Tavily 搜索引擎进行搜索""" tavily_key = await self._get_tavily_key(cfg) - url = "https://api.tavily.com/search" + base_url = ( + cfg.get("provider_settings", {}) + .get("websearch_tavily_base_url", "https://api.tavily.com") + .rstrip("/") + ) + if timeout < 30: + timeout = 30 + url = f"{base_url}/search" header = { "Authorization": f"Bearer {tavily_key}", 
"Content-Type": "application/json", @@ -151,6 +177,7 @@ async def _web_search_tavily( url, json=payload, headers=header, + timeout=aiohttp.ClientTimeout(total=timeout), ) as response: if response.status != 200: reason = await response.text() @@ -169,10 +196,19 @@ async def _web_search_tavily( results.append(result) return results - async def _extract_tavily(self, cfg: AstrBotConfig, payload: dict) -> list[dict]: + async def _extract_tavily( + self, cfg: AstrBotConfig, payload: dict, timeout: int = 30 + ) -> list[dict]: """使用 Tavily 提取网页内容""" tavily_key = await self._get_tavily_key(cfg) - url = "https://api.tavily.com/extract" + base_url = ( + cfg.get("provider_settings", {}) + .get("websearch_tavily_base_url", "https://api.tavily.com") + .rstrip("/") + ) + if timeout < 30: + timeout = 30 + url = f"{base_url}/extract" header = { "Authorization": f"Bearer {tavily_key}", "Content-Type": "application/json", @@ -182,6 +218,7 @@ async def _extract_tavily(self, cfg: AstrBotConfig, payload: dict) -> list[dict] url, json=payload, headers=header, + timeout=aiohttp.ClientTimeout(total=timeout), ) as response: if response.status != 200: reason = await response.text() @@ -261,14 +298,19 @@ async def ensure_baidu_ai_search_mcp(self, umo: str | None = None) -> None: logger.info("Successfully initialized Baidu AI Search MCP server.") @llm_tool(name="fetch_url") - async def fetch_website_content(self, event: AstrMessageEvent, url: str) -> str: + async def fetch_website_content( + self, event: AstrMessageEvent, url: str, timeout: int = 30 + ) -> str: """Fetch the content of a website with the given web url Args: url(string): The url of the website to fetch content from + timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. 
""" - resp = await self._get_from_url(url) + if timeout < 30: + timeout = 30 + resp = await self._get_from_url(url, timeout=timeout) return resp @llm_tool("web_search_tavily") @@ -283,6 +325,7 @@ async def search_from_tavily( time_range: str = "", start_date: str = "", end_date: str = "", + timeout: int = 30, ) -> str: """A web search tool that uses Tavily to search the web for relevant content. Ideal for gathering current information, news, and detailed web content analysis. @@ -296,8 +339,11 @@ async def search_from_tavily( time_range(string): Optional. The time range back from the current date to include in the search results. This feature is available for both 'general' and 'news' search topics. Must be one of 'day', 'week', 'month', 'year'. start_date(string): Optional. The start date for the search results in the format 'YYYY-MM-DD'. end_date(string): Optional. The end date for the search results in the format 'YYYY-MM-DD'. + timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. """ + if timeout < 30: + timeout = 30 logger.info(f"web_searcher - search_from_tavily: {query}") cfg = self.context.get_config(umo=event.unified_msg_origin) # websearch_link = cfg["provider_settings"].get("web_search_link", False) @@ -324,7 +370,7 @@ async def search_from_tavily( if end_date: payload["end_date"] = end_date - results = await self._web_search_tavily(cfg, payload) + results = await self._web_search_tavily(cfg, payload, timeout=timeout) if not results: return "Error: Tavily web searcher does not return any results." @@ -353,14 +399,18 @@ async def tavily_extract_web_page( event: AstrMessageEvent, url: str = "", extract_depth: str = "basic", + timeout: int = 30, ) -> str: """Extract the content of a web page using Tavily. Args: url(string): Required. An URl to extract content from. extract_depth(string): Optional. The depth of the extraction, must be one of 'basic', 'advanced'. Default is "basic". + timeout(number): Optional. 
Request timeout in seconds. Minimum is 30. Default is 30. """ + if timeout < 30: + timeout = 30 cfg = self.context.get_config(umo=event.unified_msg_origin) if not cfg.get("provider_settings", {}).get("websearch_tavily_key", []): raise ValueError("Error: Tavily API key is not configured in AstrBot.") @@ -373,7 +423,7 @@ async def tavily_extract_web_page( "urls": [url], "extract_depth": extract_depth, } - results = await self._extract_tavily(cfg, payload) + results = await self._extract_tavily(cfg, payload, timeout=timeout) ret_ls = [] for result in results: ret_ls.append(f"URL: {result.get('url', 'No URL')}") @@ -398,9 +448,12 @@ async def _web_search_bocha( self, cfg: AstrBotConfig, payload: dict, + timeout: int = 30, ) -> list[SearchResult]: """使用 BoCha 搜索引擎进行搜索""" bocha_key = await self._get_bocha_key(cfg) + if timeout < 30: + timeout = 30 url = "https://api.bochaai.com/v1/web-search" header = { "Authorization": f"Bearer {bocha_key}", @@ -411,6 +464,7 @@ async def _web_search_bocha( url, json=payload, headers=header, + timeout=aiohttp.ClientTimeout(total=timeout), ) as response: if response.status != 200: reason = await response.text() @@ -440,6 +494,7 @@ async def search_from_bocha( include: str = "", exclude: str = "", count: int = 10, + timeout: int = 30, ) -> str: """ A web search tool based on Bocha Search API, used to retrieve web pages @@ -487,7 +542,11 @@ async def search_from_bocha( - Default: 10 The actual number of returned results may be less than the specified count. + + timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. 
""" + if timeout < 30: + timeout = 30 logger.info(f"web_searcher - search_from_bocha: {query}") cfg = self.context.get_config(umo=event.unified_msg_origin) # websearch_link = cfg["provider_settings"].get("web_search_link", False) @@ -515,7 +574,7 @@ async def search_from_bocha( if exclude: payload["exclude"] = exclude - results = await self._web_search_bocha(cfg, payload) + results = await self._web_search_bocha(cfg, payload, timeout=timeout) if not results: return "Error: BoCha web searcher does not return any results." @@ -537,6 +596,301 @@ async def search_from_bocha( ret = json.dumps({"results": ret_ls}, ensure_ascii=False) return ret + async def _get_exa_key(self, cfg: AstrBotConfig) -> str: + """并发安全的从列表中获取并轮换 Exa API 密钥。""" + exa_keys = cfg.get("provider_settings", {}).get("websearch_exa_key", []) + if not exa_keys: + raise ValueError("错误:Exa API 密钥未在 AstrBot 中配置。") + + async with self.exa_key_lock: + key = exa_keys[self.exa_key_index] + self.exa_key_index = (self.exa_key_index + 1) % len(exa_keys) + return key + + async def _web_search_exa( + self, + cfg: AstrBotConfig, + payload: dict, + timeout: int = 30, + ) -> list[SearchResult]: + """使用 Exa 搜索引擎进行搜索""" + exa_key = await self._get_exa_key(cfg) + base_url = ( + cfg.get("provider_settings", {}) + .get("websearch_exa_base_url", "https://api.exa.ai") + .rstrip("/") + ) + url = f"{base_url}/search" + header = { + "x-api-key": exa_key, + "Content-Type": "application/json", + } + async with aiohttp.ClientSession(trust_env=True) as session: + async with session.post( + url, + json=payload, + headers=header, + timeout=aiohttp.ClientTimeout(total=timeout), + ) as response: + if response.status != 200: + reason = await response.text() + raise Exception( + f"Exa web search failed: {reason}, status: {response.status}", + ) + data = await response.json() + results = [] + for item in data.get("results", []): + result = SearchResult( + title=item.get("title", ""), + url=item.get("url", ""), + snippet=(item.get("text") 
or "")[:500], + ) + results.append(result) + return results + + @llm_tool("web_search_exa") + async def search_from_exa( + self, + event: AstrMessageEvent, + query: str, + max_results: int = 10, + search_type: str = "auto", + category: str = "", + timeout: int = 30, + ) -> str: + """A web search tool that uses Exa to search the web for relevant content. + Ideal for gathering current information with semantic search capabilities. + Supports vertical search categories: company, people, research paper, news, personal site, financial report. + + Args: + query(string): Required. Search query. + max_results(number): Optional. The maximum number of results to return. Default is 10. Range is 1-100. + search_type(string): Optional. The type of search, must be one of 'auto', 'neural', 'fast', 'instant', 'deep'. Default is "auto". + category(string): Optional. The category of search. Supported values: 'company'(50M+ company pages), 'people'(1B+ people profiles), 'research paper'(100M+ papers), 'news', 'personal site', 'financial report'. Default is empty (general web search). + timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. 
+ + """ + if timeout < 30: + timeout = 30 + logger.info(f"web_searcher - search_from_exa: {query}") + cfg = self.context.get_config(umo=event.unified_msg_origin) + if not cfg.get("provider_settings", {}).get("websearch_exa_key", []): + raise ValueError("Error: Exa API key is not configured in AstrBot.") + + valid_types = ("auto", "neural", "fast", "instant", "deep") + if search_type not in valid_types: + search_type = "auto" + + max_results = max(1, min(max_results, 100)) + + payload = { + "query": query, + "numResults": max_results, + "type": search_type, + "contents": {"text": {"maxCharacters": 500}}, + } + + valid_categories = ( + "company", + "people", + "research paper", + "news", + "personal site", + "financial report", + ) + if category in valid_categories: + payload["category"] = category + + results = await self._web_search_exa(cfg, payload, timeout=timeout) + if not results: + return "Error: Exa web searcher does not return any results." + + ret_ls = [] + ref_uuid = str(uuid.uuid4())[:4] + for idx, result in enumerate(results, 1): + index = f"{ref_uuid}.{idx}" + ret_ls.append( + { + "title": result.title, + "url": result.url, + "snippet": result.snippet, + "index": index, + } + ) + ret = json.dumps({"results": ret_ls}, ensure_ascii=False) + return ret + + async def _extract_exa( + self, cfg: AstrBotConfig, payload: dict, timeout: int = 30 + ) -> list[dict]: + """使用 Exa 提取网页内容""" + exa_key = await self._get_exa_key(cfg) + base_url = ( + cfg.get("provider_settings", {}) + .get("websearch_exa_base_url", "https://api.exa.ai") + .rstrip("/") + ) + if timeout < 30: + timeout = 30 + url = f"{base_url}/contents" + header = { + "x-api-key": exa_key, + "Content-Type": "application/json", + } + async with aiohttp.ClientSession(trust_env=True) as session: + async with session.post( + url, + json=payload, + headers=header, + timeout=aiohttp.ClientTimeout(total=timeout), + ) as response: + if response.status != 200: + reason = await response.text() + raise Exception( + 
f"Exa content extraction failed: {reason}, status: {response.status}", + ) + data = await response.json() + results: list[dict] = data.get("results", []) + if not results: + raise ValueError( + "Error: Exa content extraction does not return any results.", + ) + return results + + @llm_tool("exa_extract_web_page") + async def exa_extract_web_page( + self, + event: AstrMessageEvent, + url: str = "", + timeout: int = 30, + ) -> str: + """Extract the content of a web page using Exa. + Use this tool when the user wants to extract or summarize content from a specific URL. + + Args: + url(string): Required. A URL to extract content from. + timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. + + """ + if timeout < 30: + timeout = 30 + cfg = self.context.get_config(umo=event.unified_msg_origin) + if not cfg.get("provider_settings", {}).get("websearch_exa_key", []): + raise ValueError("Error: Exa API key is not configured in AstrBot.") + + if not url: + raise ValueError("Error: url must be a non-empty string.") + + payload = { + "urls": [url], + "text": True, + } + + results = await self._extract_exa(cfg, payload, timeout=timeout) + ret_ls = [] + for result in results: + ret_ls.append(f"URL: {result.get('url', 'No URL')}") + text = await self._tidy_text(result.get("text", "No content")) + ret_ls.append(f"Content: {text}") + ret = "\n".join(ret_ls) + if not ret: + return "Error: Exa content extraction does not return any results." 
+ return ret + + async def _find_similar_exa( + self, cfg: AstrBotConfig, payload: dict, timeout: int = 30 + ) -> list[SearchResult]: + """使用 Exa 查找相似链接""" + exa_key = await self._get_exa_key(cfg) + base_url = ( + cfg.get("provider_settings", {}) + .get("websearch_exa_base_url", "https://api.exa.ai") + .rstrip("/") + ) + if timeout < 30: + timeout = 30 + url = f"{base_url}/findSimilar" + header = { + "x-api-key": exa_key, + "Content-Type": "application/json", + } + async with aiohttp.ClientSession(trust_env=True) as session: + async with session.post( + url, + json=payload, + headers=header, + timeout=aiohttp.ClientTimeout(total=timeout), + ) as response: + if response.status != 200: + reason = await response.text() + raise Exception( + f"Exa find similar failed: {reason}, status: {response.status}", + ) + data = await response.json() + results = [] + for item in data.get("results", []): + result = SearchResult( + title=item.get("title", ""), + url=item.get("url", ""), + snippet=(item.get("text") or "")[:500], + ) + results.append(result) + return results + + @llm_tool("exa_find_similar") + async def find_similar_links( + self, + event: AstrMessageEvent, + url: str, + max_results: int = 10, + timeout: int = 30, + ) -> str: + """Find web pages that are semantically similar to a given URL. + Use this tool when the user wants to discover content related to a specific webpage they have found interesting. + + Args: + url(string): Required. The URL of the webpage to find similar content for. + max_results(number): Optional. The maximum number of similar results to return. Default is 10. Range is 1-100. + timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. 
+ + """ + if timeout < 30: + timeout = 30 + logger.info(f"web_searcher - find_similar_links: {url}") + cfg = self.context.get_config(umo=event.unified_msg_origin) + if not cfg.get("provider_settings", {}).get("websearch_exa_key", []): + raise ValueError("Error: Exa API key is not configured in AstrBot.") + + if not url: + raise ValueError("Error: url must be a non-empty string.") + + num = max(1, min(max_results, 100)) + + payload = { + "url": url, + "numResults": num, + "contents": {"text": {"maxCharacters": 500}}, + } + + results = await self._find_similar_exa(cfg, payload, timeout=timeout) + if not results: + return "Error: Exa find similar does not return any results." + + ret_ls = [] + ref_uuid = str(uuid.uuid4())[:4] + for idx, result in enumerate(results, 1): + index = f"{ref_uuid}.{idx}" + ret_ls.append( + { + "title": result.title, + "url": result.url, + "snippet": result.snippet, + "index": index, + } + ) + ret = json.dumps({"results": ret_ls}, ensure_ascii=False) + return ret + @filter.on_llm_request(priority=-10000) async def edit_web_search_tools( self, @@ -575,6 +929,9 @@ async def edit_web_search_tools( tool_set.remove_tool("tavily_extract_web_page") tool_set.remove_tool("AIsearch") tool_set.remove_tool("web_search_bocha") + tool_set.remove_tool("web_search_exa") + tool_set.remove_tool("exa_extract_web_page") + tool_set.remove_tool("exa_find_similar") elif provider == "tavily": web_search_tavily = func_tool_mgr.get_func("web_search_tavily") tavily_extract_web_page = func_tool_mgr.get_func("tavily_extract_web_page") @@ -586,6 +943,9 @@ async def edit_web_search_tools( tool_set.remove_tool("fetch_url") tool_set.remove_tool("AIsearch") tool_set.remove_tool("web_search_bocha") + tool_set.remove_tool("web_search_exa") + tool_set.remove_tool("exa_extract_web_page") + tool_set.remove_tool("exa_find_similar") elif provider == "baidu_ai_search": try: await self.ensure_baidu_ai_search_mcp(event.unified_msg_origin) @@ -597,6 +957,9 @@ async def 
edit_web_search_tools( tool_set.remove_tool("web_search_tavily") tool_set.remove_tool("tavily_extract_web_page") tool_set.remove_tool("web_search_bocha") + tool_set.remove_tool("web_search_exa") + tool_set.remove_tool("exa_extract_web_page") + tool_set.remove_tool("exa_find_similar") except Exception as e: logger.error(f"Cannot Initialize Baidu AI Search MCP Server: {e}") elif provider == "bocha": @@ -608,3 +971,22 @@ async def edit_web_search_tools( tool_set.remove_tool("AIsearch") tool_set.remove_tool("web_search_tavily") tool_set.remove_tool("tavily_extract_web_page") + tool_set.remove_tool("web_search_exa") + tool_set.remove_tool("exa_extract_web_page") + tool_set.remove_tool("exa_find_similar") + elif provider == "exa": + web_search_exa = func_tool_mgr.get_func("web_search_exa") + exa_extract_web_page = func_tool_mgr.get_func("exa_extract_web_page") + exa_find_similar = func_tool_mgr.get_func("exa_find_similar") + if web_search_exa and web_search_exa.active: + tool_set.add_tool(web_search_exa) + if exa_extract_web_page and exa_extract_web_page.active: + tool_set.add_tool(exa_extract_web_page) + if exa_find_similar and exa_find_similar.active: + tool_set.add_tool(exa_find_similar) + tool_set.remove_tool("web_search") + tool_set.remove_tool("fetch_url") + tool_set.remove_tool("AIsearch") + tool_set.remove_tool("web_search_tavily") + tool_set.remove_tool("tavily_extract_web_page") + tool_set.remove_tool("web_search_bocha") diff --git a/astrbot/core/astr_agent_hooks.py b/astrbot/core/astr_agent_hooks.py index 09bf32deb4..86f8a6c5b2 100644 --- a/astrbot/core/astr_agent_hooks.py +++ b/astrbot/core/astr_agent_hooks.py @@ -59,7 +59,13 @@ async def on_tool_end( platform_name = run_context.context.event.get_platform_name() if ( platform_name == "webchat" - and tool.name in ["web_search_tavily", "web_search_bocha"] + and tool.name + in [ + "web_search_tavily", + "web_search_bocha", + "web_search_exa", + "exa_find_similar", + ] and len(run_context.messages) > 0 and 
tool_result and len(tool_result.content) diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py index 45412bdccb..e34ebc408b 100644 --- a/astrbot/core/config/default.py +++ b/astrbot/core/config/default.py @@ -108,8 +108,11 @@ "web_search": False, "websearch_provider": "default", "websearch_tavily_key": [], + "websearch_tavily_base_url": "https://api.tavily.com", "websearch_bocha_key": [], "websearch_baidu_app_builder_key": "", + "websearch_exa_key": [], + "websearch_exa_base_url": "https://api.exa.ai", "web_search_link": False, "display_reasoning_text": False, "identifier": False, @@ -3084,7 +3087,13 @@ class ChatProviderTemplate(TypedDict): "provider_settings.websearch_provider": { "description": "网页搜索提供商", "type": "string", - "options": ["default", "tavily", "baidu_ai_search", "bocha"], + "options": [ + "default", + "tavily", + "baidu_ai_search", + "bocha", + "exa", + ], "condition": { "provider_settings.web_search": True, }, @@ -3117,6 +3126,34 @@ class ChatProviderTemplate(TypedDict): "provider_settings.websearch_provider": "baidu_ai_search", }, }, + "provider_settings.websearch_tavily_base_url": { + "description": "Tavily API Base URL", + "type": "string", + "hint": "默认为 https://api.tavily.com,可改为代理地址。", + "condition": { + "provider_settings.websearch_provider": "tavily", + "provider_settings.web_search": True, + }, + }, + "provider_settings.websearch_exa_key": { + "description": "Exa API Key", + "type": "list", + "items": {"type": "string"}, + "hint": "可添加多个 Key 进行轮询。", + "condition": { + "provider_settings.websearch_provider": "exa", + "provider_settings.web_search": True, + }, + }, + "provider_settings.websearch_exa_base_url": { + "description": "Exa API Base URL", + "type": "string", + "hint": "默认为 https://api.exa.ai,可改为代理地址。", + "condition": { + "provider_settings.websearch_provider": "exa", + "provider_settings.web_search": True, + }, + }, "provider_settings.web_search_link": { "description": "显示来源引用", "type": "bool", diff --git 
a/astrbot/core/knowledge_base/kb_helper.py b/astrbot/core/knowledge_base/kb_helper.py index cb74cb2ba8..912ebbbe80 100644 --- a/astrbot/core/knowledge_base/kb_helper.py +++ b/astrbot/core/knowledge_base/kb_helper.py @@ -518,12 +518,18 @@ async def upload_from_url( "Error: Tavily API key is not configured in provider_settings." ) + tavily_base_url = config.get("provider_settings", {}).get( + "websearch_tavily_base_url", "https://api.tavily.com" + ) + # 阶段1: 从 URL 提取内容 if progress_callback: await progress_callback("extracting", 0, 100) try: - text_content = await extract_text_from_url(url, tavily_keys) + text_content = await extract_text_from_url( + url, tavily_keys, tavily_base_url + ) except Exception as e: logger.error(f"Failed to extract content from URL {url}: {e}") raise OSError(f"Failed to extract content from URL {url}: {e}") from e diff --git a/astrbot/core/knowledge_base/parsers/url_parser.py b/astrbot/core/knowledge_base/parsers/url_parser.py index 2867164a96..d0c41cafa3 100644 --- a/astrbot/core/knowledge_base/parsers/url_parser.py +++ b/astrbot/core/knowledge_base/parsers/url_parser.py @@ -6,12 +6,15 @@ class URLExtractor: """URL 内容提取器,封装了 Tavily API 调用和密钥管理""" - def __init__(self, tavily_keys: list[str]) -> None: + def __init__( + self, tavily_keys: list[str], tavily_base_url: str = "https://api.tavily.com" + ) -> None: """ 初始化 URL 提取器 Args: tavily_keys: Tavily API 密钥列表 + tavily_base_url: Tavily API 基础 URL """ if not tavily_keys: raise ValueError("Error: Tavily API keys are not configured.") @@ -19,6 +22,7 @@ def __init__(self, tavily_keys: list[str]) -> None: self.tavily_keys = tavily_keys self.tavily_key_index = 0 self.tavily_key_lock = asyncio.Lock() + self.tavily_base_url = tavily_base_url.rstrip("/") async def _get_tavily_key(self) -> str: """并发安全的从列表中获取并轮换Tavily API密钥。""" @@ -47,7 +51,7 @@ async def extract_text_from_url(self, url: str) -> str: raise ValueError("Error: url must be a non-empty string.") tavily_key = await self._get_tavily_key() - 
api_url = "https://api.tavily.com/extract" + api_url = f"{self.tavily_base_url}/extract" headers = { "Authorization": f"Bearer {tavily_key}", "Content-Type": "application/json", @@ -88,16 +92,19 @@ async def extract_text_from_url(self, url: str) -> str: # 为了向后兼容,提供一个简单的函数接口 -async def extract_text_from_url(url: str, tavily_keys: list[str]) -> str: +async def extract_text_from_url( + url: str, tavily_keys: list[str], tavily_base_url: str = "https://api.tavily.com" +) -> str: """ 简单的函数接口,用于从 URL 提取文本内容 Args: url: 要提取内容的网页 URL tavily_keys: Tavily API 密钥列表 + tavily_base_url: Tavily API 基础 URL Returns: 提取的文本内容 """ - extractor = URLExtractor(tavily_keys) + extractor = URLExtractor(tavily_keys, tavily_base_url) return await extractor.extract_text_from_url(url) diff --git a/astrbot/dashboard/routes/chat.py b/astrbot/dashboard/routes/chat.py index a4173ed843..4c4fd0ce84 100644 --- a/astrbot/dashboard/routes/chat.py +++ b/astrbot/dashboard/routes/chat.py @@ -224,7 +224,12 @@ def _extract_web_search_refs( Returns: 包含 used 列表的字典,记录被引用的搜索结果 """ - supported = ["web_search_tavily", "web_search_bocha"] + supported = [ + "web_search_tavily", + "web_search_bocha", + "web_search_exa", + "exa_find_similar", + ] # 从 accumulated_parts 中找到所有 web_search_tavily 的工具调用结果 web_search_results = {} tool_call_parts = [ diff --git a/astrbot/dashboard/routes/live_chat.py b/astrbot/dashboard/routes/live_chat.py index 8d0af938d0..25310cf61a 100644 --- a/astrbot/dashboard/routes/live_chat.py +++ b/astrbot/dashboard/routes/live_chat.py @@ -198,7 +198,12 @@ def _extract_web_search_refs( self, accumulated_text: str, accumulated_parts: list ) -> dict: """从消息中提取 web_search 引用。""" - supported = ["web_search_tavily", "web_search_bocha"] + supported = [ + "web_search_tavily", + "web_search_bocha", + "web_search_exa", + "exa_find_similar", + ] web_search_results = {} tool_call_parts = [ p diff --git a/dashboard/src/components/chat/MessageList.vue b/dashboard/src/components/chat/MessageList.vue index 
ca86331a86..63c8fabb2c 100644 --- a/dashboard/src/components/chat/MessageList.vue +++ b/dashboard/src/components/chat/MessageList.vue @@ -302,8 +302,9 @@ export default { } part.tool_calls.forEach(toolCall => { - // 检查是否是 web_search_tavily 工具调用 - if (toolCall.name !== 'web_search_tavily' || !toolCall.result) { + // 检查是否是网页搜索工具调用 + const supportedTools = ['web_search_tavily', 'web_search_bocha', 'web_search_exa', 'exa_find_similar']; + if (!supportedTools.includes(toolCall.name) || !toolCall.result) { return; } diff --git a/dashboard/src/i18n/locales/en-US/features/config-metadata.json b/dashboard/src/i18n/locales/en-US/features/config-metadata.json index 9ae8672826..d28c019f70 100644 --- a/dashboard/src/i18n/locales/en-US/features/config-metadata.json +++ b/dashboard/src/i18n/locales/en-US/features/config-metadata.json @@ -117,6 +117,10 @@ "description": "Tavily API Key", "hint": "Multiple keys can be added for rotation." }, + "websearch_tavily_base_url": { + "description": "Tavily API Base URL", + "hint": "Default: https://api.tavily.com. Change to use a proxy or self-hosted instance." + }, "websearch_bocha_key": { "description": "BoCha API Key", "hint": "Multiple keys can be added for rotation." @@ -125,6 +129,14 @@ "description": "Baidu Qianfan Smart Cloud APP Builder API Key", "hint": "Reference: [https://console.bce.baidu.com/iam/#/iam/apikey/list](https://console.bce.baidu.com/iam/#/iam/apikey/list)" }, + "websearch_exa_key": { + "description": "Exa API Key", + "hint": "Multiple keys can be added for rotation." + }, + "websearch_exa_base_url": { + "description": "Exa API Base URL", + "hint": "Default: https://api.exa.ai. Change to use a proxy or self-hosted instance." 
+ }, "web_search_link": { "description": "Display Source Citations" } diff --git a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json index 0aa5c791ac..bdc153a763 100644 --- a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json +++ b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json @@ -117,6 +117,10 @@ "description": "API-ключ Tavily", "hint": "Можно добавить несколько ключей для ротации." }, + "websearch_tavily_base_url": { + "description": "Базовый URL API Tavily", + "hint": "По умолчанию: https://api.tavily.com. Можно изменить на прокси-адрес." + }, "websearch_bocha_key": { "description": "API-ключ BoCha", "hint": "Можно добавить несколько ключей для ротации." @@ -125,6 +129,14 @@ "description": "API-ключ Baidu Qianfan APP Builder", "hint": "Ссылка: [https://console.bce.baidu.com/iam/#/iam/apikey/list](https://console.bce.baidu.com/iam/#/iam/apikey/list)" }, + "websearch_exa_key": { + "description": "API-ключ Exa", + "hint": "Можно добавить несколько ключей для ротации." + }, + "websearch_exa_base_url": { + "description": "Базовый URL API Exa", + "hint": "По умолчанию: https://api.exa.ai. Можно изменить на прокси-адрес." 
+ }, "web_search_link": { "description": "Показывать ссылки на источники" } diff --git a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json index c04138402e..5d3da77ab1 100644 --- a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json +++ b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json @@ -119,6 +119,10 @@ "description": "Tavily API Key", "hint": "可添加多个 Key 进行轮询。" }, + "websearch_tavily_base_url": { + "description": "Tavily API Base URL", + "hint": "默认为 https://api.tavily.com,可改为代理地址。" + }, "websearch_bocha_key": { "description": "BoCha API Key", "hint": "可添加多个 Key 进行轮询。" @@ -127,6 +131,14 @@ "description": "百度千帆智能云 APP Builder API Key", "hint": "参考:[https://console.bce.baidu.com/iam/#/iam/apikey/list](https://console.bce.baidu.com/iam/#/iam/apikey/list)" }, + "websearch_exa_key": { + "description": "Exa API Key", + "hint": "可添加多个 Key 进行轮询。" + }, + "websearch_exa_base_url": { + "description": "Exa API Base URL", + "hint": "默认为 https://api.exa.ai,可改为代理地址。" + }, "web_search_link": { "description": "显示来源引用" } diff --git a/docs/en/use/websearch.md b/docs/en/use/websearch.md index 82e77bb937..f226b4357c 100644 --- a/docs/en/use/websearch.md +++ b/docs/en/use/websearch.md @@ -14,18 +14,28 @@ When using a large language model that supports function calling with the web se And other prompts with search intent to trigger the model to invoke the search tool. -AstrBot supports 3 types of web search source integration: `default`, `Tavily`, and `Baidu AI Search`. +AstrBot supports 5 types of web search source integration: `default`, `Tavily`, `Baidu AI Search`, `BoCha`, and `Exa`. -The former uses AstrBot's built-in web search requester to query Google, Bing, and Sogou search engines, performing best in network environments with Google access. **We recommend using Tavily**. 
+The `default` source uses AstrBot's built-in web search requester to query Google, Bing, and Sogou search engines, performing best in network environments with Google access. **We recommend using Tavily or Exa**. ![image](https://files.astrbot.app/docs/source/images/websearch/image.png) -Go to `Configuration`, scroll down to find Web Search, where you can select `default` (default, not recommended) or `Tavily`. +Go to `Configuration`, scroll down to find Web Search, where you can select `default` (default, not recommended) or `Tavily` or `Exa`. ### default (Not Recommended) If your device is in China and you have a proxy, you can enable the proxy and enter the HTTP proxy address in `Admin Panel - Other Configuration - HTTP Proxy` to apply the proxy. +The default provider exposes two tools: + +- **`web_search`** — Searches the web via Bing and Sogou engines. +- **`fetch_url`** — Extracts the full text content from any given URL. Useful for reading and summarizing web pages when search result snippets are not sufficient. Parameters: + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `url` | string | Yes | — | The URL of the web page to fetch content from | +| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | + ### Tavily Go to [Tavily](https://app.tavily.com/home) to get an API Key, then fill it in the corresponding configuration item. @@ -33,3 +43,109 @@ Go to [Tavily](https://app.tavily.com/home) to get an API Key, then fill it in t If you use Tavily as your web search source, you will get a better experience optimization on AstrBot ChatUI, including citation source display and more: ![](https://files.astrbot.app/docs/source/images/websearch/image1.png) + +To use a proxy or self-hosted instance, modify the `Tavily API Base URL` configuration item. + +The Tavily provider exposes two tools: + +#### 1. 
Search (`web_search_tavily`) + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `query` | string | Yes | — | Search query | +| `max_results` | number | No | 7 | Maximum number of results to return. Range: 5–20 | +| `search_depth` | string | No | `basic` | Search depth. Must be `basic` or `advanced` | +| `topic` | string | No | `general` | Search topic. Must be `general` or `news` | +| `days` | number | No | 3 | Number of days back from today to include. Only available when `topic` is `news` | +| `time_range` | string | No | — | Time range for results. Must be one of `day`, `week`, `month`, `year`. Available for both `general` and `news` topics | +| `start_date` | string | No | — | Start date for results in `YYYY-MM-DD` format | +| `end_date` | string | No | — | End date for results in `YYYY-MM-DD` format | +| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | + +#### 2. Extract Web Page (`tavily_extract_web_page`) + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `url` | string | Yes | — | The URL to extract content from | +| `extract_depth` | string | No | `basic` | Extraction depth. Must be `basic` or `advanced` | +| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | + +### Baidu AI Search + +Go to the [BCE Console](https://console.bce.baidu.com/iam/#/iam/apikey/list) to get an API Key, then fill it in the `websearch_baidu_app_builder_key` configuration item. + +Baidu AI Search uses the MCP (Model Context Protocol) to communicate with Baidu's AI Search service. The tool is registered as `AIsearch` internally but commonly referred to as `baidu_ai_search`. Since it operates via MCP, no tool parameters are exposed directly — the model interacts with the service through the MCP protocol. + +### BoCha + +Go to [BoCha](https://www.bocha.ai) to get an API Key, then fill it in the corresponding configuration item. 
+ +The BoCha provider exposes one tool: + +#### Search (`web_search_bocha`) + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `query` | string | Yes | — | Search query | +| `freshness` | string | No | `noLimit` | Time range filter. Supported values: `noLimit`, `oneDay`, `oneWeek`, `oneMonth`, `oneYear`, `YYYY-MM-DD..YYYY-MM-DD` (date range), or `YYYY-MM-DD` (exact date). Using `noLimit` is recommended as the search algorithm will automatically optimize time relevance | +| `summary` | boolean | No | `false` | Whether to include a text summary for each result | +| `include` | string | No | — | Domains to include. Multiple domains separated by `\|` or `,` (max 100 domains). Example: `qq.com\|m.163.com` | +| `exclude` | string | No | — | Domains to exclude. Same format as `include` | +| `count` | number | No | 10 | Number of results to return. Range: 1–50. Actual results may be fewer | +| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | + +### Exa + +Go to [Exa](https://dashboard.exa.ai) to get an API Key, then fill it in the corresponding configuration item. + +Exa provides semantic search capabilities powered by neural embeddings, offering three integrated tools for the model to use: + +#### 1. 
Search (`web_search_exa`) + +The core search tool supports 5 search types: + +- `auto` — Automatically selects the best search mode based on the query (default) +- `neural` — Semantic search using embeddings, ideal for conceptual or natural language queries +- `fast` — Fast keyword-based search for quick results +- `instant` — Near-instant results for simple factual queries +- `deep` — Deep search with thorough result exploration + +Additionally, Exa supports 6 vertical categories for domain-specific searches: + +| Category | Coverage | +|---|---| +| `company` | 50M+ company pages | +| `people` | 1B+ profiles | +| `research paper` | 100M+ academic papers | +| `news` | News articles and reports | +| `personal site` | Personal websites and blogs | +| `financial report` | Financial filings and data | + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `query` | string | Yes | — | Search query | +| `max_results` | number | No | 10 | Maximum number of results to return. Range: 1–100 | +| `search_type` | string | No | `auto` | Search type. Must be one of `auto`, `neural`, `fast`, `instant`, `deep` | +| `category` | string | No | — | Vertical search category. Supported values: `company`, `people`, `research paper`, `news`, `personal site`, `financial report`. Leave empty for general web search | +| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | + +#### 2. Content Extraction (`exa_extract_web_page`) + +Extracts full text content from any given URL. The model can use this to read and summarize web pages, articles, or documents when the search result snippet is not sufficient. + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `url` | string | Yes | — | The URL to extract content from | +| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | + +#### 3. Find Similar (`exa_find_similar`) + +Finds semantically similar webpages to a given URL. 
This is a unique Exa feature that allows discovering related content based on neural embeddings rather than keyword matching. + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `url` | string | Yes | — | The URL of the webpage to find similar content for | +| `max_results` | number | No | 10 | Maximum number of similar results to return. Range: 1–100 | +| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | + +To use a proxy or self-hosted instance, modify the `Exa API Base URL` configuration. diff --git a/docs/zh/use/websearch.md b/docs/zh/use/websearch.md index 93200c44bf..82a448c4df 100644 --- a/docs/zh/use/websearch.md +++ b/docs/zh/use/websearch.md @@ -13,22 +13,159 @@ AstrBot 内置的网页搜索功能依赖大模型提供 `函数调用` 能力 等等带有搜索意味的提示让大模型触发调用搜索工具。 -AstrBot 支持 3 种网页搜索源接入方式:`默认`、`Tavily`、`百度 AI 搜索`。 +AstrBot 支持 5 种网页搜索源接入方式:`默认`、`Tavily`、`百度 AI 搜索`、`BoCha`、`Exa`。 -前者使用 AstrBot 内置的网页搜索请求器请求 Google、Bing、搜狗搜索引擎,在能够使用 Google 的网络环境下表现最佳。**我们推荐使用 Tavily**。 +前者使用 AstrBot 内置的网页搜索请求器请求 Google、Bing、搜狗搜索引擎,在能够使用 Google 的网络环境下表现最佳。**我们推荐使用 Tavily 或 Exa**。 ![image](https://files.astrbot.app/docs/source/images/websearch/image.png) -进入 `配置`,下拉找到网页搜索,您可选择 `default`(默认,不推荐) 或 `Tavily`。 +进入 `配置`,下拉找到网页搜索,您可选择 `default`(默认,不推荐) 或 `Tavily` 或 `Exa`。 ### default(不推荐) 如果您的设备在国内并且有代理,可以开启代理并在 `管理面板-其他配置-HTTP代理` 填入 HTTP 代理地址以应用代理。 +启用默认搜索后,大模型将获得以下工具: + +#### 网页搜索(web_search) + +使用 Google、Bing、搜狗等搜索引擎进行搜索。 + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `query` | string | 是 | 搜索关键词 | +| `max_results` | number | 否 | 返回的最大搜索结果数量,默认为 5 | + +#### 网页内容提取(fetch_url) + +提取任意 URL 的网页全文内容,可用于让大模型阅读和总结指定网页。 + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `url` | string | 是 | 要提取内容的网页 URL | +| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | + ### Tavily 前往 [Tavily](https://app.tavily.com/home) 得到 API Key,然后填写在相应的配置项。 如果您使用 Tavily 作为网页搜索源,在 AstrBot ChatUI 上将会获得更好的体验优化,包括引用来源展示等: 
-![](https://files.astrbot.app/docs/source/images/websearch/image1.png) \ No newline at end of file +![](https://files.astrbot.app/docs/source/images/websearch/image1.png) + +如需使用代理或自建实例,可修改 `Tavily API Base URL` 配置项。 + +启用 Tavily 后,大模型将获得以下工具: + +#### 搜索(web_search_tavily) + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `query` | string | 是 | 搜索关键词 | +| `max_results` | number | 否 | 返回的最大结果数量,范围 5-20,默认 7 | +| `search_depth` | string | 否 | 搜索深度,可选 `basic`(默认)或 `advanced` | +| `topic` | string | 否 | 搜索主题,可选 `general`(默认)或 `news` | +| `days` | number | 否 | 从当前日期往前包含的天数,仅在 `topic` 为 `news` 时生效 | +| `time_range` | string | 否 | 时间范围,可选 `day`、`week`、`month`、`year`,对 `general` 和 `news` 均生效 | +| `start_date` | string | 否 | 起始日期,格式 `YYYY-MM-DD` | +| `end_date` | string | 否 | 结束日期,格式 `YYYY-MM-DD` | +| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | + +#### 网页内容提取(tavily_extract_web_page) + +提取任意 URL 的网页全文内容。 + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `url` | string | 是 | 要提取内容的网页 URL | +| `extract_depth` | string | 否 | 提取深度,可选 `basic`(默认)或 `advanced` | +| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | + +### Exa + +前往 [Exa](https://dashboard.exa.ai) 获取 API Key,然后填写在相应的配置项。 + +Exa 提供基于语义理解的搜索能力,相比传统关键词搜索能够更精准地理解搜索意图。启用 Exa 后,大模型将获得以下三个工具: + +#### 搜索(web_search_exa) + +Exa 的核心搜索工具,支持以下搜索类型: + +- `auto`:自动模式,由 Exa 根据查询内容智能选择最佳搜索方式(推荐) +- `neural`:语义搜索,基于嵌入向量匹配,适合模糊或描述性的查询 +- `fast`:快速搜索,优先返回速度,适合简单关键词查询 +- `instant`:即时搜索,适合需要快速获取摘要的场景 +- `deep`:深度搜索,更全面地检索相关结果,适合复杂研究类查询 + +此外,搜索支持按垂直领域筛选结果: + +| 类别 | 说明 | +|------|------| +| `company` | 5000 万+ 公司主页 | +| `people` | 10 亿+ 个人主页/档案 | +| `research paper` | 1 亿+ 研究论文 | +| `news` | 新闻资讯 | +| `personal site` | 个人网站/博客 | +| `financial report` | 财务报告 | + +**工具参数:** + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `query` | string | 是 | 搜索关键词 | +| `max_results` | number | 否 | 返回的最大结果数量,范围 1-100,默认 10 | +| `search_type` | string | 否 | 搜索类型,可选 `auto`(默认)、`neural`、`fast`、`instant`、`deep` | +| 
`category` | string | 否 | 垂直领域筛选,默认为空(通用搜索) | +| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | + +#### 内容提取(exa_extract_web_page) + +提取任意 URL 的网页全文内容,可用于让大模型阅读和总结指定网页。您可以直接对大模型说: + +- `帮我总结一下这个链接:https://example.com` +- `读取这个页面的内容:https://example.com` + +**工具参数:** + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `url` | string | 是 | 要提取内容的网页 URL | +| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | + +#### 相似链接(exa_find_similar) + +Exa 独有的功能,根据给定的 URL 查找语义相似的网页。适合用于扩展阅读、查找同类资源或发现相关内容。 + +**工具参数:** + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `url` | string | 是 | 用于查找相似内容的网页 URL | +| `max_results` | number | 否 | 返回的最大结果数量,范围 1-100,默认 10 | +| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | + +如需使用代理或自建实例,可修改 `Exa API Base URL` 配置项。 + +### 百度 AI 搜索 + +前往 [百度智能云控制台](https://console.bce.baidu.com/iam/#/iam/apikey/list) 获取 APP Builder API Key,然后填写在相应的配置项。 + +百度 AI 搜索通过 MCP 协议接入,启用后大模型将自动获得 `baidu_ai_search` 工具,无需额外配置工具参数。 + +### BoCha + +前往 [BoCha](https://www.bocha.ai) 获取 API Key,然后填写在相应的配置项。 + +启用 BoCha 后,大模型将获得以下工具: + +#### 搜索(web_search_bocha) + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `query` | string | 是 | 搜索关键词 | +| `freshness` | string | 否 | 时间范围筛选。可选 `noLimit`(默认,推荐)、`oneDay`、`oneWeek`、`oneMonth`、`oneYear`,或指定日期 `YYYY-MM-DD`、日期范围 `YYYY-MM-DD..YYYY-MM-DD`。建议使用 `noLimit`,搜索算法会自动优化时间相关性,手动限制可能导致无结果 | +| `summary` | boolean | 否 | 是否为每个搜索结果包含文本摘要,默认 `false` | +| `include` | string | 否 | 限定搜索域名,多个域名用 `\|` 或 `,` 分隔,最多 100 个。示例:`qq.com` 或 `qq.com\|m.163.com` | +| `exclude` | string | 否 | 排除搜索域名,多个域名用 `\|` 或 `,` 分隔,最多 100 个。示例:`qq.com` 或 `qq.com\|m.163.com` | +| `count` | number | 否 | 返回的搜索结果数量,范围 1-50,默认 10(实际返回数量可能少于指定值) | +| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | \ No newline at end of file From f0edbb9623a580e8c0d3b5b2a7e4c35cfc33d121 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B0=95=E6=B0=99?= <2014440212@qq.com> Date: Sun, 5 Apr 2026 00:36:12 +0800 Subject: [PATCH 2/6] Apply suggestions from code review 
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- docs/en/use/websearch.md | 2 +- docs/zh/use/websearch.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/use/websearch.md b/docs/en/use/websearch.md index f226b4357c..b030c48756 100644 --- a/docs/en/use/websearch.md +++ b/docs/en/use/websearch.md @@ -20,7 +20,7 @@ The former uses AstrBot's built-in web search requester to query Google, Bing, a ![image](https://files.astrbot.app/docs/source/images/websearch/image.png) -Go to `Configuration`, scroll down to find Web Search, where you can select `default` (default, not recommended) or `Tavily` or `Exa`. +Go to `Configuration`, scroll down to find Web Search, where you can select `default` (default, not recommended), `Tavily`, `Baidu AI Search`, `BoCha`, or `Exa`. ### default (Not Recommended) diff --git a/docs/zh/use/websearch.md b/docs/zh/use/websearch.md index 82a448c4df..96a569aa70 100644 --- a/docs/zh/use/websearch.md +++ b/docs/zh/use/websearch.md @@ -19,7 +19,7 @@ AstrBot 支持 5 种网页搜索源接入方式:`默认`、`Tavily`、`百度 ![image](https://files.astrbot.app/docs/source/images/websearch/image.png) -进入 `配置`,下拉找到网页搜索,您可选择 `default`(默认,不推荐) 或 `Tavily` 或 `Exa`。 +进入 `配置`,下拉找到网页搜索,您可选择 `default`(默认,不推荐)、`Tavily`、`百度 AI 搜索`、`BoCha` 或 `Exa`。 ### default(不推荐) From 479c58e6051b26faf98433477c3231235e924b7b Mon Sep 17 00:00:00 2001 From: piexian <64474352+piexian@users.noreply.github.com> Date: Sun, 5 Apr 2026 02:05:10 +0800 Subject: [PATCH 3/6] =?UTF-8?q?fix(websearch):=20=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?=E5=85=A8=E5=B1=80=20HEADERS=20=E6=B1=A1=E6=9F=93=E3=80=81?= =?UTF-8?q?=E5=AF=86=E9=92=A5=E7=B4=A2=E5=BC=95=E8=B6=8A=E7=95=8C=E7=AD=89?= =?UTF-8?q?=E9=97=AE=E9=A2=98=EF=BC=8C=E6=8F=90=E5=8F=96=E5=85=B1=E4=BA=AB?= =?UTF-8?q?=E5=B7=A5=E5=85=B7=E5=87=BD=E6=95=B0=E6=B6=88=E9=99=A4=E9=87=8D?= =?UTF-8?q?=E5=A4=8D=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
astrbot/builtin_stars/web_searcher/main.py | 216 ++++++------------ astrbot/core/astr_agent_hooks.py | 9 +- .../core/knowledge_base/parsers/url_parser.py | 8 +- astrbot/core/utils/web_search_utils.py | 64 ++++++ astrbot/dashboard/routes/chat.py | 36 +-- astrbot/dashboard/routes/live_chat.py | 33 +-- dashboard/src/components/chat/MessageList.vue | 10 +- docs/en/use/websearch.md | 4 +- docs/zh/use/websearch.md | 8 +- 9 files changed, 168 insertions(+), 220 deletions(-) create mode 100644 astrbot/core/utils/web_search_utils.py diff --git a/astrbot/builtin_stars/web_searcher/main.py b/astrbot/builtin_stars/web_searcher/main.py index 14b3e0d90c..e0044969b6 100644 --- a/astrbot/builtin_stars/web_searcher/main.py +++ b/astrbot/builtin_stars/web_searcher/main.py @@ -11,11 +11,14 @@ from astrbot.api.event import AstrMessageEvent, filter from astrbot.api.provider import ProviderRequest from astrbot.core.provider.func_tool_manager import FunctionToolManager +from astrbot.core.utils.web_search_utils import normalize_web_search_base_url from .engines import HEADERS, USER_AGENTS, SearchResult from .engines.bing import Bing from .engines.sogo import Sogo +MIN_WEB_SEARCH_TIMEOUT = 30 + class Main(star.Star): TOOLS = [ @@ -28,6 +31,14 @@ class Main(star.Star): "exa_extract_web_page", "exa_find_similar", ] + MANAGED_TOOLS = TOOLS + ["AIsearch"] + PROVIDER_TOOLS = { + "default": ("web_search", "fetch_url"), + "tavily": ("web_search_tavily", "tavily_extract_web_page"), + "baidu_ai_search": ("AIsearch",), + "bocha": ("web_search_bocha",), + "exa": ("web_search_exa", "exa_extract_web_page", "exa_find_similar"), + } def __init__(self, context: star.Context) -> None: self.context = context @@ -79,15 +90,40 @@ async def _tidy_text(self, text: str) -> str: """清理文本,去除空格、换行符等""" return text.strip().replace("\n", " ").replace("\r", " ").replace(" ", " ") + def _normalize_timeout(self, timeout: int) -> aiohttp.ClientTimeout: + return aiohttp.ClientTimeout(total=max(timeout, 
MIN_WEB_SEARCH_TIMEOUT)) + + def _get_tavily_base_url(self, cfg: AstrBotConfig) -> str: + return normalize_web_search_base_url( + cfg.get("provider_settings", {}).get("websearch_tavily_base_url"), + default="https://api.tavily.com", + provider_name="Tavily", + ) + + def _get_exa_base_url(self, cfg: AstrBotConfig) -> str: + return normalize_web_search_base_url( + cfg.get("provider_settings", {}).get("websearch_exa_base_url"), + default="https://api.exa.ai", + provider_name="Exa", + ) + + def _add_active_tools( + self, tool_set, func_tool_mgr, tool_names: tuple[str, ...] + ) -> None: + for tool_name in tool_names: + tool = func_tool_mgr.get_func(tool_name) + if tool and tool.active: + tool_set.add_tool(tool) + async def _get_from_url(self, url: str, timeout: int = 30) -> str: """获取网页内容""" - if timeout < 30: - timeout = 30 - header = HEADERS - header.update({"User-Agent": random.choice(USER_AGENTS)}) + header = HEADERS.copy() + header["User-Agent"] = random.choice(USER_AGENTS) async with aiohttp.ClientSession(trust_env=True) as session: async with session.get( - url, headers=header, timeout=aiohttp.ClientTimeout(total=timeout) + url, + headers=header, + timeout=self._normalize_timeout(timeout), ) as response: html = await response.text(encoding="utf-8") doc = Document(html) @@ -145,9 +181,10 @@ async def _get_tavily_key(self, cfg: AstrBotConfig) -> str: """并发安全的从列表中获取并轮换Tavily API密钥。""" tavily_keys = cfg.get("provider_settings", {}).get("websearch_tavily_key", []) if not tavily_keys: - raise ValueError("错误:Tavily API密钥未在AstrBot中配置。") + raise ValueError("Error: Tavily API key is not configured in AstrBot.") async with self.tavily_key_lock: + self.tavily_key_index %= len(tavily_keys) key = tavily_keys[self.tavily_key_index] self.tavily_key_index = (self.tavily_key_index + 1) % len(tavily_keys) return key @@ -160,13 +197,7 @@ async def _web_search_tavily( ) -> list[SearchResult]: """使用 Tavily 搜索引擎进行搜索""" tavily_key = await self._get_tavily_key(cfg) - base_url = ( - 
cfg.get("provider_settings", {}) - .get("websearch_tavily_base_url", "https://api.tavily.com") - .rstrip("/") - ) - if timeout < 30: - timeout = 30 + base_url = self._get_tavily_base_url(cfg) url = f"{base_url}/search" header = { "Authorization": f"Bearer {tavily_key}", @@ -177,7 +208,7 @@ async def _web_search_tavily( url, json=payload, headers=header, - timeout=aiohttp.ClientTimeout(total=timeout), + timeout=self._normalize_timeout(timeout), ) as response: if response.status != 200: reason = await response.text() @@ -201,13 +232,7 @@ async def _extract_tavily( ) -> list[dict]: """使用 Tavily 提取网页内容""" tavily_key = await self._get_tavily_key(cfg) - base_url = ( - cfg.get("provider_settings", {}) - .get("websearch_tavily_base_url", "https://api.tavily.com") - .rstrip("/") - ) - if timeout < 30: - timeout = 30 + base_url = self._get_tavily_base_url(cfg) url = f"{base_url}/extract" header = { "Authorization": f"Bearer {tavily_key}", @@ -218,7 +243,7 @@ async def _extract_tavily( url, json=payload, headers=header, - timeout=aiohttp.ClientTimeout(total=timeout), + timeout=self._normalize_timeout(timeout), ) as response: if response.status != 200: reason = await response.text() @@ -243,7 +268,7 @@ async def search_from_search_engine( """搜索网络以回答用户的问题。当用户需要搜索网络以获取即时性的信息时调用此工具。 Args: - query(string): 和用户的问题最相关的搜索关键词,用于在 Google 上搜索。 + query(string): 和用户的问题最相关的搜索关键词,用于在搜索引擎上搜索。 max_results(number): 返回的最大搜索结果数量,默认为 5。 """ @@ -308,8 +333,6 @@ async def fetch_website_content( timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. """ - if timeout < 30: - timeout = 30 resp = await self._get_from_url(url, timeout=timeout) return resp @@ -342,8 +365,6 @@ async def search_from_tavily( timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. 
""" - if timeout < 30: - timeout = 30 logger.info(f"web_searcher - search_from_tavily: {query}") cfg = self.context.get_config(umo=event.unified_msg_origin) # websearch_link = cfg["provider_settings"].get("web_search_link", False) @@ -409,8 +430,6 @@ async def tavily_extract_web_page( timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. """ - if timeout < 30: - timeout = 30 cfg = self.context.get_config(umo=event.unified_msg_origin) if not cfg.get("provider_settings", {}).get("websearch_tavily_key", []): raise ValueError("Error: Tavily API key is not configured in AstrBot.") @@ -437,9 +456,10 @@ async def _get_bocha_key(self, cfg: AstrBotConfig) -> str: """并发安全的从列表中获取并轮换BoCha API密钥。""" bocha_keys = cfg.get("provider_settings", {}).get("websearch_bocha_key", []) if not bocha_keys: - raise ValueError("错误:BoCha API密钥未在AstrBot中配置。") + raise ValueError("Error: BoCha API key is not configured in AstrBot.") async with self.bocha_key_lock: + self.bocha_key_index %= len(bocha_keys) key = bocha_keys[self.bocha_key_index] self.bocha_key_index = (self.bocha_key_index + 1) % len(bocha_keys) return key @@ -452,8 +472,6 @@ async def _web_search_bocha( ) -> list[SearchResult]: """使用 BoCha 搜索引擎进行搜索""" bocha_key = await self._get_bocha_key(cfg) - if timeout < 30: - timeout = 30 url = "https://api.bochaai.com/v1/web-search" header = { "Authorization": f"Bearer {bocha_key}", @@ -464,7 +482,7 @@ async def _web_search_bocha( url, json=payload, headers=header, - timeout=aiohttp.ClientTimeout(total=timeout), + timeout=self._normalize_timeout(timeout), ) as response: if response.status != 200: reason = await response.text() @@ -545,8 +563,6 @@ async def search_from_bocha( timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. 
""" - if timeout < 30: - timeout = 30 logger.info(f"web_searcher - search_from_bocha: {query}") cfg = self.context.get_config(umo=event.unified_msg_origin) # websearch_link = cfg["provider_settings"].get("web_search_link", False) @@ -600,9 +616,10 @@ async def _get_exa_key(self, cfg: AstrBotConfig) -> str: """并发安全的从列表中获取并轮换 Exa API 密钥。""" exa_keys = cfg.get("provider_settings", {}).get("websearch_exa_key", []) if not exa_keys: - raise ValueError("错误:Exa API 密钥未在 AstrBot 中配置。") + raise ValueError("Error: Exa API key is not configured in AstrBot.") async with self.exa_key_lock: + self.exa_key_index %= len(exa_keys) key = exa_keys[self.exa_key_index] self.exa_key_index = (self.exa_key_index + 1) % len(exa_keys) return key @@ -615,11 +632,7 @@ async def _web_search_exa( ) -> list[SearchResult]: """使用 Exa 搜索引擎进行搜索""" exa_key = await self._get_exa_key(cfg) - base_url = ( - cfg.get("provider_settings", {}) - .get("websearch_exa_base_url", "https://api.exa.ai") - .rstrip("/") - ) + base_url = self._get_exa_base_url(cfg) url = f"{base_url}/search" header = { "x-api-key": exa_key, @@ -630,7 +643,7 @@ async def _web_search_exa( url, json=payload, headers=header, - timeout=aiohttp.ClientTimeout(total=timeout), + timeout=self._normalize_timeout(timeout), ) as response: if response.status != 200: reason = await response.text() @@ -670,8 +683,6 @@ async def search_from_exa( timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. 
""" - if timeout < 30: - timeout = 30 logger.info(f"web_searcher - search_from_exa: {query}") cfg = self.context.get_config(umo=event.unified_msg_origin) if not cfg.get("provider_settings", {}).get("websearch_exa_key", []): @@ -725,13 +736,7 @@ async def _extract_exa( ) -> list[dict]: """使用 Exa 提取网页内容""" exa_key = await self._get_exa_key(cfg) - base_url = ( - cfg.get("provider_settings", {}) - .get("websearch_exa_base_url", "https://api.exa.ai") - .rstrip("/") - ) - if timeout < 30: - timeout = 30 + base_url = self._get_exa_base_url(cfg) url = f"{base_url}/contents" header = { "x-api-key": exa_key, @@ -742,7 +747,7 @@ async def _extract_exa( url, json=payload, headers=header, - timeout=aiohttp.ClientTimeout(total=timeout), + timeout=self._normalize_timeout(timeout), ) as response: if response.status != 200: reason = await response.text() @@ -750,12 +755,7 @@ async def _extract_exa( f"Exa content extraction failed: {reason}, status: {response.status}", ) data = await response.json() - results: list[dict] = data.get("results", []) - if not results: - raise ValueError( - "Error: Exa content extraction does not return any results.", - ) - return results + return data.get("results", []) @llm_tool("exa_extract_web_page") async def exa_extract_web_page( @@ -772,8 +772,6 @@ async def exa_extract_web_page( timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. """ - if timeout < 30: - timeout = 30 cfg = self.context.get_config(umo=event.unified_msg_origin) if not cfg.get("provider_settings", {}).get("websearch_exa_key", []): raise ValueError("Error: Exa API key is not configured in AstrBot.") @@ -787,6 +785,8 @@ async def exa_extract_web_page( } results = await self._extract_exa(cfg, payload, timeout=timeout) + if not results: + return "Error: Exa content extraction does not return any results." 
ret_ls = [] for result in results: ret_ls.append(f"URL: {result.get('url', 'No URL')}") @@ -802,13 +802,7 @@ async def _find_similar_exa( ) -> list[SearchResult]: """使用 Exa 查找相似链接""" exa_key = await self._get_exa_key(cfg) - base_url = ( - cfg.get("provider_settings", {}) - .get("websearch_exa_base_url", "https://api.exa.ai") - .rstrip("/") - ) - if timeout < 30: - timeout = 30 + base_url = self._get_exa_base_url(cfg) url = f"{base_url}/findSimilar" header = { "x-api-key": exa_key, @@ -819,7 +813,7 @@ async def _find_similar_exa( url, json=payload, headers=header, - timeout=aiohttp.ClientTimeout(total=timeout), + timeout=self._normalize_timeout(timeout), ) as response: if response.status != 200: reason = await response.text() @@ -854,8 +848,6 @@ async def find_similar_links( timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. """ - if timeout < 30: - timeout = 30 logger.info(f"web_searcher - find_similar_links: {url}") cfg = self.context.get_config(umo=event.unified_msg_origin) if not cfg.get("provider_settings", {}).get("websearch_exa_key", []): @@ -912,81 +904,25 @@ async def edit_web_search_tools( return if not websearch_enable: - # pop tools - for tool_name in self.TOOLS: + for tool_name in self.MANAGED_TOOLS: tool_set.remove_tool(tool_name) return func_tool_mgr = self.context.get_llm_tool_manager() - if provider == "default": - web_search_t = func_tool_mgr.get_func("web_search") - fetch_url_t = func_tool_mgr.get_func("fetch_url") - if web_search_t and web_search_t.active: - tool_set.add_tool(web_search_t) - if fetch_url_t and fetch_url_t.active: - tool_set.add_tool(fetch_url_t) - tool_set.remove_tool("web_search_tavily") - tool_set.remove_tool("tavily_extract_web_page") - tool_set.remove_tool("AIsearch") - tool_set.remove_tool("web_search_bocha") - tool_set.remove_tool("web_search_exa") - tool_set.remove_tool("exa_extract_web_page") - tool_set.remove_tool("exa_find_similar") - elif provider == "tavily": - web_search_tavily = 
func_tool_mgr.get_func("web_search_tavily") - tavily_extract_web_page = func_tool_mgr.get_func("tavily_extract_web_page") - if web_search_tavily and web_search_tavily.active: - tool_set.add_tool(web_search_tavily) - if tavily_extract_web_page and tavily_extract_web_page.active: - tool_set.add_tool(tavily_extract_web_page) - tool_set.remove_tool("web_search") - tool_set.remove_tool("fetch_url") - tool_set.remove_tool("AIsearch") - tool_set.remove_tool("web_search_bocha") - tool_set.remove_tool("web_search_exa") - tool_set.remove_tool("exa_extract_web_page") - tool_set.remove_tool("exa_find_similar") - elif provider == "baidu_ai_search": + for tool_name in self.MANAGED_TOOLS: + tool_set.remove_tool(tool_name) + + if provider == "baidu_ai_search": try: await self.ensure_baidu_ai_search_mcp(event.unified_msg_origin) - aisearch_tool = func_tool_mgr.get_func("AIsearch") - if aisearch_tool and aisearch_tool.active: - tool_set.add_tool(aisearch_tool) - tool_set.remove_tool("web_search") - tool_set.remove_tool("fetch_url") - tool_set.remove_tool("web_search_tavily") - tool_set.remove_tool("tavily_extract_web_page") - tool_set.remove_tool("web_search_bocha") - tool_set.remove_tool("web_search_exa") - tool_set.remove_tool("exa_extract_web_page") - tool_set.remove_tool("exa_find_similar") + self._add_active_tools( + tool_set, + func_tool_mgr, + self.PROVIDER_TOOLS["baidu_ai_search"], + ) except Exception as e: logger.error(f"Cannot Initialize Baidu AI Search MCP Server: {e}") - elif provider == "bocha": - web_search_bocha = func_tool_mgr.get_func("web_search_bocha") - if web_search_bocha and web_search_bocha.active: - tool_set.add_tool(web_search_bocha) - tool_set.remove_tool("web_search") - tool_set.remove_tool("fetch_url") - tool_set.remove_tool("AIsearch") - tool_set.remove_tool("web_search_tavily") - tool_set.remove_tool("tavily_extract_web_page") - tool_set.remove_tool("web_search_exa") - tool_set.remove_tool("exa_extract_web_page") - 
tool_set.remove_tool("exa_find_similar") - elif provider == "exa": - web_search_exa = func_tool_mgr.get_func("web_search_exa") - exa_extract_web_page = func_tool_mgr.get_func("exa_extract_web_page") - exa_find_similar = func_tool_mgr.get_func("exa_find_similar") - if web_search_exa and web_search_exa.active: - tool_set.add_tool(web_search_exa) - if exa_extract_web_page and exa_extract_web_page.active: - tool_set.add_tool(exa_extract_web_page) - if exa_find_similar and exa_find_similar.active: - tool_set.add_tool(exa_find_similar) - tool_set.remove_tool("web_search") - tool_set.remove_tool("fetch_url") - tool_set.remove_tool("AIsearch") - tool_set.remove_tool("web_search_tavily") - tool_set.remove_tool("tavily_extract_web_page") - tool_set.remove_tool("web_search_bocha") + return + + tool_names = self.PROVIDER_TOOLS.get(provider, self.PROVIDER_TOOLS["default"]) + self._add_active_tools(tool_set, func_tool_mgr, tool_names) diff --git a/astrbot/core/astr_agent_hooks.py b/astrbot/core/astr_agent_hooks.py index 86f8a6c5b2..83c21a7462 100644 --- a/astrbot/core/astr_agent_hooks.py +++ b/astrbot/core/astr_agent_hooks.py @@ -9,6 +9,7 @@ from astrbot.core.astr_agent_context import AstrAgentContext from astrbot.core.pipeline.context_utils import call_event_hook from astrbot.core.star.star_handler import EventType +from astrbot.core.utils.web_search_utils import WEB_SEARCH_REFERENCE_TOOLS class MainAgentHooks(BaseAgentRunHooks[AstrAgentContext]): @@ -59,13 +60,7 @@ async def on_tool_end( platform_name = run_context.context.event.get_platform_name() if ( platform_name == "webchat" - and tool.name - in [ - "web_search_tavily", - "web_search_bocha", - "web_search_exa", - "exa_find_similar", - ] + and tool.name in WEB_SEARCH_REFERENCE_TOOLS and len(run_context.messages) > 0 and tool_result and len(tool_result.content) diff --git a/astrbot/core/knowledge_base/parsers/url_parser.py b/astrbot/core/knowledge_base/parsers/url_parser.py index d0c41cafa3..660e110fe6 100644 --- 
a/astrbot/core/knowledge_base/parsers/url_parser.py +++ b/astrbot/core/knowledge_base/parsers/url_parser.py @@ -2,6 +2,8 @@ import aiohttp +from astrbot.core.utils.web_search_utils import normalize_web_search_base_url + class URLExtractor: """URL 内容提取器,封装了 Tavily API 调用和密钥管理""" @@ -22,7 +24,11 @@ def __init__( self.tavily_keys = tavily_keys self.tavily_key_index = 0 self.tavily_key_lock = asyncio.Lock() - self.tavily_base_url = tavily_base_url.rstrip("/") + self.tavily_base_url = normalize_web_search_base_url( + tavily_base_url, + default="https://api.tavily.com", + provider_name="Tavily", + ) async def _get_tavily_key(self) -> str: """并发安全的从列表中获取并轮换Tavily API密钥。""" diff --git a/astrbot/core/utils/web_search_utils.py b/astrbot/core/utils/web_search_utils.py new file mode 100644 index 0000000000..acecaebce1 --- /dev/null +++ b/astrbot/core/utils/web_search_utils.py @@ -0,0 +1,64 @@ +import json +from typing import Any +from urllib.parse import urlparse + +WEB_SEARCH_REFERENCE_TOOLS = ( + "web_search_tavily", + "web_search_bocha", + "web_search_exa", + "exa_find_similar", +) + + +def normalize_web_search_base_url( + base_url: str | None, + *, + default: str, + provider_name: str, +) -> str: + normalized = (base_url or "").strip() + if not normalized: + normalized = default + normalized = normalized.rstrip("/") + + parsed = urlparse(normalized) + if parsed.scheme not in {"http", "https"} or not parsed.netloc: + raise ValueError( + f"Error: {provider_name} API Base URL must start with http:// or https://.", + ) + return normalized + + +def collect_web_search_results(accumulated_parts: list[dict[str, Any]]) -> dict: + web_search_results = {} + + for part in accumulated_parts: + if part.get("type") != "tool_call" or not part.get("tool_calls"): + continue + + for tool_call in part["tool_calls"]: + if tool_call.get( + "name" + ) not in WEB_SEARCH_REFERENCE_TOOLS or not tool_call.get("result"): + continue + + result = tool_call["result"] + try: + result_data = 
json.loads(result) if isinstance(result, str) else result + except json.JSONDecodeError: + continue + + if not isinstance(result_data, dict): + continue + + for item in result_data.get("results", []): + if not isinstance(item, dict): + continue + if idx := item.get("index"): + web_search_results[idx] = { + "url": item.get("url"), + "title": item.get("title"), + "snippet": item.get("snippet"), + } + + return web_search_results diff --git a/astrbot/dashboard/routes/chat.py b/astrbot/dashboard/routes/chat.py index 4c4fd0ce84..63b24f5927 100644 --- a/astrbot/dashboard/routes/chat.py +++ b/astrbot/dashboard/routes/chat.py @@ -23,6 +23,7 @@ from astrbot.core.utils.active_event_registry import active_event_registry from astrbot.core.utils.astrbot_path import get_astrbot_data_path from astrbot.core.utils.datetime_utils import to_utc_isoformat +from astrbot.core.utils.web_search_utils import collect_web_search_results from .route import Response, Route, RouteContext @@ -215,7 +216,7 @@ async def _create_attachment_from_file( def _extract_web_search_refs( self, accumulated_text: str, accumulated_parts: list ) -> dict: - """从消息中提取 web_search_tavily 的引用 + """从消息中提取网页搜索引用。 Args: accumulated_text: 累积的文本内容 @@ -224,38 +225,7 @@ def _extract_web_search_refs( Returns: 包含 used 列表的字典,记录被引用的搜索结果 """ - supported = [ - "web_search_tavily", - "web_search_bocha", - "web_search_exa", - "exa_find_similar", - ] - # 从 accumulated_parts 中找到所有 web_search_tavily 的工具调用结果 - web_search_results = {} - tool_call_parts = [ - p - for p in accumulated_parts - if p.get("type") == "tool_call" and p.get("tool_calls") - ] - - for part in tool_call_parts: - for tool_call in part["tool_calls"]: - if tool_call.get("name") not in supported or not tool_call.get( - "result" - ): - continue - try: - result_data = json.loads(tool_call["result"]) - for item in result_data.get("results", []): - if idx := item.get("index"): - web_search_results[idx] = { - "url": item.get("url"), - "title": item.get("title"), - 
"snippet": item.get("snippet"), - } - except (json.JSONDecodeError, KeyError): - pass - + web_search_results = collect_web_search_results(accumulated_parts) if not web_search_results: return {} diff --git a/astrbot/dashboard/routes/live_chat.py b/astrbot/dashboard/routes/live_chat.py index 25310cf61a..25edd10f34 100644 --- a/astrbot/dashboard/routes/live_chat.py +++ b/astrbot/dashboard/routes/live_chat.py @@ -22,6 +22,7 @@ from astrbot.core.platform.sources.webchat.webchat_queue_mgr import webchat_queue_mgr from astrbot.core.utils.astrbot_path import get_astrbot_data_path, get_astrbot_temp_path from astrbot.core.utils.datetime_utils import to_utc_isoformat +from astrbot.core.utils.web_search_utils import collect_web_search_results from .route import Route, RouteContext @@ -198,37 +199,7 @@ def _extract_web_search_refs( self, accumulated_text: str, accumulated_parts: list ) -> dict: """从消息中提取 web_search 引用。""" - supported = [ - "web_search_tavily", - "web_search_bocha", - "web_search_exa", - "exa_find_similar", - ] - web_search_results = {} - tool_call_parts = [ - p - for p in accumulated_parts - if p.get("type") == "tool_call" and p.get("tool_calls") - ] - - for part in tool_call_parts: - for tool_call in part["tool_calls"]: - if tool_call.get("name") not in supported or not tool_call.get( - "result" - ): - continue - try: - result_data = json.loads(tool_call["result"]) - for item in result_data.get("results", []): - if idx := item.get("index"): - web_search_results[idx] = { - "url": item.get("url"), - "title": item.get("title"), - "snippet": item.get("snippet"), - } - except (json.JSONDecodeError, KeyError): - pass - + web_search_results = collect_web_search_results(accumulated_parts) if not web_search_results: return {} diff --git a/dashboard/src/components/chat/MessageList.vue b/dashboard/src/components/chat/MessageList.vue index 63c8fabb2c..128e75af0c 100644 --- a/dashboard/src/components/chat/MessageList.vue +++ b/dashboard/src/components/chat/MessageList.vue 
@@ -200,6 +200,13 @@ setCustomComponents('message-list', { code_block: MarkdownCodeBlockNode }); +const WEB_SEARCH_REFERENCE_TOOLS = Object.freeze([ + 'web_search_tavily', + 'web_search_bocha', + 'web_search_exa', + 'exa_find_similar' +]); + export default { name: 'MessageList', components: { @@ -303,8 +310,7 @@ export default { part.tool_calls.forEach(toolCall => { // 检查是否是网页搜索工具调用 - const supportedTools = ['web_search_tavily', 'web_search_bocha', 'web_search_exa', 'exa_find_similar']; - if (!supportedTools.includes(toolCall.name) || !toolCall.result) { + if (!WEB_SEARCH_REFERENCE_TOOLS.includes(toolCall.name) || !toolCall.result) { return; } diff --git a/docs/en/use/websearch.md b/docs/en/use/websearch.md index b030c48756..35a5e70852 100644 --- a/docs/en/use/websearch.md +++ b/docs/en/use/websearch.md @@ -1,7 +1,7 @@ # Web Search -The web search feature aims to provide large language models with the ability to invoke search engines like Google, Bing, and Sogou to obtain recent world information, which can improve the accuracy of model responses and reduce hallucinations to some extent. +The web search feature aims to provide large language models with the ability to invoke search engines like Bing and Sogou to obtain recent world information, which can improve the accuracy of model responses and reduce hallucinations to some extent. AstrBot's built-in web search functionality relies on the large language model's `function calling` capability. If you're not familiar with function calling, please refer to: [Function Calling](/use/websearch). @@ -16,7 +16,7 @@ And other prompts with search intent to trigger the model to invoke the search t AstrBot supports 5 types of web search source integration: `default`, `Tavily`, `Baidu AI Search`, `BoCha`, and `Exa`. -The former uses AstrBot's built-in web search requester to query Google, Bing, and Sogou search engines, performing best in network environments with Google access. **We recommend using Tavily or Exa**. 
+The former uses AstrBot's built-in web search requester to query Bing and Sogou search engines. **We recommend using Tavily or Exa**. ![image](https://files.astrbot.app/docs/source/images/websearch/image.png) diff --git a/docs/zh/use/websearch.md b/docs/zh/use/websearch.md index 96a569aa70..613d697e1a 100644 --- a/docs/zh/use/websearch.md +++ b/docs/zh/use/websearch.md @@ -1,6 +1,6 @@ # 网页搜索 -网页搜索功能旨在提供大模型调用 Google,Bing,搜狗等搜索引擎以获取世界最近信息的能力,一定程度上能够提高大模型的回复准确度,减少幻觉。 +网页搜索功能旨在提供大模型调用 Bing、搜狗等搜索引擎以获取世界最近信息的能力,一定程度上能够提高大模型的回复准确度,减少幻觉。 AstrBot 内置的网页搜索功能依赖大模型提供 `函数调用` 能力。如果你不了解函数调用,请参考:[函数调用](/use/websearch)。 @@ -15,7 +15,7 @@ AstrBot 内置的网页搜索功能依赖大模型提供 `函数调用` 能力 AstrBot 支持 5 种网页搜索源接入方式:`默认`、`Tavily`、`百度 AI 搜索`、`BoCha`、`Exa`。 -前者使用 AstrBot 内置的网页搜索请求器请求 Google、Bing、搜狗搜索引擎,在能够使用 Google 的网络环境下表现最佳。**我们推荐使用 Tavily 或 Exa**。 +前者使用 AstrBot 内置的网页搜索请求器请求 Bing、搜狗搜索引擎。**我们推荐使用 Tavily 或 Exa**。 ![image](https://files.astrbot.app/docs/source/images/websearch/image.png) @@ -29,7 +29,7 @@ AstrBot 支持 5 种网页搜索源接入方式:`默认`、`Tavily`、`百度 #### 网页搜索(web_search) -使用 Google、Bing、搜狗等搜索引擎进行搜索。 +使用 Bing、搜狗等搜索引擎进行搜索。 | 参数 | 类型 | 必填 | 说明 | |------|------|------|------| @@ -168,4 +168,4 @@ Exa 独有的功能,根据给定的 URL 查找语义相似的网页。适合 | `include` | string | 否 | 限定搜索域名,多个域名用 `\|` 或 `,` 分隔,最多 100 个。示例:`qq.com` 或 `qq.com\|m.163.com` | | `exclude` | string | 否 | 排除搜索域名,多个域名用 `\|` 或 `,` 分隔,最多 100 个。示例:`qq.com` 或 `qq.com\|m.163.com` | | `count` | number | 否 | 返回的搜索结果数量,范围 1-50,默认 10(实际返回数量可能少于指定值) | -| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | \ No newline at end of file +| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | From 22e2c8bcf9d70b13900689685f019c393eeb398c Mon Sep 17 00:00:00 2001 From: piexian <64474352+piexian@users.noreply.github.com> Date: Sun, 5 Apr 2026 02:24:01 +0800 Subject: [PATCH 4/6] =?UTF-8?q?fix(websearch):=20=E7=BB=9F=E4=B8=80?= =?UTF-8?q?=E5=89=8D=E5=90=8E=E7=AB=AF=E7=BD=91=E9=A1=B5=E6=90=9C=E7=B4=A2?= =?UTF-8?q?=E5=BC=95=E7=94=A8=E6=8F=90=E5=8F=96=E9=80=BB=E8=BE=91=EF=BC=8C?= 
=?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=89=8D=E7=AB=AF=20refs=20=E9=99=8D?= =?UTF-8?q?=E7=BA=A7=E8=8E=B7=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 重构 web_search_utils.py 为分层结构,新增 build_web_search_refs() 和 _extract_ref_indices() 支持从 标签提取引用索引 - 简化 chat.py/live_chat.py 中 ref 提取为调用 build_web_search_refs() - MessageList.vue 新增 getMessageRefs() 在后端未返回 refs 时前端自行降级提取 - 修复 chat.py 中消息保存条件判断逻辑 --- astrbot/core/utils/web_search_utils.py | 88 +++++++++++-- astrbot/dashboard/routes/chat.py | 62 ++++----- astrbot/dashboard/routes/live_chat.py | 27 ++-- dashboard/src/components/chat/MessageList.vue | 118 ++++++++++++------ tests/unit/test_web_search_utils.py | 90 +++++++++++++ 5 files changed, 282 insertions(+), 103 deletions(-) create mode 100644 tests/unit/test_web_search_utils.py diff --git a/astrbot/core/utils/web_search_utils.py b/astrbot/core/utils/web_search_utils.py index acecaebce1..0052440152 100644 --- a/astrbot/core/utils/web_search_utils.py +++ b/astrbot/core/utils/web_search_utils.py @@ -1,4 +1,5 @@ import json +import re from typing import Any from urllib.parse import urlparse @@ -29,9 +30,9 @@ def normalize_web_search_base_url( return normalized -def collect_web_search_results(accumulated_parts: list[dict[str, Any]]) -> dict: - web_search_results = {} - +def _iter_web_search_result_items( + accumulated_parts: list[dict[str, Any]], +): for part in accumulated_parts: if part.get("type") != "tool_call" or not part.get("tool_calls"): continue @@ -52,13 +53,78 @@ def collect_web_search_results(accumulated_parts: list[dict[str, Any]]) -> dict: continue for item in result_data.get("results", []): - if not isinstance(item, dict): - continue - if idx := item.get("index"): - web_search_results[idx] = { - "url": item.get("url"), - "title": item.get("title"), - "snippet": item.get("snippet"), - } + if isinstance(item, dict): + yield item + + +def _extract_ref_indices(accumulated_text: str) -> list[str]: + ref_indices: 
list[str] = [] + seen_indices: set[str] = set() + + for match in re.finditer(r"(.*?)", accumulated_text): + ref_index = match.group(1).strip() + if not ref_index or ref_index in seen_indices: + continue + ref_indices.append(ref_index) + seen_indices.add(ref_index) + + return ref_indices + + +def collect_web_search_ref_items( + accumulated_parts: list[dict[str, Any]], + favicon_cache: dict[str, str] | None = None, +) -> list[dict[str, Any]]: + web_search_refs: list[dict[str, Any]] = [] + seen_indices: set[str] = set() + + for item in _iter_web_search_result_items(accumulated_parts): + ref_index = item.get("index") + if not ref_index or ref_index in seen_indices: + continue + + payload = { + "index": ref_index, + "url": item.get("url"), + "title": item.get("title"), + "snippet": item.get("snippet"), + } + if favicon_cache and payload["url"] in favicon_cache: + payload["favicon"] = favicon_cache[payload["url"]] + + web_search_refs.append(payload) + seen_indices.add(ref_index) + + return web_search_refs + + +def build_web_search_refs( + accumulated_text: str, + accumulated_parts: list[dict[str, Any]], + favicon_cache: dict[str, str] | None = None, +) -> dict: + ordered_refs = collect_web_search_ref_items(accumulated_parts, favicon_cache) + if not ordered_refs: + return {} + + refs_by_index = {ref["index"]: ref for ref in ordered_refs} + ref_indices = _extract_ref_indices(accumulated_text) + used_refs = [refs_by_index[idx] for idx in ref_indices if idx in refs_by_index] + + if not used_refs: + used_refs = ordered_refs + + return {"used": used_refs} + + +def collect_web_search_results(accumulated_parts: list[dict[str, Any]]) -> dict: + web_search_results = {} + + for ref in collect_web_search_ref_items(accumulated_parts): + web_search_results[ref["index"]] = { + "url": ref.get("url"), + "title": ref.get("title"), + "snippet": ref.get("snippet"), + } return web_search_results diff --git a/astrbot/dashboard/routes/chat.py b/astrbot/dashboard/routes/chat.py index 
63b24f5927..53e2fe1beb 100644 --- a/astrbot/dashboard/routes/chat.py +++ b/astrbot/dashboard/routes/chat.py @@ -1,7 +1,6 @@ import asyncio import json import os -import re import uuid from contextlib import asynccontextmanager from typing import cast @@ -23,7 +22,7 @@ from astrbot.core.utils.active_event_registry import active_event_registry from astrbot.core.utils.astrbot_path import get_astrbot_data_path from astrbot.core.utils.datetime_utils import to_utc_isoformat -from astrbot.core.utils.web_search_utils import collect_web_search_results +from astrbot.core.utils.web_search_utils import build_web_search_refs from .route import Response, Route, RouteContext @@ -216,35 +215,13 @@ async def _create_attachment_from_file( def _extract_web_search_refs( self, accumulated_text: str, accumulated_parts: list ) -> dict: - """从消息中提取网页搜索引用。 - - Args: - accumulated_text: 累积的文本内容 - accumulated_parts: 累积的消息部分列表 - - Returns: - 包含 used 列表的字典,记录被引用的搜索结果 - """ - web_search_results = collect_web_search_results(accumulated_parts) - if not web_search_results: - return {} - - # 从文本中提取所有 xxx 标签并去重 - ref_indices = { - m.strip() for m in re.findall(r"(.*?)", accumulated_text) - } - - # 构建被引用的结果列表 - used_refs = [] - for ref_index in ref_indices: - if ref_index not in web_search_results: - continue - payload = {"index": ref_index, **web_search_results[ref_index]} - if favicon := sp.temporary_cache.get("_ws_favicon", {}).get(payload["url"]): - payload["favicon"] = favicon - used_refs.append(payload) - - return {"used": used_refs} if used_refs else {} + """从消息中提取网页搜索引用。""" + favicon_cache = sp.temporary_cache.get("_ws_favicon", {}) + return build_web_search_refs( + accumulated_text, + accumulated_parts, + favicon_cache, + ) async def _save_bot_message( self, @@ -446,19 +423,27 @@ async def stream(): accumulated_parts.append(part) # 消息结束处理 + should_save = False if msg_type == "end": - break + should_save = bool( + accumulated_parts + or accumulated_text + or accumulated_reasoning + or refs + 
or agent_stats + ) elif ( (streaming and msg_type == "complete") or not streaming # or msg_type == "break" ): - if ( - chain_type == "tool_call" - or chain_type == "tool_call_result" + if chain_type not in ( + "tool_call", + "tool_call_result", + "agent_stats", ): - continue + should_save = True - # 提取 web_search_tavily 引用 + if should_save: try: refs = self._extract_web_search_refs( accumulated_text, @@ -499,6 +484,9 @@ async def stream(): # tool_calls = {} agent_stats = {} refs = {} + + if msg_type == "end": + break except BaseException as e: logger.exception(f"WebChat stream unexpected error: {e}", exc_info=True) finally: diff --git a/astrbot/dashboard/routes/live_chat.py b/astrbot/dashboard/routes/live_chat.py index 25edd10f34..b68a02c20e 100644 --- a/astrbot/dashboard/routes/live_chat.py +++ b/astrbot/dashboard/routes/live_chat.py @@ -1,7 +1,6 @@ import asyncio import json import os -import re import time import uuid import wave @@ -22,7 +21,7 @@ from astrbot.core.platform.sources.webchat.webchat_queue_mgr import webchat_queue_mgr from astrbot.core.utils.astrbot_path import get_astrbot_data_path, get_astrbot_temp_path from astrbot.core.utils.datetime_utils import to_utc_isoformat -from astrbot.core.utils.web_search_utils import collect_web_search_results +from astrbot.core.utils.web_search_utils import build_web_search_refs from .route import Route, RouteContext @@ -199,24 +198,12 @@ def _extract_web_search_refs( self, accumulated_text: str, accumulated_parts: list ) -> dict: """从消息中提取 web_search 引用。""" - web_search_results = collect_web_search_results(accumulated_parts) - if not web_search_results: - return {} - - ref_indices = { - m.strip() for m in re.findall(r"(.*?)", accumulated_text) - } - - used_refs = [] - for ref_index in ref_indices: - if ref_index not in web_search_results: - continue - payload = {"index": ref_index, **web_search_results[ref_index]} - if favicon := sp.temporary_cache.get("_ws_favicon", {}).get(payload["url"]): - payload["favicon"] = 
favicon - used_refs.append(payload) - - return {"used": used_refs} if used_refs else {} + favicon_cache = sp.temporary_cache.get("_ws_favicon", {}) + return build_web_search_refs( + accumulated_text, + accumulated_parts, + favicon_cache, + ) async def _save_bot_message( self, diff --git a/dashboard/src/components/chat/MessageList.vue b/dashboard/src/components/chat/MessageList.vue index 128e75af0c..4030be61a0 100644 --- a/dashboard/src/components/chat/MessageList.vue +++ b/dashboard/src/components/chat/MessageList.vue @@ -149,7 +149,7 @@ @click="$emit('replyMessage', msg, index)" :title="tm('actions.reply')" /> - + @@ -294,7 +294,81 @@ export default { this.extractWebSearchResults(); }, methods: { - // 从消息中提取 web_search_tavily 的搜索结果 + extractRefsFromToolCall(toolCall) { + if (!WEB_SEARCH_REFERENCE_TOOLS.includes(toolCall?.name) || !toolCall.result) { + return []; + } + + try { + const resultData = typeof toolCall.result === 'string' + ? JSON.parse(toolCall.result) + : toolCall.result; + + if (!resultData?.results || !Array.isArray(resultData.results)) { + return []; + } + + const refs = []; + const seenIndices = new Set(); + + resultData.results.forEach(item => { + if (!item?.index || seenIndices.has(item.index)) { + return; + } + + refs.push({ + index: item.index, + url: item.url, + title: item.title, + snippet: item.snippet + }); + seenIndices.add(item.index); + }); + + return refs; + } catch (e) { + console.error('Failed to parse web search result:', e); + return []; + } + }, + + collectMessageWebSearchRefs(messageParts) { + if (!Array.isArray(messageParts)) { + return []; + } + + const refs = []; + const seenIndices = new Set(); + + messageParts.forEach(part => { + if (part.type !== 'tool_call' || !Array.isArray(part.tool_calls)) { + return; + } + + part.tool_calls.forEach(toolCall => { + this.extractRefsFromToolCall(toolCall).forEach(ref => { + if (seenIndices.has(ref.index)) { + return; + } + refs.push(ref); + seenIndices.add(ref.index); + }); + }); + }); + + 
return refs; + }, + + getMessageRefs(content) { + if (content?.refs?.used?.length) { + return content.refs; + } + + const fallbackRefs = this.collectMessageWebSearchRefs(content?.message); + return fallbackRefs.length ? { used: fallbackRefs } : null; + }, + + // 从消息中提取网页搜索结果映射 extractWebSearchResults() { const results = {}; @@ -302,39 +376,13 @@ export default { if (msg.content.type !== 'bot' || !Array.isArray(msg.content.message)) { return; } - - msg.content.message.forEach(part => { - if (part.type !== 'tool_call' || !Array.isArray(part.tool_calls)) { - return; - } - - part.tool_calls.forEach(toolCall => { - // 检查是否是网页搜索工具调用 - if (!WEB_SEARCH_REFERENCE_TOOLS.includes(toolCall.name) || !toolCall.result) { - return; - } - - try { - // 解析工具调用结果 - const resultData = typeof toolCall.result === 'string' - ? JSON.parse(toolCall.result) - : toolCall.result; - - if (resultData.results && Array.isArray(resultData.results)) { - resultData.results.forEach(item => { - if (item.index) { - results[item.index] = { - url: item.url, - title: item.title, - snippet: item.snippet - }; - } - }); - } - } catch (e) { - console.error('Failed to parse web search result:', e); - } - }); + + this.collectMessageWebSearchRefs(msg.content.message).forEach(ref => { + results[ref.index] = { + url: ref.url, + title: ref.title, + snippet: ref.snippet + }; }); }); diff --git a/tests/unit/test_web_search_utils.py b/tests/unit/test_web_search_utils.py new file mode 100644 index 0000000000..7e32bbc7cc --- /dev/null +++ b/tests/unit/test_web_search_utils.py @@ -0,0 +1,90 @@ +import json + +from astrbot.core.utils.web_search_utils import ( + build_web_search_refs, + collect_web_search_ref_items, + collect_web_search_results, +) + + +def _make_web_search_parts() -> list[dict]: + return [ + { + "type": "tool_call", + "tool_calls": [ + { + "name": "web_search_exa", + "result": json.dumps( + { + "results": [ + { + "index": "a152.1", + "url": "https://example.com/1", + "title": "Example 1", + "snippet": 
"Snippet 1", + }, + { + "index": "a152.2", + "url": "https://example.com/2", + "title": "Example 2", + "snippet": "Snippet 2", + }, + ] + } + ), + } + ], + } + ] + + +def test_collect_web_search_results_builds_index_mapping(): + results = collect_web_search_results(_make_web_search_parts()) + + assert results == { + "a152.1": { + "url": "https://example.com/1", + "title": "Example 1", + "snippet": "Snippet 1", + }, + "a152.2": { + "url": "https://example.com/2", + "title": "Example 2", + "snippet": "Snippet 2", + }, + } + + +def test_collect_web_search_ref_items_preserves_order_and_favicon(): + refs = collect_web_search_ref_items( + _make_web_search_parts(), + {"https://example.com/2": "https://example.com/favicon.ico"}, + ) + + assert [ref["index"] for ref in refs] == ["a152.1", "a152.2"] + assert "favicon" not in refs[0] + assert refs[1]["favicon"] == "https://example.com/favicon.ico" + + +def test_build_web_search_refs_uses_explicit_ref_indices_in_text_order(): + refs = build_web_search_refs( + "Second a152.2 first a152.1", + _make_web_search_parts(), + ) + + assert [ref["index"] for ref in refs["used"]] == ["a152.2", "a152.1"] + + +def test_build_web_search_refs_falls_back_to_all_results_without_refs(): + refs = build_web_search_refs("No explicit refs here.", _make_web_search_parts()) + + assert [ref["index"] for ref in refs["used"]] == ["a152.1", "a152.2"] + + +def test_build_web_search_refs_ignores_tool_call_id_and_falls_back(): + refs = build_web_search_refs( + "call_a73499ddbaf845dba8310e44", + _make_web_search_parts(), + ) + + assert [ref["index"] for ref in refs["used"]] == ["a152.1", "a152.2"] From 370167fb397d6ff0e3bd4ee4f9ee4577c5098952 Mon Sep 17 00:00:00 2001 From: piexian <64474352+piexian@users.noreply.github.com> Date: Mon, 6 Apr 2026 21:20:00 +0800 Subject: [PATCH 5/6] =?UTF-8?q?fix(websearch):=20=E4=BF=AE=E5=A4=8D=20UUID?= =?UTF-8?q?=20=E7=94=9F=E6=88=90=E9=80=BB=E8=BE=91=EF=BC=8C=E7=A1=AE?= 
=?UTF-8?q?=E4=BF=9D=E5=94=AF=E4=B8=80=E6=80=A7=EF=BC=9B=E6=9B=B4=E6=96=B0?= =?UTF-8?q?=20API=20Base=20URL=20=E9=94=99=E8=AF=AF=E6=8F=90=E7=A4=BA?= =?UTF-8?q?=E4=BF=A1=E6=81=AF=EF=BC=9B=E6=96=B0=E5=A2=9E=E6=B6=88=E6=81=AF?= =?UTF-8?q?=E5=BC=95=E7=94=A8=E7=BC=93=E5=AD=98=E6=9C=BA=E5=88=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- astrbot/builtin_stars/web_searcher/main.py | 10 +-- astrbot/core/utils/web_search_utils.py | 4 +- dashboard/src/components/chat/MessageList.vue | 67 +++++++++++++++++-- tests/unit/test_web_search_utils.py | 18 +++++ 4 files changed, 89 insertions(+), 10 deletions(-) diff --git a/astrbot/builtin_stars/web_searcher/main.py b/astrbot/builtin_stars/web_searcher/main.py index e0044969b6..f5972fd141 100644 --- a/astrbot/builtin_stars/web_searcher/main.py +++ b/astrbot/builtin_stars/web_searcher/main.py @@ -396,7 +396,7 @@ async def search_from_tavily( return "Error: Tavily web searcher does not return any results." ret_ls = [] - ref_uuid = str(uuid.uuid4())[:4] + ref_uuid = str(uuid.uuid4()) for idx, result in enumerate(results, 1): index = f"{ref_uuid}.{idx}" ret_ls.append( @@ -425,7 +425,7 @@ async def tavily_extract_web_page( """Extract the content of a web page using Tavily. Args: - url(string): Required. An URl to extract content from. + url(string): Required. A URL to extract content from. extract_depth(string): Optional. The depth of the extraction, must be one of 'basic', 'advanced'. Default is "basic". timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. @@ -595,7 +595,7 @@ async def search_from_bocha( return "Error: BoCha web searcher does not return any results." ret_ls = [] - ref_uuid = str(uuid.uuid4())[:4] + ref_uuid = str(uuid.uuid4()) for idx, result in enumerate(results, 1): index = f"{ref_uuid}.{idx}" ret_ls.append( @@ -717,7 +717,7 @@ async def search_from_exa( return "Error: Exa web searcher does not return any results." 
ret_ls = [] - ref_uuid = str(uuid.uuid4())[:4] + ref_uuid = str(uuid.uuid4()) for idx, result in enumerate(results, 1): index = f"{ref_uuid}.{idx}" ret_ls.append( @@ -869,7 +869,7 @@ async def find_similar_links( return "Error: Exa find similar does not return any results." ret_ls = [] - ref_uuid = str(uuid.uuid4())[:4] + ref_uuid = str(uuid.uuid4()) for idx, result in enumerate(results, 1): index = f"{ref_uuid}.{idx}" ret_ls.append( diff --git a/astrbot/core/utils/web_search_utils.py b/astrbot/core/utils/web_search_utils.py index 0052440152..4c00d48f0f 100644 --- a/astrbot/core/utils/web_search_utils.py +++ b/astrbot/core/utils/web_search_utils.py @@ -25,7 +25,9 @@ def normalize_web_search_base_url( parsed = urlparse(normalized) if parsed.scheme not in {"http", "https"} or not parsed.netloc: raise ValueError( - f"Error: {provider_name} API Base URL must start with http:// or https://.", + f"Error: {provider_name} API Base URL must be a base host URL starting " + f"with http:// or https:// (for example, {default}), not a full endpoint " + f"path. 
Received: {normalized!r}.", ) return normalized diff --git a/dashboard/src/components/chat/MessageList.vue b/dashboard/src/components/chat/MessageList.vue index 4030be61a0..b67f35bb3f 100644 --- a/dashboard/src/components/chat/MessageList.vue +++ b/dashboard/src/components/chat/MessageList.vue @@ -274,7 +274,8 @@ export default { url: '' }, // Web search results mapping: { 'uuid.idx': { url, title, snippet } } - webSearchResults: {} + webSearchResults: {}, + messageRefsCache: new WeakMap() }; }, async mounted() { @@ -359,13 +360,66 @@ export default { return refs; }, + buildMessageRefsCacheKey(messageParts) { + if (!Array.isArray(messageParts)) { + return ''; + } + + const cacheParts = []; + + messageParts.forEach(part => { + if (part.type !== 'tool_call' || !Array.isArray(part.tool_calls)) { + return; + } + + part.tool_calls.forEach(toolCall => { + if (!WEB_SEARCH_REFERENCE_TOOLS.includes(toolCall?.name) || !toolCall.result) { + return; + } + + const rawResult = typeof toolCall.result === 'string' + ? toolCall.result + : JSON.stringify(toolCall.result); + + cacheParts.push(`${toolCall.id || toolCall.name}:${rawResult}`); + }); + }); + + return cacheParts.join('||'); + }, + + getCachedMessageRefs(content) { + if (!content || typeof content !== 'object') { + return null; + } + + const cacheKey = this.buildMessageRefsCacheKey(content.message); + if (!cacheKey) { + return null; + } + + const cachedEntry = this.messageRefsCache.get(content); + if (cachedEntry?.key === cacheKey) { + return cachedEntry.refs; + } + + const refs = this.collectMessageWebSearchRefs(content.message); + const normalizedRefs = refs.length ? { used: refs } : null; + + this.messageRefsCache.set(content, { + key: cacheKey, + refs: normalizedRefs + }); + + return normalizedRefs; + }, + getMessageRefs(content) { if (content?.refs?.used?.length) { return content.refs; } - const fallbackRefs = this.collectMessageWebSearchRefs(content?.message); - return fallbackRefs.length ? 
{ used: fallbackRefs } : null; + return this.getCachedMessageRefs(content); }, // 从消息中提取网页搜索结果映射 @@ -377,7 +431,12 @@ export default { return; } - this.collectMessageWebSearchRefs(msg.content.message).forEach(ref => { + const refs = this.getMessageRefs(msg.content); + if (!refs?.used?.length) { + return; + } + + refs.used.forEach(ref => { results[ref.index] = { url: ref.url, title: ref.title, diff --git a/tests/unit/test_web_search_utils.py b/tests/unit/test_web_search_utils.py index 7e32bbc7cc..fc86f7c468 100644 --- a/tests/unit/test_web_search_utils.py +++ b/tests/unit/test_web_search_utils.py @@ -1,9 +1,12 @@ import json +import pytest + from astrbot.core.utils.web_search_utils import ( build_web_search_refs, collect_web_search_ref_items, collect_web_search_results, + normalize_web_search_base_url, ) @@ -88,3 +91,18 @@ def test_build_web_search_refs_ignores_tool_call_id_and_falls_back(): ) assert [ref["index"] for ref in refs["used"]] == ["a152.1", "a152.2"] + + +def test_normalize_web_search_base_url_reports_invalid_value(): + with pytest.raises(ValueError) as exc_info: + normalize_web_search_base_url( + "exa.ai/search", + default="https://api.exa.ai", + provider_name="Exa", + ) + + assert str(exc_info.value) == ( + "Error: Exa API Base URL must be a base host URL starting with " + "http:// or https:// (for example, https://api.exa.ai), not a full " + "endpoint path. Received: 'exa.ai/search'." 
+ ) From 96e15f79ad4dc723b459b27358e77345b2b7ec2a Mon Sep 17 00:00:00 2001 From: piexian <64474352+piexian@users.noreply.github.com> Date: Mon, 6 Apr 2026 21:51:49 +0800 Subject: [PATCH 6/6] =?UTF-8?q?fix(websearch):=20=E6=94=BE=E5=AE=BD=20API?= =?UTF-8?q?=20Base=20URL=20=E6=A0=A1=E9=AA=8C=EF=BC=8C=E5=A2=9E=E5=BC=BA?= =?UTF-8?q?=20Tavily/Exa=20=E8=AF=B7=E6=B1=82=E6=8A=A5=E9=94=99=E6=8F=90?= =?UTF-8?q?=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- astrbot/builtin_stars/web_searcher/main.py | 49 +++++++++++++++++-- .../core/knowledge_base/parsers/url_parser.py | 5 +- astrbot/core/utils/web_search_utils.py | 5 +- tests/unit/test_web_search_utils.py | 40 ++++++++++++--- 4 files changed, 84 insertions(+), 15 deletions(-) diff --git a/astrbot/builtin_stars/web_searcher/main.py b/astrbot/builtin_stars/web_searcher/main.py index f5972fd141..82a38de04e 100644 --- a/astrbot/builtin_stars/web_searcher/main.py +++ b/astrbot/builtin_stars/web_searcher/main.py @@ -107,6 +107,15 @@ def _get_exa_base_url(self, cfg: AstrBotConfig) -> str: provider_name="Exa", ) + def _format_provider_request_error( + self, provider_name: str, action: str, url: str, reason: str, status: int + ) -> str: + return ( + f"{provider_name} {action} failed for URL {url}: {reason}, status: {status}. " + "If you configured an API Base URL, make sure it is a base URL or proxy " + "prefix rather than a specific endpoint path." + ) + def _add_active_tools( self, tool_set, func_tool_mgr, tool_names: tuple[str, ...] 
) -> None: @@ -213,7 +222,13 @@ async def _web_search_tavily( if response.status != 200: reason = await response.text() raise Exception( - f"Tavily web search failed: {reason}, status: {response.status}", + self._format_provider_request_error( + "Tavily", + "web search", + url, + reason, + response.status, + ), ) data = await response.json() results = [] @@ -248,7 +263,13 @@ async def _extract_tavily( if response.status != 200: reason = await response.text() raise Exception( - f"Tavily web search failed: {reason}, status: {response.status}", + self._format_provider_request_error( + "Tavily", + "content extraction", + url, + reason, + response.status, + ), ) data = await response.json() results: list[dict] = data.get("results", []) @@ -648,7 +669,13 @@ async def _web_search_exa( if response.status != 200: reason = await response.text() raise Exception( - f"Exa web search failed: {reason}, status: {response.status}", + self._format_provider_request_error( + "Exa", + "web search", + url, + reason, + response.status, + ), ) data = await response.json() results = [] @@ -752,7 +779,13 @@ async def _extract_exa( if response.status != 200: reason = await response.text() raise Exception( - f"Exa content extraction failed: {reason}, status: {response.status}", + self._format_provider_request_error( + "Exa", + "content extraction", + url, + reason, + response.status, + ), ) data = await response.json() return data.get("results", []) @@ -818,7 +851,13 @@ async def _find_similar_exa( if response.status != 200: reason = await response.text() raise Exception( - f"Exa find similar failed: {reason}, status: {response.status}", + self._format_provider_request_error( + "Exa", + "find similar", + url, + reason, + response.status, + ), ) data = await response.json() results = [] diff --git a/astrbot/core/knowledge_base/parsers/url_parser.py b/astrbot/core/knowledge_base/parsers/url_parser.py index 660e110fe6..09a226f572 100644 --- a/astrbot/core/knowledge_base/parsers/url_parser.py +++ 
b/astrbot/core/knowledge_base/parsers/url_parser.py @@ -79,7 +79,10 @@ async def extract_text_from_url(self, url: str) -> str: if response.status != 200: reason = await response.text() raise OSError( - f"Tavily web extraction failed: {reason}, status: {response.status}" + f"Tavily web extraction failed for URL {api_url}: " + f"{reason}, status: {response.status}. If you configured " + "a Tavily API Base URL, make sure it is a base URL or " + "proxy prefix rather than a specific endpoint path." ) data = await response.json() diff --git a/astrbot/core/utils/web_search_utils.py b/astrbot/core/utils/web_search_utils.py index 4c00d48f0f..701852449b 100644 --- a/astrbot/core/utils/web_search_utils.py +++ b/astrbot/core/utils/web_search_utils.py @@ -25,9 +25,8 @@ def normalize_web_search_base_url( parsed = urlparse(normalized) if parsed.scheme not in {"http", "https"} or not parsed.netloc: raise ValueError( - f"Error: {provider_name} API Base URL must be a base host URL starting " - f"with http:// or https:// (for example, {default}), not a full endpoint " - f"path. Received: {normalized!r}.", + f"Error: {provider_name} API Base URL must start with http:// or " + f"https://. Proxy base paths are allowed. Received: {normalized!r}.", ) return normalized diff --git a/tests/unit/test_web_search_utils.py b/tests/unit/test_web_search_utils.py index fc86f7c468..b718ac3f31 100644 --- a/tests/unit/test_web_search_utils.py +++ b/tests/unit/test_web_search_utils.py @@ -93,16 +93,44 @@ def test_build_web_search_refs_ignores_tool_call_id_and_falls_back(): assert [ref["index"] for ref in refs["used"]] == ["a152.1", "a152.2"] -def test_normalize_web_search_base_url_reports_invalid_value(): +@pytest.mark.parametrize( + ("base_url", "expected_message"), + [ + ( + "exa.ai/search", + "Error: Exa API Base URL must start with http:// or https://. " + "Proxy base paths are allowed. 
Received: 'exa.ai/search'.", + ), + ], +) +def test_normalize_web_search_base_url_reports_invalid_value( + base_url: str, expected_message: str +): with pytest.raises(ValueError) as exc_info: normalize_web_search_base_url( - "exa.ai/search", + base_url, default="https://api.exa.ai", provider_name="Exa", ) - assert str(exc_info.value) == ( - "Error: Exa API Base URL must be a base host URL starting with " - "http:// or https:// (for example, https://api.exa.ai), not a full " - "endpoint path. Received: 'exa.ai/search'." + assert str(exc_info.value) == expected_message + + +@pytest.mark.parametrize( + ("base_url", "expected"), + [ + (" https://api.exa.ai/ ", "https://api.exa.ai"), + ("https://proxy.example.com/exa/", "https://proxy.example.com/exa"), + ("https://api.exa.ai/search", "https://api.exa.ai/search"), + ], +) +def test_normalize_web_search_base_url_accepts_proxy_paths( + base_url: str, expected: str +): + normalized = normalize_web_search_base_url( + base_url, + default="https://api.exa.ai", + provider_name="Exa", ) + + assert normalized == expected