AstrBotDevs
diff --git a/astrbot/core/astr_main_agent.py b/astrbot/core/astr_main_agent.py
Lines changed: 67 additions & 11 deletions
diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py
Lines changed: 62 additions & 3 deletions
diff --git a/astrbot/core/message/components.py b/astrbot/core/message/components.py
Lines changed: 0 additions & 28 deletions
diff --git a/astrbot/core/pipeline/preprocess_stage/stage.py b/astrbot/core/pipeline/preprocess_stage/stage.py
Lines changed: 16 additions & 0 deletions
@@ -21,7 +21,7 @@
 from astrbot.core.astr_agent_run_util import AgentRunner
 from astrbot.core.astr_agent_tool_exec import FunctionToolExecutor
 from astrbot.core.conversation_mgr import Conversation
-from astrbot.core.message.components import File, Image, Reply
+from astrbot.core.message.components import File, Image, Record, Reply
 from astrbot.core.persona_error_reply import (
     extract_persona_custom_error_message_from_persona,
     set_persona_custom_error_message_on_event,
@@ -419,6 +419,18 @@ def _append_quoted_image_attachment(req: ProviderRequest, image_path: str) -> No
     )
 
 
+def _append_audio_attachment(req: ProviderRequest, audio_path: str) -> None:
+    req.extra_user_content_parts.append(
+        TextPart(text=f"[Audio Attachment: path {audio_path}]")
+    )
+
+
+def _append_quoted_audio_attachment(req: ProviderRequest, audio_path: str) -> None:
+    req.extra_user_content_parts.append(
+        TextPart(text=f"[Audio Attachment in quoted message: path {audio_path}]")
+    )
+
+
 def _get_quoted_message_parser_settings(
     provider_settings: dict[str, object] | None,
 ) -> QuotedMessageParserSettings:
@@ -704,12 +716,25 @@ def _modalities_fix(provider: Provider, req: ProviderRequest) -> None:
                 "Provider %s does not support image, using placeholder.", provider
             )
             image_count = len(req.image_urls)
-            placeholder = " ".join(["[图片]"] * image_count)
+            placeholder = " ".join(["[Image]"] * image_count)
             if req.prompt:
                 req.prompt = f"{placeholder} {req.prompt}"
             else:
                 req.prompt = placeholder
             req.image_urls = []
+    if req.audio_urls:
+        provider_cfg = provider.provider_config.get("modalities", ["audio"])
+        if "audio" not in provider_cfg:
+            logger.debug(
+                "Provider %s does not support audio, using placeholder.", provider
+            )
+            audio_count = len(req.audio_urls)
+            placeholder = " ".join(["[Audio]"] * audio_count)
+            if req.prompt:
+                req.prompt = f"{placeholder} {req.prompt}"
+            else:
+                req.prompt = placeholder
+            req.audio_urls = []
     if req.func_tool:
         provider_cfg = provider.provider_config.get("modalities", ["tool_use"])
         if "tool_use" not in provider_cfg:
@@ -730,11 +755,13 @@ def _sanitize_context_by_modalities(
     if not modalities or not isinstance(modalities, list):
         return
     supports_image = bool("image" in modalities)
+    supports_audio = bool("audio" in modalities)
     supports_tool_use = bool("tool_use" in modalities)
-    if supports_image and supports_tool_use:
+    if supports_image and supports_audio and supports_tool_use:
         return
     sanitized_contexts: list[dict] = []
     removed_image_blocks = 0
+    removed_audio_blocks = 0
     removed_tool_messages = 0
     removed_tool_calls = 0
     for msg in req.contexts:
@@ -753,20 +780,28 @@ def _sanitize_context_by_modalities(
                     removed_tool_calls += 1
                 new_msg.pop("tool_calls", None)
                 new_msg.pop("tool_call_id", None)
-        if not supports_image:
+
+        if not supports_image or not supports_audio:
             content = new_msg.get("content")
             if isinstance(content, list):
                 filtered_parts: list = []
-                removed_any_image = False
+                removed_any_multimodal = False
                 for part in content:
                     if isinstance(part, dict):
                         part_type = str(part.get("type", "")).lower()
-                        if part_type in {"image_url", "image"}:
-                            removed_any_image = True
+                        if not supports_image and part_type in {"image_url", "image"}:
+                            removed_any_multimodal = True
                             removed_image_blocks += 1
                             continue
+                        if not supports_audio and part_type in {
+                            "audio_url",
+                            "input_audio",
+                        }:
+                            removed_any_multimodal = True
+                            removed_audio_blocks += 1
+                            continue
                     filtered_parts.append(part)
-                if removed_any_image:
+                if removed_any_multimodal:
                     new_msg["content"] = filtered_parts
         if role == "assistant":
             content = new_msg.get("content")
@@ -777,10 +812,19 @@ def _sanitize_context_by_modalities(
                 if isinstance(content, str) and (not content.strip()):
                     continue
         sanitized_contexts.append(new_msg)
-    if removed_image_blocks or removed_tool_messages or removed_tool_calls:
+
+    if (
+        removed_image_blocks
+        or removed_audio_blocks
+        or removed_tool_messages
+        or removed_tool_calls
+    ):
         logger.debug(
-            "sanitize_context_by_modalities applied: removed_image_blocks=%s, removed_tool_messages=%s, removed_tool_calls=%s",
+            "sanitize_context_by_modalities applied: "
+            "removed_image_blocks=%s, removed_audio_blocks=%s, "
+            "removed_tool_messages=%s, removed_tool_calls=%s",
             removed_image_blocks,
+            removed_audio_blocks,
             removed_tool_messages,
             removed_tool_calls,
         )
@@ -969,6 +1013,7 @@ async def build_main_agent(
             req = ProviderRequest()
             req.prompt = ""
             req.image_urls = []
+            req.audio_urls = []
             if sel_model := event.get_extra("selected_model"):
                 req.model = sel_model
             if config.provider_wake_prefix and (
@@ -988,6 +1033,10 @@ async def build_main_agent(
                     req.extra_user_content_parts.append(
                         TextPart(text=f"[Image Attachment: path {image_path}]")
                     )
+                elif isinstance(comp, Record):
+                    audio_path = await comp.convert_to_file_path()
+                    req.audio_urls.append(audio_path)
+                    _append_audio_attachment(req, audio_path)
                 elif isinstance(comp, File):
                     file_path = await comp.get_file()
                     file_name = comp.name or os.path.basename(file_path)
@@ -1017,6 +1066,10 @@ async def build_main_agent(
                                 event.track_temporary_local_file(image_path)
                             req.image_urls.append(image_path)
                             _append_quoted_image_attachment(req, image_path)
+                        elif isinstance(reply_comp, Record):
+                            audio_path = await reply_comp.convert_to_file_path()
+                            req.audio_urls.append(audio_path)
+                            _append_quoted_audio_attachment(req, audio_path)
                         elif isinstance(reply_comp, File):
                             file_path = await reply_comp.get_file()
                             file_name = reply_comp.name or os.path.basename(file_path)
@@ -1074,12 +1127,15 @@ async def build_main_agent(
     if isinstance(req.contexts, str):
         req.contexts = json.loads(req.contexts)
     req.image_urls = normalize_and_dedupe_strings(req.image_urls)
+    req.audio_urls = normalize_and_dedupe_strings(req.audio_urls)
+
     if config.file_extract_enabled:
         try:
             await _apply_file_extract(event, req, config)
         except Exception as exc:
             logger.error("Error occurred while applying file extract: %s", exc)
-    if not req.prompt and (not req.image_urls):
+
+    if not req.prompt and not req.image_urls and not req.audio_urls:
         if not event.get_group_id() and req.extra_user_content_parts:
             req.prompt = "<attachment>"
         else:
 
@@ -1247,6 +1247,18 @@ class ChatProviderTemplate(TypedDict):
                         "proxy": "",
                         "custom_headers": {},
                     },
+                    "LongCat": {
+                        "id": "longcat",
+                        "provider": "longcat",
+                        "type": "longcat_chat_completion",
+                        "provider_type": "chat_completion",
+                        "enable": True,
+                        "key": [],
+                        "api_base": "https://api.longcat.chat/openai",
+                        "timeout": 120,
+                        "proxy": "",
+                        "custom_headers": {},
+                    },
                     "AIHubMix": {
                         "id": "aihubmix",
                         "provider": "aihubmix",
@@ -1761,6 +1773,7 @@ class ChatProviderTemplate(TypedDict):
                         "enable": True,
                         "rerank_api_key": "",
                         "rerank_api_base": "http://127.0.0.1:8000",
+                        "rerank_api_suffix": "/v1/rerank",
                         "rerank_model": "BAAI/bge-reranker-base",
                         "timeout": 20,
                     },
@@ -1789,6 +1802,19 @@ class ChatProviderTemplate(TypedDict):
                         "return_documents": False,
                         "instruct": "",
                     },
+                    "NVIDIA Rerank": {
+                        "id": "nvidia_rerank",
+                        "type": "nvidia_rerank",
+                        "provider": "nvidia",
+                        "provider_type": "rerank",
+                        "enable": True,
+                        "nvidia_rerank_api_key": "",
+                        "nvidia_rerank_api_base": "https://ai.api.nvidia.com/v1/retrieval",
+                        "nvidia_rerank_model": "nv-rerank-qa-mistral-4b:1",
+                        "nvidia_rerank_model_endpoint": "/reranking",
+                        "timeout": 20,
+                        "nvidia_rerank_truncate": "",
+                    },
                     "Xinference STT": {
                         "id": "xinference_stt",
                         "type": "xinference_stt",
@@ -1826,7 +1852,12 @@ class ChatProviderTemplate(TypedDict):
                     "rerank_api_base": {
                         "description": "重排序模型 API Base URL",
                         "type": "string",
-                        "hint": "AstrBot 会在请求时在末尾加上 /v1/rerank。",
+                        "hint": "最终请求路径由 Base URL 和路径后缀拼接而成（默认为 /v1/rerank）。",
+                    },
+                    "rerank_api_suffix": {
+                        "description": "API URL 路径后缀",
+                        "type": "string",
+                        "hint": "追加到 base_url 后的路径，如 /v1/rerank。留空则不追加。",
                     },
                     "rerank_api_key": {
                         "description": "API Key",
@@ -1852,12 +1883,40 @@ class ChatProviderTemplate(TypedDict):
                         "type": "bool",
                         "hint": "如果模型当前未在 Xinference 服务中运行，是否尝试自动启动它。在生产环境中建议关闭。",
                     },
+                    "nvidia_rerank_api_base": {
+                        "description": "API Base URL",
+                        "type": "string",
+                    },
+                    "nvidia_rerank_api_key": {
+                        "description": "API Key",
+                        "type": "string",
+                    },
+                    "nvidia_rerank_model": {
+                        "description": "重排序模型名称",
+                        "type": "string",
+                        "hint": "请参照NVIDIA Docs中模型名称填写。",
+                    },
+                    "nvidia_rerank_model_endpoint": {
+                        "description": "自定义模型端点",
+                        "type": "string",
+                        "hint": "自定义URL末尾端点，默认为 /reranking",
+                    },
+                    "nvidia_rerank_truncate": {
+                        "description": "文本截断策略",
+                        "type": "string",
+                        "hint": "当输入文本过长时，是否截断输入以适应模型的最大上下文长度。",
+                        "options": [
+                            "",
+                            "NONE",
+                            "END",
+                        ],
+                    },
                     "modalities": {
                         "description": "模型能力",
                         "type": "list",
                         "items": {"type": "string"},
-                        "options": ["text", "image", "tool_use"],
-                        "labels": ["文本", "图像", "工具使用"],
+                        "options": ["text", "image", "audio", "tool_use"],
+                        "labels": ["文本", "图像", "音频", "工具使用"],
                         "render_type": "checkbox",
                         "hint": "模型支持的模态。如所填写的模型不支持图像，请取消勾选图像。",
                     },
 
@@ -74,12 +74,6 @@ class ComponentType(str, Enum):
     Music = "Music"
     Json = "Json"
     Unknown = "Unknown"
-    WechatEmoji = "WechatEmoji"  # Wechat 下的 emoji 表情包
-    # Discord-specific component types
-    DiscordEmbed = "DiscordEmbed"
-    DiscordButton = "DiscordButton"
-    DiscordReference = "DiscordReference"
-    DiscordView = "DiscordView"
 
 
 class BaseMessageComponent(BaseModel):
@@ -106,7 +100,6 @@ async def to_dict(self) -> dict:
 class Plain(BaseMessageComponent):
     type: ComponentType = ComponentType.Plain
     text: str
-    convert: bool | None = True
 
     def __init__(self, text: str, convert: bool = True, **_) -> None:
         super().__init__(text=text, convert=convert, **_)
@@ -129,11 +122,7 @@ def __init__(self, **_) -> None:
 class Record(BaseMessageComponent):
     type: ComponentType = ComponentType.Record
     file: str | None = ""
-    magic: bool | None = False
     url: str | None = ""
-    cache: bool | None = True
-    proxy: bool | None = True
-    timeout: int | None = 0
     # Original text content (e.g. TTS source text), used as caption in fallback scenarios
     text: str | None = None
     # 额外
@@ -239,7 +228,6 @@ class Video(BaseMessageComponent):
     type: ComponentType = ComponentType.Video
     file: str
     cover: str | None = ""
-    c: int | None = 2
     # 额外
     path: str | None = ""
 
@@ -416,14 +404,9 @@ class Image(BaseMessageComponent):
     type: ComponentType = ComponentType.Image
     file: str | None = ""
     _type: str | None = ""
-    subType: int | None = 0
     url: str | None = ""
-    cache: bool | None = True
-    id: int | None = 40000
-    c: int | None = 2
     # 额外
     path: str | None = ""
-    file_unique: str | None = ""  # 某些平台可能有图片缓存的唯一标识
 
     def __init__(self, file: str | None, **_) -> None:
         super().__init__(file=file, **_)
@@ -854,16 +837,6 @@ async def to_dict(self):
         }
 
 
-class WechatEmoji(BaseMessageComponent):
-    type: ComponentType = ComponentType.WechatEmoji
-    md5: str | None = ""
-    md5_len: int | None = 0
-    cdnurl: str | None = ""
-
-    def __init__(self, **_) -> None:
-        super().__init__(**_)
-
-
 ComponentTypes = {
     # Basic Message Segments
     "plain": Plain,
@@ -889,5 +862,4 @@ def __init__(self, **_) -> None:
     "nodes": Nodes,
     "json": Json,
     "unknown": Unknown,
-    "WechatEmoji": WechatEmoji,
 }
@@ -7,6 +7,7 @@
 from astrbot.core.pipeline.context import PipelineContext
 from astrbot.core.pipeline.stage import Stage, register_stage
 from astrbot.core.platform.astr_message_event import AstrMessageEvent
+from astrbot.core.utils.media_utils import ensure_wav
 
 
 @register_stage
@@ -62,6 +63,21 @@ async def process(
                             logger.debug(f"路径映射: {url} -> {component.url}")
                     message_chain[idx] = component
 
+        # Convert every Record component to WAV format and update its file path.
+        message_chain = event.get_messages()
+        for idx, component in enumerate(message_chain):
+            if isinstance(component, Record):
+                try:
+                    original_path = await component.convert_to_file_path()
+                    record_path = await ensure_wav(original_path)
+                    if record_path != original_path:
+                        event.track_temporary_local_file(record_path)
+                    component.file = record_path
+                    component.path = record_path
+                    message_chain[idx] = component
+                except Exception as e:
+                    logger.warning(f"Voice processing failed: {e}")
+
         # STT
         if self.stt_settings.get("enable", False):
             # TODO: 独立