diff --git a/astrbot/core/astr_main_agent.py b/astrbot/core/astr_main_agent.py index 2b4a04907e..9861e669c4 100644 --- a/astrbot/core/astr_main_agent.py +++ b/astrbot/core/astr_main_agent.py @@ -51,7 +51,7 @@ retrieve_knowledge_base, ) from astrbot.core.conversation_mgr import Conversation -from astrbot.core.message.components import File, Image, Reply +from astrbot.core.message.components import File, Image, Record, Reply from astrbot.core.persona_error_reply import ( extract_persona_custom_error_message_from_persona, set_persona_custom_error_message_on_event, @@ -515,6 +515,18 @@ def _append_quoted_image_attachment(req: ProviderRequest, image_path: str) -> No ) +def _append_audio_attachment(req: ProviderRequest, audio_path: str) -> None: + req.extra_user_content_parts.append( + TextPart(text=f"[Audio Attachment: path {audio_path}]") + ) + + +def _append_quoted_audio_attachment(req: ProviderRequest, audio_path: str) -> None: + req.extra_user_content_parts.append( + TextPart(text=f"[Audio Attachment in quoted message: path {audio_path}]") + ) + + def _get_quoted_message_parser_settings( provider_settings: dict[str, object] | None, ) -> QuotedMessageParserSettings: @@ -753,12 +765,25 @@ def _modalities_fix(provider: Provider, req: ProviderRequest) -> None: "Provider %s does not support image, using placeholder.", provider ) image_count = len(req.image_urls) - placeholder = " ".join(["[图片]"] * image_count) + placeholder = " ".join(["[Image]"] * image_count) if req.prompt: req.prompt = f"{placeholder} {req.prompt}" else: req.prompt = placeholder req.image_urls = [] + if req.audio_urls: + provider_cfg = provider.provider_config.get("modalities", ["audio"]) + if "audio" not in provider_cfg: + logger.debug( + "Provider %s does not support audio, using placeholder.", provider + ) + audio_count = len(req.audio_urls) + placeholder = " ".join(["[Audio]"] * audio_count) + if req.prompt: + req.prompt = f"{placeholder} {req.prompt}" + else: + req.prompt = placeholder + 
req.audio_urls = [] if req.func_tool: provider_cfg = provider.provider_config.get("modalities", ["tool_use"]) if "tool_use" not in provider_cfg: @@ -781,12 +806,14 @@ def _sanitize_context_by_modalities( if not modalities or not isinstance(modalities, list): return supports_image = bool("image" in modalities) + supports_audio = bool("audio" in modalities) supports_tool_use = bool("tool_use" in modalities) - if supports_image and supports_tool_use: + if supports_image and supports_audio and supports_tool_use: return sanitized_contexts: list[dict] = [] removed_image_blocks = 0 + removed_audio_blocks = 0 removed_tool_messages = 0 removed_tool_calls = 0 @@ -808,20 +835,27 @@ def _sanitize_context_by_modalities( new_msg.pop("tool_calls", None) new_msg.pop("tool_call_id", None) - if not supports_image: + if not supports_image or not supports_audio: content = new_msg.get("content") if isinstance(content, list): filtered_parts: list = [] - removed_any_image = False + removed_any_multimodal = False for part in content: if isinstance(part, dict): part_type = str(part.get("type", "")).lower() - if part_type in {"image_url", "image"}: - removed_any_image = True + if not supports_image and part_type in {"image_url", "image"}: + removed_any_multimodal = True removed_image_blocks += 1 continue + if not supports_audio and part_type in { + "audio_url", + "input_audio", + }: + removed_any_multimodal = True + removed_audio_blocks += 1 + continue filtered_parts.append(part) - if removed_any_image: + if removed_any_multimodal: new_msg["content"] = filtered_parts if role == "assistant": @@ -835,11 +869,18 @@ def _sanitize_context_by_modalities( sanitized_contexts.append(new_msg) - if removed_image_blocks or removed_tool_messages or removed_tool_calls: + if ( + removed_image_blocks + or removed_audio_blocks + or removed_tool_messages + or removed_tool_calls + ): logger.debug( "sanitize_context_by_modalities applied: " - "removed_image_blocks=%s, removed_tool_messages=%s, 
removed_tool_calls=%s", + "removed_image_blocks=%s, removed_audio_blocks=%s, " + "removed_tool_messages=%s, removed_tool_calls=%s", removed_image_blocks, + removed_audio_blocks, removed_tool_messages, removed_tool_calls, ) @@ -1101,6 +1142,7 @@ async def build_main_agent( req = ProviderRequest() req.prompt = "" req.image_urls = [] + req.audio_urls = [] if sel_model := event.get_extra("selected_model"): req.model = sel_model if config.provider_wake_prefix and not event.message_str.startswith( @@ -1124,6 +1166,10 @@ async def build_main_agent( req.extra_user_content_parts.append( TextPart(text=f"[Image Attachment: path {image_path}]") ) + elif isinstance(comp, Record): + audio_path = await comp.convert_to_file_path() + req.audio_urls.append(audio_path) + _append_audio_attachment(req, audio_path) elif isinstance(comp, File): file_path = await comp.get_file() file_name = comp.name or os.path.basename(file_path) @@ -1155,6 +1201,10 @@ async def build_main_agent( event.track_temporary_local_file(image_path) req.image_urls.append(image_path) _append_quoted_image_attachment(req, image_path) + elif isinstance(reply_comp, Record): + audio_path = await reply_comp.convert_to_file_path() + req.audio_urls.append(audio_path) + _append_quoted_audio_attachment(req, audio_path) elif isinstance(reply_comp, File): file_path = await reply_comp.get_file() file_name = reply_comp.name or os.path.basename(file_path) @@ -1222,6 +1272,7 @@ async def build_main_agent( if isinstance(req.contexts, str): req.contexts = json.loads(req.contexts) req.image_urls = normalize_and_dedupe_strings(req.image_urls) + req.audio_urls = normalize_and_dedupe_strings(req.audio_urls) if config.file_extract_enabled: try: @@ -1229,7 +1280,7 @@ async def build_main_agent( except Exception as exc: # noqa: BLE001 logger.error("Error occurred while applying file extract: %s", exc) - if not req.prompt and not req.image_urls: + if not req.prompt and not req.image_urls and not req.audio_urls: if not event.get_group_id() 
and req.extra_user_content_parts: req.prompt = "" else: diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py index 28971473f7..61a00abd9e 100644 --- a/astrbot/core/config/default.py +++ b/astrbot/core/config/default.py @@ -1874,8 +1874,8 @@ class ChatProviderTemplate(TypedDict): "description": "模型能力", "type": "list", "items": {"type": "string"}, - "options": ["text", "image", "tool_use"], - "labels": ["文本", "图像", "工具使用"], + "options": ["text", "image", "audio", "tool_use"], + "labels": ["文本", "图像", "音频", "工具使用"], "render_type": "checkbox", "hint": "模型支持的模态。如所填写的模型不支持图像,请取消勾选图像。", }, diff --git a/astrbot/core/message/components.py b/astrbot/core/message/components.py index 29bb478da3..2f19434c9d 100644 --- a/astrbot/core/message/components.py +++ b/astrbot/core/message/components.py @@ -64,7 +64,6 @@ class ComponentType(str, Enum): Music = "Music" Json = "Json" Unknown = "Unknown" - WechatEmoji = "WechatEmoji" # Wechat 下的 emoji 表情包 class BaseMessageComponent(BaseModel): @@ -91,7 +90,6 @@ async def to_dict(self) -> dict: class Plain(BaseMessageComponent): type: ComponentType = ComponentType.Plain text: str - convert: bool | None = True def __init__(self, text: str, convert: bool = True, **_) -> None: super().__init__(text=text, convert=convert, **_) @@ -114,11 +112,7 @@ def __init__(self, **_) -> None: class Record(BaseMessageComponent): type: ComponentType = ComponentType.Record file: str | None = "" - magic: bool | None = False url: str | None = "" - cache: bool | None = True - proxy: bool | None = True - timeout: int | None = 0 # Original text content (e.g. 
TTS source text), used as caption in fallback scenarios text: str | None = None # 额外 @@ -224,7 +218,6 @@ class Video(BaseMessageComponent): type: ComponentType = ComponentType.Video file: str cover: str | None = "" - c: int | None = 2 # 额外 path: str | None = "" @@ -401,14 +394,9 @@ class Image(BaseMessageComponent): type: ComponentType = ComponentType.Image file: str | None = "" _type: str | None = "" - subType: int | None = 0 url: str | None = "" - cache: bool | None = True - id: int | None = 40000 - c: int | None = 2 # 额外 path: str | None = "" - file_unique: str | None = "" # 某些平台可能有图片缓存的唯一标识 def __init__(self, file: str | None, **_) -> None: super().__init__(file=file, **_) @@ -839,16 +827,6 @@ async def to_dict(self): } -class WechatEmoji(BaseMessageComponent): - type: ComponentType = ComponentType.WechatEmoji - md5: str | None = "" - md5_len: int | None = 0 - cdnurl: str | None = "" - - def __init__(self, **_) -> None: - super().__init__(**_) - - ComponentTypes = { # Basic Message Segments "plain": Plain, @@ -874,5 +852,4 @@ def __init__(self, **_) -> None: "nodes": Nodes, "json": Json, "unknown": Unknown, - "WechatEmoji": WechatEmoji, } diff --git a/astrbot/core/pipeline/preprocess_stage/stage.py b/astrbot/core/pipeline/preprocess_stage/stage.py index 0d6d09370e..0f75dfd157 100644 --- a/astrbot/core/pipeline/preprocess_stage/stage.py +++ b/astrbot/core/pipeline/preprocess_stage/stage.py @@ -6,6 +6,7 @@ from astrbot.core import logger from astrbot.core.message.components import Image, Plain, Record from astrbot.core.platform.astr_message_event import AstrMessageEvent +from astrbot.core.utils.media_utils import ensure_wav from ..context import PipelineContext from ..stage import Stage, register_stage @@ -64,6 +65,21 @@ async def process( logger.debug(f"路径映射: {url} -> {component.url}") message_chain[idx] = component + # In here, we convert all Record components to wav format and update the file path. 
+ message_chain = event.get_messages() + for idx, component in enumerate(message_chain): + if isinstance(component, Record): + try: + original_path = await component.convert_to_file_path() + record_path = await ensure_wav(original_path) + if record_path != original_path: + event.track_temporary_local_file(record_path) + component.file = record_path + component.path = record_path + message_chain[idx] = component + except Exception as e: + logger.warning(f"Voice processing failed: {e}") + # STT if self.stt_settings.get("enable", False): # TODO: 独立 diff --git a/astrbot/core/pipeline/process_stage/method/agent_sub_stages/internal.py b/astrbot/core/pipeline/process_stage/method/agent_sub_stages/internal.py index 1a04e3a48e..e0ba2463ca 100644 --- a/astrbot/core/pipeline/process_stage/method/agent_sub_stages/internal.py +++ b/astrbot/core/pipeline/process_stage/method/agent_sub_stages/internal.py @@ -13,7 +13,7 @@ MainAgentBuildResult, build_main_agent, ) -from astrbot.core.message.components import File, Image +from astrbot.core.message.components import File, Image, Record, Video from astrbot.core.message.message_event_result import ( MessageChain, MessageEventResult, @@ -153,7 +153,8 @@ async def process( has_provider_request = event.get_extra("provider_request") is not None has_valid_message = bool(event.message_str and event.message_str.strip()) has_media_content = any( - isinstance(comp, Image | File) for comp in event.message_obj.message + isinstance(comp, (Image, File, Record, Video)) + for comp in event.message_obj.message ) if ( diff --git a/astrbot/core/pipeline/process_stage/method/agent_sub_stages/third_party.py b/astrbot/core/pipeline/process_stage/method/agent_sub_stages/third_party.py index 070ad7bdee..9ab315779c 100644 --- a/astrbot/core/pipeline/process_stage/method/agent_sub_stages/third_party.py +++ b/astrbot/core/pipeline/process_stage/method/agent_sub_stages/third_party.py @@ -17,7 +17,7 @@ ) from astrbot.core.agent.runners.dify.dify_agent_runner 
import DifyAgentRunner from astrbot.core.astr_agent_hooks import MAIN_AGENT_HOOKS -from astrbot.core.message.components import Image +from astrbot.core.message.components import Image, Record from astrbot.core.message.message_event_result import ( MessageChain, MessageEventResult, @@ -317,8 +317,11 @@ async def process( if isinstance(comp, Image): image_path = await comp.convert_to_base64() req.image_urls.append(image_path) + elif isinstance(comp, Record): + audio_path = await comp.convert_to_file_path() + req.audio_urls.append(audio_path) - if not req.prompt and not req.image_urls: + if not req.prompt and not req.image_urls and not req.audio_urls: return custom_error_message = await self._resolve_persona_custom_error_message(event) diff --git a/astrbot/core/pipeline/respond/stage.py b/astrbot/core/pipeline/respond/stage.py index 6a884a5181..aea6a74b3e 100644 --- a/astrbot/core/pipeline/respond/stage.py +++ b/astrbot/core/pipeline/respond/stage.py @@ -32,7 +32,6 @@ class RespondStage(Stage): Comp.Node: lambda comp: bool(comp.content), # 转发节点 Comp.Nodes: lambda comp: bool(comp.nodes), # 多个转发节点 Comp.File: lambda comp: bool(comp.file_ or comp.url), - Comp.WechatEmoji: lambda comp: comp.md5 is not None, # 微信表情 Comp.Json: lambda comp: bool(comp.data), # Json 卡片 Comp.Share: lambda comp: bool(comp.url) or bool(comp.title), Comp.Music: lambda comp: ( diff --git a/astrbot/core/platform/astr_message_event.py b/astrbot/core/platform/astr_message_event.py index 0ecd47fedc..6454367022 100644 --- a/astrbot/core/platform/astr_message_event.py +++ b/astrbot/core/platform/astr_message_event.py @@ -414,6 +414,7 @@ def request_llm( tool_set: ToolSet | None = None, session_id: str = "", image_urls: list[str] | None = None, + audio_urls: list[str] | None = None, contexts: list | None = None, system_prompt: str = "", conversation: Conversation | None = None, @@ -432,6 +433,8 @@ def request_llm( image_urls: 可以是 base64:// 或者 http:// 开头的图片链接,也可以是本地图片路径。 + audio_urls: 音频 URL 列表,也支持本地路径。 + 
contexts: 当指定 contexts 时,将会使用 contexts 作为上下文。如果同时传入了 conversation,将会忽略 conversation。 func_tool_manager: [Deprecated] 函数工具管理器,用于调用函数工具。用 self.context.get_llm_tool_manager() 获取。已过时,请使用 tool_set 参数代替。 @@ -441,6 +444,8 @@ def request_llm( """ if image_urls is None: image_urls = [] + if audio_urls is None: + audio_urls = [] if contexts is None: contexts = [] if len(contexts) > 0 and conversation: @@ -450,6 +455,7 @@ def request_llm( prompt=prompt, session_id=session_id, image_urls=image_urls, + audio_urls=audio_urls, # func_tool=func_tool_manager, func_tool=tool_set, contexts=contexts, diff --git a/astrbot/core/provider/entities.py b/astrbot/core/provider/entities.py index 20c5a7947d..b27775cdd7 100644 --- a/astrbot/core/provider/entities.py +++ b/astrbot/core/provider/entities.py @@ -3,8 +3,11 @@ import base64 import enum import json +import uuid from dataclasses import dataclass, field +from pathlib import Path from typing import Any +from urllib.parse import urlparse from anthropic.types import Message as AnthropicMessage from google.genai.types import GenerateContentResponse @@ -21,7 +24,8 @@ from astrbot.core.agent.tool import ToolSet from astrbot.core.db.po import Conversation from astrbot.core.message.message_event_result import MessageChain -from astrbot.core.utils.io import download_image_by_url +from astrbot.core.utils.astrbot_path import get_astrbot_temp_path +from astrbot.core.utils.io import download_file, download_image_by_url class ProviderType(enum.Enum): @@ -93,6 +97,8 @@ class ProviderRequest: """会话 ID""" image_urls: list[str] = field(default_factory=list) """图片 URL 列表""" + audio_urls: list[str] = field(default_factory=list) + """音频 URL 列表,也支持本地路径""" extra_user_content_parts: list[ContentPart] = field(default_factory=list) """额外的用户消息内容部分列表,用于在用户消息后添加额外的内容块(如系统提醒、指令等)。支持 dict 或 ContentPart 对象""" func_tool: ToolSet | None = None @@ -115,6 +121,7 @@ def __repr__(self) -> str: return ( f"ProviderRequest(prompt={self.prompt}, session_id={self.session_id}, " 
f"image_count={len(self.image_urls or [])}, " + f"audio_count={len(self.audio_urls or [])}, " f"func_tool={self.func_tool}, " f"contexts={self._print_friendly_context()}, " f"system_prompt={self.system_prompt}, " @@ -133,9 +140,12 @@ def append_tool_calls_result(self, tool_calls_result: ToolCallsResult) -> None: self.tool_calls_result.append(tool_calls_result) def _print_friendly_context(self): - """打印友好的消息上下文。将 image_url 的值替换为 """ + """打印友好的消息上下文。将多模态内容折叠为简短标记。""" if not self.contexts: - return f"prompt: {self.prompt}, image_count: {len(self.image_urls or [])}" + return ( + f"prompt: {self.prompt}, image_count: {len(self.image_urls or [])}, " + f"audio_count: {len(self.audio_urls or [])}" + ) result_parts = [] @@ -148,6 +158,7 @@ def _print_friendly_context(self): elif isinstance(content, list): msg_parts = [] image_count = 0 + audio_count = 0 for item in content: item_type = item.get("type", "") @@ -156,19 +167,26 @@ def _print_friendly_context(self): msg_parts.append(item.get("text", "")) elif item_type == "image_url": image_count += 1 + elif item_type == "audio_url": + audio_count += 1 if image_count > 0: if msg_parts: msg_parts.append(f"[+{image_count} images]") else: msg_parts.append(f"[{image_count} images]") + if audio_count > 0: + if msg_parts: + msg_parts.append(f"[+{audio_count} audios]") + else: + msg_parts.append(f"[{audio_count} audios]") result_parts.append(f"{role}: {''.join(msg_parts)}") return "\n".join(result_parts) async def assemble_context(self) -> dict: - """将请求(prompt 和 image_urls)包装成 OpenAI 的消息格式。""" + """将请求(prompt、image_urls 和 audio_urls)包装成统一消息格式。""" # 构建内容块列表 content_blocks = [] @@ -178,6 +196,9 @@ async def assemble_context(self) -> dict: elif self.image_urls: # 如果没有文本但有图片,添加占位文本 content_blocks.append({"type": "text", "text": "[图片]"}) + elif self.audio_urls: + # 如果没有文本但有音频,添加占位文本 + content_blocks.append({"type": "text", "text": "[音频]"}) # 2. 
额外的内容块(系统提醒、指令等) if self.extra_user_content_parts: @@ -202,12 +223,57 @@ async def assemble_context(self) -> dict: {"type": "image_url", "image_url": {"url": image_data}}, ) + # 4. 音频内容 + if self.audio_urls: + for audio_url in self.audio_urls: + if audio_url.startswith("http"): + parsed_url = urlparse(audio_url) + suffix = Path(parsed_url.path).suffix + temp_dir = Path(get_astrbot_temp_path()) + temp_dir.mkdir(parents=True, exist_ok=True) + temp_audio_path = ( + temp_dir / f"provider_request_audio_{uuid.uuid4().hex}{suffix}" + ) + try: + await download_file(audio_url, str(temp_audio_path)) + audio_data = await self._encode_audio_bs64( + str(temp_audio_path), + source_ref=audio_url, + ) + finally: + try: + temp_audio_path.unlink(missing_ok=True) + except Exception as exc: + logger.warning( + "Failed to cleanup %s: %s", + temp_audio_path, + exc, + ) + elif audio_url.startswith("file:///"): + audio_path = audio_url.replace("file:///", "") + audio_data = await self._encode_audio_bs64( + audio_path, + source_ref=audio_url, + ) + else: + audio_data = await self._encode_audio_bs64( + audio_url, + source_ref=audio_url, + ) + if not audio_data: + logger.warning(f"音频 {audio_url} 得到的结果为空,将忽略。") + continue + content_blocks.append( + {"type": "audio_url", "audio_url": {"url": audio_data}}, + ) + # 只有当只有一个来自 prompt 的文本块且没有额外内容块时,才降级为简单格式以保持向后兼容 if ( len(content_blocks) == 1 and content_blocks[0]["type"] == "text" and not self.extra_user_content_parts and not self.image_urls + and not self.audio_urls ): return {"role": "user", "content": content_blocks[0]["text"]} @@ -221,7 +287,21 @@ async def _encode_image_bs64(self, image_url: str) -> str: with open(image_url, "rb") as f: image_bs64 = base64.b64encode(f.read()).decode("utf-8") return "data:image/jpeg;base64," + image_bs64 - return "" + + async def _encode_audio_bs64( + self, + audio_path: str, + source_ref: str | None = None, + ) -> str: + """将音频转换为 base64""" + mime_type = "audio/wav" + + if audio_path.startswith("base64://"): 
+ return audio_path.replace("base64://", f"data:{mime_type};base64,", 1) + + with open(audio_path, "rb") as f: + audio_bs64 = base64.b64encode(f.read()).decode("utf-8") + return f"data:{mime_type};base64," + audio_bs64 @dataclass diff --git a/astrbot/core/provider/provider.py b/astrbot/core/provider/provider.py index fab3ce6104..f2571b506c 100644 --- a/astrbot/core/provider/provider.py +++ b/astrbot/core/provider/provider.py @@ -98,6 +98,7 @@ async def text_chat( prompt: str | None = None, session_id: str | None = None, image_urls: list[str] | None = None, + audio_urls: list[str] | None = None, func_tool: ToolSet | None = None, contexts: list[Message] | list[dict] | None = None, system_prompt: str | None = None, @@ -113,6 +114,7 @@ async def text_chat( prompt: 提示词,和 contexts 二选一使用,如果都指定,则会将 prompt(以及可能的 image_urls) 作为最新的一条记录添加到 contexts 中 session_id: 会话 ID(此属性已经被废弃) image_urls: 图片 URL 列表 + audio_urls: 音频 URL 列表,也支持本地路径 tools: tool set tool_choice: 工具调用策略,`auto` 表示由模型自行决定,`required` 表示要求模型必须调用工具 contexts: 上下文,和 prompt 二选一使用 @@ -122,6 +124,7 @@ async def text_chat( Notes: - 如果传入了 image_urls,将会在对话时附上图片。如果模型不支持图片输入,将会抛出错误。 + - 如果传入了 audio_urls,将会在对话时附上音频。如果模型不支持音频输入,将会抛出错误或降级处理。 - 如果传入了 tools,将会使用 tools 进行 Function-calling。如果模型不支持 Function-calling,将会抛出错误。 """ @@ -132,6 +135,7 @@ async def text_chat_stream( prompt: str | None = None, session_id: str | None = None, image_urls: list[str] | None = None, + audio_urls: list[str] | None = None, func_tool: ToolSet | None = None, contexts: list[Message] | list[dict] | None = None, system_prompt: str | None = None, @@ -146,6 +150,7 @@ async def text_chat_stream( prompt: 提示词,和 contexts 二选一使用,如果都指定,则会将 prompt(以及可能的 image_urls) 作为最新的一条记录添加到 contexts 中 session_id: 会话 ID(此属性已经被废弃) image_urls: 图片 URL 列表 + audio_urls: 音频 URL 列表,也支持本地路径 tools: tool set tool_choice: 工具调用策略,`auto` 表示由模型自行决定,`required` 表示要求模型必须调用工具 contexts: 上下文,和 prompt 二选一使用 @@ -154,6 +159,7 @@ async def text_chat_stream( Notes: - 如果传入了 
image_urls,将会在对话时附上图片。如果模型不支持图片输入,将会抛出错误。 + - 如果传入了 audio_urls,将会在对话时附上音频。如果模型不支持音频输入,将会抛出错误或降级处理。 - 如果传入了 tools,将会使用 tools 进行 Function-calling。如果模型不支持 Function-calling,将会抛出错误。 """ diff --git a/astrbot/core/provider/sources/anthropic_source.py b/astrbot/core/provider/sources/anthropic_source.py index 89c33021ae..83f2e16dba 100644 --- a/astrbot/core/provider/sources/anthropic_source.py +++ b/astrbot/core/provider/sources/anthropic_source.py @@ -12,7 +12,7 @@ from astrbot import logger from astrbot.api.provider import Provider -from astrbot.core.agent.message import ContentPart, ImageURLPart, TextPart +from astrbot.core.agent.message import AudioURLPart, ContentPart, ImageURLPart, TextPart from astrbot.core.exceptions import EmptyModelOutputError from astrbot.core.provider.entities import LLMResponse, TokenUsage from astrbot.core.provider.func_tool_manager import ToolSet @@ -242,6 +242,13 @@ def _prepare_payload(self, messages: list[dict]): logger.warning( f"Unsupported image URL format for Anthropic: {url[:50]}..." 
) + elif part.get("type") == "audio_url": + converted_content.append( + { + "type": "text", + "text": "[Audio Attachment]", + } + ) else: converted_content.append(part) new_messages.append( @@ -517,6 +524,7 @@ async def text_chat( prompt=None, session_id=None, image_urls=None, + audio_urls=None, func_tool=None, contexts=None, system_prompt=None, @@ -531,7 +539,10 @@ async def text_chat( new_record = None if prompt is not None: new_record = await self.assemble_context( - prompt, image_urls, extra_user_content_parts + prompt or "", + image_urls, + audio_urls, + extra_user_content_parts, ) context_query = self._ensure_message_to_dicts(contexts) if new_record: @@ -577,6 +588,7 @@ async def text_chat_stream( prompt=None, session_id=None, image_urls=None, + audio_urls=None, func_tool=None, contexts=None, system_prompt=None, @@ -591,7 +603,10 @@ async def text_chat_stream( new_record = None if prompt is not None: new_record = await self.assemble_context( - prompt, image_urls, extra_user_content_parts + prompt or "", + image_urls, + audio_urls, + extra_user_content_parts, ) context_query = self._ensure_message_to_dicts(contexts) if new_record: @@ -642,6 +657,7 @@ async def assemble_context( self, text: str, image_urls: list[str] | None = None, + audio_urls: list[str] | None = None, extra_user_content_parts: list[ContentPart] | None = None, ): """组装上下文,支持文本和图片""" @@ -680,7 +696,9 @@ async def resolve_image_url(image_url: str) -> dict | None: content.append({"type": "text", "text": text}) elif image_urls: # 如果没有文本但有图片,添加占位文本 - content.append({"type": "text", "text": "[图片]"}) + content.append({"type": "text", "text": "[Image]"}) + elif audio_urls: + content.append({"type": "text", "text": "[Audio]"}) elif extra_user_content_parts: # 如果只有额外内容块,也需要添加占位文本 content.append({"type": "text", "text": " "}) @@ -694,6 +712,8 @@ async def resolve_image_url(image_url: str) -> dict | None: image_dict = await resolve_image_url(block.image_url.url) if image_dict: content.append(image_dict) + 
elif isinstance(block, AudioURLPart): + content.append({"type": "text", "text": "[Audio]"}) else: raise ValueError(f"不支持的额外内容块类型: {type(block)}") @@ -703,12 +723,16 @@ async def resolve_image_url(image_url: str) -> dict | None: image_dict = await resolve_image_url(image_url) if image_dict: content.append(image_dict) + if audio_urls: + for _audio_path in audio_urls: + content.append({"type": "text", "text": "[Audio]"}) # 如果只有主文本且没有额外内容块和图片,返回简单格式以保持向后兼容 if ( text and not extra_user_content_parts and not image_urls + and not audio_urls and len(content) == 1 and content[0]["type"] == "text" ): diff --git a/astrbot/core/provider/sources/gemini_source.py b/astrbot/core/provider/sources/gemini_source.py index 1f447c85bd..e5fae456dc 100644 --- a/astrbot/core/provider/sources/gemini_source.py +++ b/astrbot/core/provider/sources/gemini_source.py @@ -3,8 +3,11 @@ import json import logging import random +import uuid from collections.abc import AsyncGenerator +from pathlib import Path from typing import Literal, cast +from urllib.parse import urlparse from google import genai from google.genai import types @@ -13,12 +16,14 @@ import astrbot.core.message.components as Comp from astrbot import logger from astrbot.api.provider import Provider -from astrbot.core.agent.message import ContentPart, ImageURLPart, TextPart +from astrbot.core.agent.message import AudioURLPart, ContentPart, ImageURLPart, TextPart from astrbot.core.exceptions import EmptyModelOutputError from astrbot.core.message.message_event_result import MessageChain from astrbot.core.provider.entities import LLMResponse, TokenUsage from astrbot.core.provider.func_tool_manager import ToolSet -from astrbot.core.utils.io import download_image_by_url +from astrbot.core.utils.astrbot_path import get_astrbot_temp_path +from astrbot.core.utils.io import download_file, download_image_by_url +from astrbot.core.utils.media_utils import ensure_wav from astrbot.core.utils.network_utils import is_connection_error, 
log_connection_failure from ..register import register_provider_adapter @@ -304,6 +309,12 @@ def process_image_url(image_url_dict: dict) -> types.Part: image_bytes = base64.b64decode(url.split(",", 1)[1]) return types.Part.from_bytes(data=image_bytes, mime_type=mime_type) + def process_audio_url(audio_url_dict: dict) -> types.Part: + url = audio_url_dict["url"] + mime_type = url.split(":")[1].split(";")[0] + audio_bytes = base64.b64decode(url.split(",", 1)[1]) + return types.Part.from_bytes(data=audio_bytes, mime_type=mime_type) + def append_or_extend( contents: list[types.Content], part: list[types.Part], @@ -331,7 +342,11 @@ def append_or_extend( ( types.Part.from_text(text=item["text"] or " ") if item["type"] == "text" - else process_image_url(item["image_url"]) + else ( + process_image_url(item["image_url"]) + if item["type"] == "image_url" + else process_audio_url(item["audio_url"]) + ) ) for item in content ] @@ -782,6 +797,7 @@ async def text_chat( prompt=None, session_id=None, image_urls=None, + audio_urls=None, func_tool=None, contexts=None, system_prompt=None, @@ -796,7 +812,10 @@ async def text_chat( new_record = None if prompt is not None: new_record = await self.assemble_context( - prompt, image_urls, extra_user_content_parts + prompt or "", + image_urls, + audio_urls, + extra_user_content_parts, ) context_query = self._ensure_message_to_dicts(contexts) if new_record: @@ -840,6 +859,7 @@ async def text_chat_stream( prompt=None, session_id=None, image_urls=None, + audio_urls=None, func_tool=None, contexts=None, system_prompt=None, @@ -854,7 +874,10 @@ async def text_chat_stream( new_record = None if prompt is not None: new_record = await self.assemble_context( - prompt, image_urls, extra_user_content_parts + prompt or "", + image_urls, + audio_urls, + extra_user_content_parts, ) context_query = self._ensure_message_to_dicts(contexts) if new_record: @@ -920,6 +943,7 @@ async def assemble_context( self, text: str, image_urls: list[str] | None = None, + 
audio_urls: list[str] | None = None, extra_user_content_parts: list[ContentPart] | None = None, ): """组装上下文。""" @@ -941,6 +965,43 @@ async def resolve_image_part(image_url: str) -> dict | None: "image_url": {"url": image_data}, } + async def resolve_audio_part(audio_path: str) -> dict | None: + if audio_path.startswith("http"): + suffix = Path(urlparse(audio_path).path).suffix or ".wav" + temp_dir = Path(get_astrbot_temp_path()) + temp_dir.mkdir(parents=True, exist_ok=True) + resolved_path = str( + temp_dir / f"provider_audio_{uuid.uuid4().hex}{suffix}" + ) + await download_file(audio_path, resolved_path) + elif audio_path.startswith("file:///"): + resolved_path = audio_path.replace("file:///", "") + else: + resolved_path = audio_path + + suffix = Path(resolved_path).suffix.lower() + if suffix != ".mp3": + resolved_path = await ensure_wav(resolved_path) + suffix = ".wav" + + try: + audio_bytes = Path(resolved_path).read_bytes() + except OSError as exc: + logger.warning( + f"Failed to read audio file {resolved_path}, skipping. 
Error: {exc}" + ) + return None + + mime_type = { + ".wav": "audio/wav", + ".mp3": "audio/mp3", + }.get(suffix, "audio/wav") + audio_data = base64.b64encode(audio_bytes).decode("utf-8") + return { + "type": "audio_url", + "audio_url": {"url": f"data:{mime_type};base64,{audio_data}"}, + } + # 构建内容块列表 content_blocks = [] @@ -949,7 +1010,9 @@ async def resolve_image_part(image_url: str) -> dict | None: content_blocks.append({"type": "text", "text": text}) elif image_urls: # 如果没有文本但有图片,添加占位文本 - content_blocks.append({"type": "text", "text": "[图片]"}) + content_blocks.append({"type": "text", "text": "[Image]"}) + elif audio_urls: + content_blocks.append({"type": "text", "text": "[Audio]"}) elif extra_user_content_parts: # 如果只有额外内容块,也需要添加占位文本 content_blocks.append({"type": "text", "text": " "}) @@ -963,6 +1026,10 @@ async def resolve_image_part(image_url: str) -> dict | None: image_part = await resolve_image_part(part.image_url.url) if image_part: content_blocks.append(image_part) + elif isinstance(part, AudioURLPart): + audio_part = await resolve_audio_part(part.audio_url.url) + if audio_part: + content_blocks.append(audio_part) else: raise ValueError(f"不支持的额外内容块类型: {type(part)}") @@ -973,11 +1040,18 @@ async def resolve_image_part(image_url: str) -> dict | None: if image_part: content_blocks.append(image_part) + if audio_urls: + for audio_path in audio_urls: + audio_part = await resolve_audio_part(audio_path) + if audio_part: + content_blocks.append(audio_part) + # 如果只有主文本且没有额外内容块和图片,返回简单格式以保持向后兼容 if ( text and not extra_user_content_parts and not image_urls + and not audio_urls and len(content_blocks) == 1 and content_blocks[0]["type"] == "text" ): diff --git a/astrbot/core/provider/sources/openai_source.py b/astrbot/core/provider/sources/openai_source.py index 33432b6636..b19f3460dd 100644 --- a/astrbot/core/provider/sources/openai_source.py +++ b/astrbot/core/provider/sources/openai_source.py @@ -5,6 +5,7 @@ import json import random import re +import uuid from 
collections.abc import AsyncGenerator from io import BytesIO from pathlib import Path @@ -24,12 +25,20 @@ import astrbot.core.message.components as Comp from astrbot import logger from astrbot.api.provider import Provider -from astrbot.core.agent.message import ContentPart, ImageURLPart, Message, TextPart +from astrbot.core.agent.message import ( + AudioURLPart, + ContentPart, + ImageURLPart, + Message, + TextPart, +) from astrbot.core.agent.tool import ToolSet from astrbot.core.exceptions import EmptyModelOutputError from astrbot.core.message.message_event_result import MessageChain from astrbot.core.provider.entities import LLMResponse, TokenUsage, ToolCallsResult -from astrbot.core.utils.io import download_image_by_url +from astrbot.core.utils.astrbot_path import get_astrbot_temp_path +from astrbot.core.utils.io import download_file, download_image_by_url +from astrbot.core.utils.media_utils import ensure_wav from astrbot.core.utils.network_utils import ( create_proxy_client, is_connection_error, @@ -136,7 +145,10 @@ def _context_contains_image(contexts: list[dict]) -> bool: if not isinstance(content, list): continue for item in content: - if isinstance(item, dict) and item.get("type") == "image_url": + if isinstance(item, dict) and item.get("type") in { + "image_url", + "audio_url", + }: return True return False @@ -285,24 +297,103 @@ def _extract_image_part_info(self, part: dict) -> tuple[str | None, str | None]: image_detail = None return url, image_detail - async def _transform_content_part(self, part: dict) -> dict: - url, image_detail = self._extract_image_part_info(part) - if not url: - return part + def _extract_audio_part_info(self, part: dict) -> str | None: + if not isinstance(part, dict) or part.get("type") != "audio_url": + return None + audio_url_data = part.get("audio_url") + if not isinstance(audio_url_data, dict): + logger.warning("音频内容块格式无效,将保留原始内容。") + return None + + url = audio_url_data.get("url") + if not isinstance(url, str) or not url: + 
logger.warning("音频内容块缺少有效路径,将保留原始内容。") + return None + + return url + + async def _audio_ref_to_local_path(self, audio_ref: str) -> tuple[str, list[Path]]: + cleanup_paths: list[Path] = [] + if audio_ref.startswith("http"): + suffix = Path(urlparse(audio_ref).path).suffix or ".wav" + temp_dir = Path(get_astrbot_temp_path()) + temp_dir.mkdir(parents=True, exist_ok=True) + target_path = temp_dir / f"provider_audio_{uuid.uuid4().hex}{suffix}" + await download_file(audio_ref, str(target_path)) + cleanup_paths.append(target_path) + return str(target_path), cleanup_paths + if audio_ref.startswith("file://"): + return self._file_uri_to_path(audio_ref), cleanup_paths + return audio_ref, cleanup_paths + + async def _resolve_audio_part(self, audio_ref: str) -> dict | None: + cleanup_paths: list[Path] = [] try: - resolved_part = await self._resolve_image_part( - url, image_detail=image_detail - ) + audio_path, cleanup_paths = await self._audio_ref_to_local_path(audio_ref) + suffix = Path(audio_path).suffix.lower() + if suffix == ".mp3": + audio_format = "mp3" + else: + converted_audio_path = await ensure_wav(audio_path) + if converted_audio_path != audio_path: + cleanup_paths.append(Path(converted_audio_path)) + audio_path = converted_audio_path + audio_format = "wav" + audio_bytes = Path(audio_path).read_bytes() except Exception as exc: - logger.warning( - "图片 %s 预处理失败,将保留原始内容。错误: %s", - url, - exc, - ) + logger.warning("音频 %s 预处理失败,将忽略。错误: %s", audio_ref, exc) + return None + finally: + for cleanup_path in cleanup_paths: + try: + cleanup_path.unlink(missing_ok=True) + except Exception as cleanup_exc: + logger.warning( + "Failed to cleanup %s: %s", + cleanup_path, + cleanup_exc, + ) + + return { + "type": "input_audio", + "input_audio": { + "data": base64.b64encode(audio_bytes).decode("utf-8"), + "format": audio_format, + }, + } + + async def _transform_content_part(self, part: dict) -> dict: + if not isinstance(part, dict): return part - return resolved_part or part + if 
part.get("type") == "image_url": + url, image_detail = self._extract_image_part_info(part) + if not url: + return part + + try: + resolved_part = await self._resolve_image_part( + url, image_detail=image_detail + ) + except Exception as exc: + logger.warning( + "图片 %s 预处理失败,将保留原始内容。错误: %s", + url, + exc, + ) + return part + + return resolved_part or part + + if part.get("type") == "audio_url": + audio_ref = self._extract_audio_part_info(part) + if not audio_ref: + return part + resolved_part = await self._resolve_audio_part(audio_ref) + return resolved_part or part + + return part async def _materialize_message_image_parts(self, message: dict) -> dict: content = message.get("content") @@ -816,6 +907,7 @@ async def _prepare_chat_payload( self, prompt: str | None, image_urls: list[str] | None = None, + audio_urls: list[str] | None = None, contexts: list[dict] | list[Message] | None = None, system_prompt: str | None = None, tool_calls_result: ToolCallsResult | list[ToolCallsResult] | None = None, @@ -829,7 +921,10 @@ async def _prepare_chat_payload( new_record = None if prompt is not None: new_record = await self.assemble_context( - prompt, image_urls, extra_user_content_parts + prompt or "", + image_urls, + audio_urls, + extra_user_content_parts, ) context_query = copy.deepcopy(self._ensure_message_to_dicts(contexts)) if new_record: @@ -1016,6 +1111,7 @@ async def text_chat( prompt=None, session_id=None, image_urls=None, + audio_urls=None, func_tool=None, contexts=None, system_prompt=None, @@ -1028,6 +1124,7 @@ async def text_chat( payloads, context_query = await self._prepare_chat_payload( prompt, image_urls, + audio_urls, contexts, system_prompt, tool_calls_result, @@ -1087,6 +1184,7 @@ async def text_chat_stream( prompt=None, session_id=None, image_urls=None, + audio_urls=None, func_tool=None, contexts=None, system_prompt=None, @@ -1099,6 +1197,7 @@ async def text_chat_stream( payloads, context_query = await self._prepare_chat_payload( prompt, image_urls, + 
audio_urls, contexts, system_prompt, tool_calls_result, @@ -1183,6 +1282,7 @@ async def assemble_context( self, text: str, image_urls: list[str] | None = None, + audio_urls: list[str] | None = None, extra_user_content_parts: list[ContentPart] | None = None, ) -> dict: """组装成符合 OpenAI 格式的 role 为 user 的消息段""" @@ -1195,7 +1295,9 @@ async def assemble_context( content_blocks.append({"type": "text", "text": text}) elif image_urls: # 如果没有文本但有图片,添加占位文本 - content_blocks.append({"type": "text", "text": "[图片]"}) + content_blocks.append({"type": "text", "text": "[Image]"}) + elif audio_urls: + content_blocks.append({"type": "text", "text": "[Audio]"}) elif extra_user_content_parts: # 如果只有额外内容块,也需要添加占位文本 content_blocks.append({"type": "text", "text": " "}) @@ -1211,6 +1313,10 @@ async def assemble_context( ) if image_part: content_blocks.append(image_part) + elif isinstance(part, AudioURLPart): + audio_part = await self._resolve_audio_part(part.audio_url.url) + if audio_part: + content_blocks.append(audio_part) else: raise ValueError(f"不支持的额外内容块类型: {type(part)}") @@ -1221,11 +1327,18 @@ async def assemble_context( if image_part: content_blocks.append(image_part) + if audio_urls: + for audio_path in audio_urls: + audio_part = await self._resolve_audio_part(audio_path) + if audio_part: + content_blocks.append(audio_part) + # 如果只有主文本且没有额外内容块和图片,返回简单格式以保持向后兼容 if ( text and not extra_user_content_parts and not image_urls + and not audio_urls and len(content_blocks) == 1 and content_blocks[0]["type"] == "text" ): diff --git a/astrbot/core/star/context.py b/astrbot/core/star/context.py index 606f46dd73..058cf61e54 100644 --- a/astrbot/core/star/context.py +++ b/astrbot/core/star/context.py @@ -107,6 +107,7 @@ async def llm_generate( chat_provider_id: str, prompt: str | None = None, image_urls: list[str] | None = None, + audio_urls: list[str] | None = None, tools: ToolSet | None = None, system_prompt: str | None = None, contexts: list[Message] | None = None, @@ -120,6 +121,7 @@ async 
def llm_generate( chat_provider_id: The chat provider ID to use. prompt: The prompt to send to the LLM, if `contexts` and `prompt` are both provided, `prompt` will be appended as the last user message image_urls: List of image URLs to include in the prompt, if `contexts` and `prompt` are both provided, `image_urls` will be appended to the last user message + audio_urls: List of audio URLs or local paths to include in the prompt, if `contexts` and `prompt` are both provided, `audio_urls` will be appended to the last user message tools: ToolSet of tools available to the LLM system_prompt: System prompt to guide the LLM's behavior, if provided, it will always insert as the first system message in the context contexts: context messages for the LLM @@ -135,6 +137,7 @@ async def llm_generate( llm_resp = await prov.text_chat( prompt=prompt, image_urls=image_urls, + audio_urls=audio_urls, func_tool=tools, contexts=contexts, system_prompt=system_prompt, @@ -149,6 +152,7 @@ async def tool_loop_agent( chat_provider_id: str, prompt: str | None = None, image_urls: list[str] | None = None, + audio_urls: list[str] | None = None, tools: ToolSet | None = None, system_prompt: str | None = None, contexts: list[Message] | None = None, @@ -165,6 +169,7 @@ async def tool_loop_agent( chat_provider_id: The chat provider ID to use. 
prompt: The prompt to send to the LLM, if `contexts` and `prompt` are both provided, `prompt` will be appended as the last user message image_urls: List of image URLs to include in the prompt, if `contexts` and `prompt` are both provided, `image_urls` will be appended to the last user message + audio_urls: List of audio URLs or local paths to include in the prompt, if `contexts` and `prompt` are both provided, `audio_urls` will be appended to the last user message tools: ToolSet of tools available to the LLM system_prompt: System prompt to guide the LLM's behavior, if provided, it will always insert as the first system message in the context contexts: context messages for the LLM @@ -207,6 +212,7 @@ async def tool_loop_agent( request = ProviderRequest( prompt=prompt, image_urls=image_urls or [], + audio_urls=audio_urls or [], func_tool=tools, contexts=context_, system_prompt=system_prompt or "", diff --git a/astrbot/core/utils/media_utils.py b/astrbot/core/utils/media_utils.py index d3f3cc75d3..40f1e60495 100644 --- a/astrbot/core/utils/media_utils.py +++ b/astrbot/core/utils/media_utils.py @@ -229,7 +229,7 @@ async def convert_audio_format( Args: audio_path: 原始音频文件路径 - output_format: 目标格式,例如 amr / ogg + output_format: 目标格式,例如 amr / ogg / opus / wav output_path: 输出文件路径,如果为None则自动生成 Returns: @@ -248,6 +248,8 @@ async def convert_audio_format( args.extend(["-ac", "1", "-ar", "8000", "-ab", "12.2k"]) elif output_format == "ogg": args.extend(["-acodec", "libopus", "-ac", "1", "-ar", "16000"]) + elif output_format == "opus": + args.extend(["-acodec", "libopus", "-ac", "1", "-ar", "16000"]) args.append(output_path) try: @@ -289,11 +291,67 @@ async def convert_audio_to_wav(audio_path: str, output_path: str | None = None) ) +async def ensure_wav(audio_path: str, output_path: str | None = None) -> str: + """Ensure the audio path points to wav format by extension/guess and convert when needed. 
+
+    If the file appears to already be wav, return it directly to avoid extra conversion.
+    """
+
+    if not audio_path:
+        return audio_path
+
+    if _get_audio_magic_type(audio_path) == "wav":
+        return audio_path
+
+    return await convert_audio_to_wav(audio_path, output_path)
+
+
+def _get_audio_magic_type(audio_path: str) -> str:
+    """Detect common audio formats from magic bytes."""
+    try:
+        with open(audio_path, "rb") as f:
+            header = f.read(64)
+    except FileNotFoundError:
+        logger.warning(f"[Media Utils] wav check file not found: {audio_path}")
+        return ""
+    except Exception as e:
+        logger.warning(f"[Media Utils] wav check failed: {audio_path}, error: {e}")
+        return ""
+
+    if len(header) < 12:
+        return ""
+
+    if header[:4] == b"RIFF" and header[8:12] == b"WAVE":
+        return "wav"
+
+    if header[:5] == b"#!AMR":
+        return "amr"
+
+    if header[:4] == b"OggS":
+        if b"OpusHead" in header:
+            return "opus"
+        return "ogg"
+
+    if header[:4] == b"fLaC":
+        return "flac"
+
+    if header[:3] == b"ID3" or header[:2] in (b"\xff\xfb", b"\xff\xf3", b"\xff\xf2"):
+        return "mp3"
+
+    if header[4:8] == b"ftyp" and b"mp4" in header[8:12]:
+        return "mp4"
+
+    if header[:9] == b"#!SILK_V3":
+        return "silk"
+
+    return ""
+
+
 async def extract_video_cover(
     video_path: str,
     output_path: str | None = None,
 ) -> str:
-    """从视频中提取封面图(JPG)。"""
+    """从视频中提取封面图(JPG)"""
     if output_path is None:
         temp_dir = Path(get_astrbot_temp_path())
         temp_dir.mkdir(parents=True, exist_ok=True)
diff --git a/dashboard/src/assets/mdi-subset/materialdesignicons-subset.css b/dashboard/src/assets/mdi-subset/materialdesignicons-subset.css
index 3baa7d48ed..da960e963a 100644
--- a/dashboard/src/assets/mdi-subset/materialdesignicons-subset.css
+++ b/dashboard/src/assets/mdi-subset/materialdesignicons-subset.css
@@ -1,4 +1,4 @@
-/* Auto-generated MDI subset – 255 icons */
+/* Auto-generated MDI subset – 256 icons */
 /* Do not edit manually.
Run: pnpm run subset-icons */ @font-face { @@ -684,6 +684,10 @@ content: "\F0375"; } +.mdi-music-note-outline::before { + content: "\F0F74"; +} + .mdi-note-text-outline::before { content: "\F11D7"; } diff --git a/dashboard/src/assets/mdi-subset/materialdesignicons-webfont-subset.woff b/dashboard/src/assets/mdi-subset/materialdesignicons-webfont-subset.woff index 1c3a7f0274..2027f555ad 100644 Binary files a/dashboard/src/assets/mdi-subset/materialdesignicons-webfont-subset.woff and b/dashboard/src/assets/mdi-subset/materialdesignicons-webfont-subset.woff differ diff --git a/dashboard/src/assets/mdi-subset/materialdesignicons-webfont-subset.woff2 b/dashboard/src/assets/mdi-subset/materialdesignicons-webfont-subset.woff2 index 923b8a10a7..79b3946fbc 100644 Binary files a/dashboard/src/assets/mdi-subset/materialdesignicons-webfont-subset.woff2 and b/dashboard/src/assets/mdi-subset/materialdesignicons-webfont-subset.woff2 differ diff --git a/dashboard/src/components/chat/ProviderConfigDialog.vue b/dashboard/src/components/chat/ProviderConfigDialog.vue index 51ff37677f..359bcecdbd 100644 --- a/dashboard/src/components/chat/ProviderConfigDialog.vue +++ b/dashboard/src/components/chat/ProviderConfigDialog.vue @@ -63,6 +63,7 @@ mdi-eye-outline + + +