5151 retrieve_knowledge_base ,
5252)
5353from astrbot .core .conversation_mgr import Conversation
54- from astrbot .core .message .components import File , Image , Reply
54+ from astrbot .core .message .components import File , Image , Record , Reply
5555from astrbot .core .persona_error_reply import (
5656 extract_persona_custom_error_message_from_persona ,
5757 set_persona_custom_error_message_on_event ,
@@ -515,6 +515,18 @@ def _append_quoted_image_attachment(req: ProviderRequest, image_path: str) -> No
515515 )
516516
517517
def _append_audio_attachment(req: ProviderRequest, audio_path: str) -> None:
    """Append a text part to the request noting an audio attachment's path.

    Args:
        req: Request whose ``extra_user_content_parts`` list receives the
            new ``TextPart``.
        audio_path: Path string interpolated into the note text.
    """
    attachment_note = TextPart(text=f"[Audio Attachment: path { audio_path } ]")
    req.extra_user_content_parts.append(attachment_note)
522+
523+
def _append_quoted_audio_attachment(req: ProviderRequest, audio_path: str) -> None:
    """Append a text part noting an audio attachment found in a quoted message.

    Args:
        req: Request whose ``extra_user_content_parts`` list receives the
            new ``TextPart``.
        audio_path: Path string interpolated into the note text.
    """
    quoted_note = TextPart(
        text=f"[Audio Attachment in quoted message: path { audio_path } ]"
    )
    req.extra_user_content_parts.append(quoted_note)
528+
529+
518530def _get_quoted_message_parser_settings (
519531 provider_settings : dict [str , object ] | None ,
520532) -> QuotedMessageParserSettings :
@@ -753,12 +765,25 @@ def _modalities_fix(provider: Provider, req: ProviderRequest) -> None:
753765 "Provider %s does not support image, using placeholder." , provider
754766 )
755767 image_count = len (req .image_urls )
756- placeholder = " " .join (["[图片 ]" ] * image_count )
768+ placeholder = " " .join (["[Image ]" ] * image_count )
757769 if req .prompt :
758770 req .prompt = f"{ placeholder } { req .prompt } "
759771 else :
760772 req .prompt = placeholder
761773 req .image_urls = []
774+ if req .audio_urls :
775+ provider_cfg = provider .provider_config .get ("modalities" , ["audio" ])
776+ if "audio" not in provider_cfg :
777+ logger .debug (
778+ "Provider %s does not support audio, using placeholder." , provider
779+ )
780+ audio_count = len (req .audio_urls )
781+ placeholder = " " .join (["[Audio]" ] * audio_count )
782+ if req .prompt :
783+ req .prompt = f"{ placeholder } { req .prompt } "
784+ else :
785+ req .prompt = placeholder
786+ req .audio_urls = []
762787 if req .func_tool :
763788 provider_cfg = provider .provider_config .get ("modalities" , ["tool_use" ])
764789 if "tool_use" not in provider_cfg :
@@ -781,12 +806,14 @@ def _sanitize_context_by_modalities(
781806 if not modalities or not isinstance (modalities , list ):
782807 return
783808 supports_image = bool ("image" in modalities )
809+ supports_audio = bool ("audio" in modalities )
784810 supports_tool_use = bool ("tool_use" in modalities )
785- if supports_image and supports_tool_use :
811+ if supports_image and supports_audio and supports_tool_use :
786812 return
787813
788814 sanitized_contexts : list [dict ] = []
789815 removed_image_blocks = 0
816+ removed_audio_blocks = 0
790817 removed_tool_messages = 0
791818 removed_tool_calls = 0
792819
@@ -808,20 +835,27 @@ def _sanitize_context_by_modalities(
808835 new_msg .pop ("tool_calls" , None )
809836 new_msg .pop ("tool_call_id" , None )
810837
811- if not supports_image :
838+ if not supports_image or not supports_audio :
812839 content = new_msg .get ("content" )
813840 if isinstance (content , list ):
814841 filtered_parts : list = []
815- removed_any_image = False
842+ removed_any_multimodal = False
816843 for part in content :
817844 if isinstance (part , dict ):
818845 part_type = str (part .get ("type" , "" )).lower ()
819- if part_type in {"image_url" , "image" }:
820- removed_any_image = True
846+ if not supports_image and part_type in {"image_url" , "image" }:
847+ removed_any_multimodal = True
821848 removed_image_blocks += 1
822849 continue
850+ if not supports_audio and part_type in {
851+ "audio_url" ,
852+ "input_audio" ,
853+ }:
854+ removed_any_multimodal = True
855+ removed_audio_blocks += 1
856+ continue
823857 filtered_parts .append (part )
824- if removed_any_image :
858+ if removed_any_multimodal :
825859 new_msg ["content" ] = filtered_parts
826860
827861 if role == "assistant" :
@@ -835,11 +869,18 @@ def _sanitize_context_by_modalities(
835869
836870 sanitized_contexts .append (new_msg )
837871
838- if removed_image_blocks or removed_tool_messages or removed_tool_calls :
872+ if (
873+ removed_image_blocks
874+ or removed_audio_blocks
875+ or removed_tool_messages
876+ or removed_tool_calls
877+ ):
839878 logger .debug (
840879 "sanitize_context_by_modalities applied: "
841- "removed_image_blocks=%s, removed_tool_messages=%s, removed_tool_calls=%s" ,
880+ "removed_image_blocks=%s, removed_audio_blocks=%s, "
881+ "removed_tool_messages=%s, removed_tool_calls=%s" ,
842882 removed_image_blocks ,
883+ removed_audio_blocks ,
843884 removed_tool_messages ,
844885 removed_tool_calls ,
845886 )
@@ -1101,6 +1142,7 @@ async def build_main_agent(
11011142 req = ProviderRequest ()
11021143 req .prompt = ""
11031144 req .image_urls = []
1145+ req .audio_urls = []
11041146 if sel_model := event .get_extra ("selected_model" ):
11051147 req .model = sel_model
11061148 if config .provider_wake_prefix and not event .message_str .startswith (
@@ -1124,6 +1166,10 @@ async def build_main_agent(
11241166 req .extra_user_content_parts .append (
11251167 TextPart (text = f"[Image Attachment: path { image_path } ]" )
11261168 )
1169+ elif isinstance (comp , Record ):
1170+ audio_path = await comp .convert_to_file_path ()
1171+ req .audio_urls .append (audio_path )
1172+ _append_audio_attachment (req , audio_path )
11271173 elif isinstance (comp , File ):
11281174 file_path = await comp .get_file ()
11291175 file_name = comp .name or os .path .basename (file_path )
@@ -1155,6 +1201,10 @@ async def build_main_agent(
11551201 event .track_temporary_local_file (image_path )
11561202 req .image_urls .append (image_path )
11571203 _append_quoted_image_attachment (req , image_path )
1204+ elif isinstance (reply_comp , Record ):
1205+ audio_path = await reply_comp .convert_to_file_path ()
1206+ req .audio_urls .append (audio_path )
1207+ _append_quoted_audio_attachment (req , audio_path )
11581208 elif isinstance (reply_comp , File ):
11591209 file_path = await reply_comp .get_file ()
11601210 file_name = reply_comp .name or os .path .basename (file_path )
@@ -1222,14 +1272,15 @@ async def build_main_agent(
12221272 if isinstance (req .contexts , str ):
12231273 req .contexts = json .loads (req .contexts )
12241274 req .image_urls = normalize_and_dedupe_strings (req .image_urls )
1275+ req .audio_urls = normalize_and_dedupe_strings (req .audio_urls )
12251276
12261277 if config .file_extract_enabled :
12271278 try :
12281279 await _apply_file_extract (event , req , config )
12291280 except Exception as exc : # noqa: BLE001
12301281 logger .error ("Error occurred while applying file extract: %s" , exc )
12311282
1232- if not req .prompt and not req .image_urls :
1283+ if not req .prompt and not req .image_urls and not req . audio_urls :
12331284 if not event .get_group_id () and req .extra_user_content_parts :
12341285 req .prompt = "<attachment>"
12351286 else :
0 commit comments