2121from astrbot .core .astr_agent_run_util import AgentRunner
2222from astrbot .core .astr_agent_tool_exec import FunctionToolExecutor
2323from astrbot .core .conversation_mgr import Conversation
24- from astrbot .core .message .components import File , Image , Reply
24+ from astrbot .core .message .components import File , Image , Record , Reply
2525from astrbot .core .persona_error_reply import (
2626 extract_persona_custom_error_message_from_persona ,
2727 set_persona_custom_error_message_on_event ,
@@ -419,6 +419,18 @@ def _append_quoted_image_attachment(req: ProviderRequest, image_path: str) -> No
419419 )
420420
421421
def _append_audio_attachment(req: ProviderRequest, audio_path: str) -> None:
    """Append a text part that records an audio attachment's path to *req*.

    Args:
        req: Request whose ``extra_user_content_parts`` list receives the
            new part; it is mutated in place.
        audio_path: Path string embedded verbatim in the note text.
    """
    note = TextPart(text=f"[Audio Attachment: path { audio_path } ]")
    req.extra_user_content_parts.append(note)
426+
427+
def _append_quoted_audio_attachment(req: ProviderRequest, audio_path: str) -> None:
    """Append a text part recording a quoted message's audio path to *req*.

    Args:
        req: Request whose ``extra_user_content_parts`` list receives the
            new part; it is mutated in place.
        audio_path: Path string embedded verbatim in the note text.
    """
    note = TextPart(
        text=f"[Audio Attachment in quoted message: path { audio_path } ]"
    )
    req.extra_user_content_parts.append(note)
432+
433+
422434def _get_quoted_message_parser_settings (
423435 provider_settings : dict [str , object ] | None ,
424436) -> QuotedMessageParserSettings :
@@ -704,12 +716,25 @@ def _modalities_fix(provider: Provider, req: ProviderRequest) -> None:
704716 "Provider %s does not support image, using placeholder." , provider
705717 )
706718 image_count = len (req .image_urls )
707- placeholder = " " .join (["[图片 ]" ] * image_count )
719+ placeholder = " " .join (["[Image ]" ] * image_count )
708720 if req .prompt :
709721 req .prompt = f"{ placeholder } { req .prompt } "
710722 else :
711723 req .prompt = placeholder
712724 req .image_urls = []
725+ if req .audio_urls :
726+ provider_cfg = provider .provider_config .get ("modalities" , ["audio" ])
727+ if "audio" not in provider_cfg :
728+ logger .debug (
729+ "Provider %s does not support audio, using placeholder." , provider
730+ )
731+ audio_count = len (req .audio_urls )
732+ placeholder = " " .join (["[Audio]" ] * audio_count )
733+ if req .prompt :
734+ req .prompt = f"{ placeholder } { req .prompt } "
735+ else :
736+ req .prompt = placeholder
737+ req .audio_urls = []
713738 if req .func_tool :
714739 provider_cfg = provider .provider_config .get ("modalities" , ["tool_use" ])
715740 if "tool_use" not in provider_cfg :
@@ -730,11 +755,13 @@ def _sanitize_context_by_modalities(
730755 if not modalities or not isinstance (modalities , list ):
731756 return
732757 supports_image = bool ("image" in modalities )
758+ supports_audio = bool ("audio" in modalities )
733759 supports_tool_use = bool ("tool_use" in modalities )
734- if supports_image and supports_tool_use :
760+ if supports_image and supports_audio and supports_tool_use :
735761 return
736762 sanitized_contexts : list [dict ] = []
737763 removed_image_blocks = 0
764+ removed_audio_blocks = 0
738765 removed_tool_messages = 0
739766 removed_tool_calls = 0
740767 for msg in req .contexts :
@@ -753,20 +780,28 @@ def _sanitize_context_by_modalities(
753780 removed_tool_calls += 1
754781 new_msg .pop ("tool_calls" , None )
755782 new_msg .pop ("tool_call_id" , None )
756- if not supports_image :
783+
784+ if not supports_image or not supports_audio :
757785 content = new_msg .get ("content" )
758786 if isinstance (content , list ):
759787 filtered_parts : list = []
760- removed_any_image = False
788+ removed_any_multimodal = False
761789 for part in content :
762790 if isinstance (part , dict ):
763791 part_type = str (part .get ("type" , "" )).lower ()
764- if part_type in {"image_url" , "image" }:
765- removed_any_image = True
792+ if not supports_image and part_type in {"image_url" , "image" }:
793+ removed_any_multimodal = True
766794 removed_image_blocks += 1
767795 continue
796+ if not supports_audio and part_type in {
797+ "audio_url" ,
798+ "input_audio" ,
799+ }:
800+ removed_any_multimodal = True
801+ removed_audio_blocks += 1
802+ continue
768803 filtered_parts .append (part )
769- if removed_any_image :
804+ if removed_any_multimodal :
770805 new_msg ["content" ] = filtered_parts
771806 if role == "assistant" :
772807 content = new_msg .get ("content" )
@@ -777,10 +812,19 @@ def _sanitize_context_by_modalities(
777812 if isinstance (content , str ) and (not content .strip ()):
778813 continue
779814 sanitized_contexts .append (new_msg )
780- if removed_image_blocks or removed_tool_messages or removed_tool_calls :
815+
816+ if (
817+ removed_image_blocks
818+ or removed_audio_blocks
819+ or removed_tool_messages
820+ or removed_tool_calls
821+ ):
781822 logger .debug (
782- "sanitize_context_by_modalities applied: removed_image_blocks=%s, removed_tool_messages=%s, removed_tool_calls=%s" ,
823+ "sanitize_context_by_modalities applied: "
824+ "removed_image_blocks=%s, removed_audio_blocks=%s, "
825+ "removed_tool_messages=%s, removed_tool_calls=%s" ,
783826 removed_image_blocks ,
827+ removed_audio_blocks ,
784828 removed_tool_messages ,
785829 removed_tool_calls ,
786830 )
@@ -969,6 +1013,7 @@ async def build_main_agent(
9691013 req = ProviderRequest ()
9701014 req .prompt = ""
9711015 req .image_urls = []
1016+ req .audio_urls = []
9721017 if sel_model := event .get_extra ("selected_model" ):
9731018 req .model = sel_model
9741019 if config .provider_wake_prefix and (
@@ -988,6 +1033,10 @@ async def build_main_agent(
9881033 req .extra_user_content_parts .append (
9891034 TextPart (text = f"[Image Attachment: path { image_path } ]" )
9901035 )
1036+ elif isinstance (comp , Record ):
1037+ audio_path = await comp .convert_to_file_path ()
1038+ req .audio_urls .append (audio_path )
1039+ _append_audio_attachment (req , audio_path )
9911040 elif isinstance (comp , File ):
9921041 file_path = await comp .get_file ()
9931042 file_name = comp .name or os .path .basename (file_path )
@@ -1017,6 +1066,10 @@ async def build_main_agent(
10171066 event .track_temporary_local_file (image_path )
10181067 req .image_urls .append (image_path )
10191068 _append_quoted_image_attachment (req , image_path )
1069+ elif isinstance (reply_comp , Record ):
1070+ audio_path = await reply_comp .convert_to_file_path ()
1071+ req .audio_urls .append (audio_path )
1072+ _append_quoted_audio_attachment (req , audio_path )
10201073 elif isinstance (reply_comp , File ):
10211074 file_path = await reply_comp .get_file ()
10221075 file_name = reply_comp .name or os .path .basename (file_path )
@@ -1074,12 +1127,15 @@ async def build_main_agent(
10741127 if isinstance (req .contexts , str ):
10751128 req .contexts = json .loads (req .contexts )
10761129 req .image_urls = normalize_and_dedupe_strings (req .image_urls )
1130+ req .audio_urls = normalize_and_dedupe_strings (req .audio_urls )
1131+
10771132 if config .file_extract_enabled :
10781133 try :
10791134 await _apply_file_extract (event , req , config )
10801135 except Exception as exc :
10811136 logger .error ("Error occurred while applying file extract: %s" , exc )
1082- if not req .prompt and (not req .image_urls ):
1137+
1138+ if not req .prompt and not req .image_urls and not req .audio_urls :
10831139 if not event .get_group_id () and req .extra_user_content_parts :
10841140 req .prompt = "<attachment>"
10851141 else :
0 commit comments