From 771eec644d62684625d71e3a51517f9a50aa378b Mon Sep 17 00:00:00 2001 From: Liang Ding Date: Sun, 15 Mar 2026 03:03:33 +0800 Subject: [PATCH] [bugfix] fix load audio from local video path when USE_AUDIO_IN_VIDEO (fix #8332) --- swift/template/templates/qwen.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/swift/template/templates/qwen.py b/swift/template/templates/qwen.py index a12466b67c..7fab666fa3 100644 --- a/swift/template/templates/qwen.py +++ b/swift/template/templates/qwen.py @@ -660,9 +660,15 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int inputs.videos[index] = _video if self.use_audio_in_video: import librosa + # Librosa/soundfile do not support video containers (e.g. .mp4). Decode via ffmpeg (gh#8332). if video.startswith('http://') or video.startswith('https://'): import audioread video = audioread.ffdec.FFmpegAudioFile(video) + elif isinstance(video, str): + path_lower = video.lower() + if any(path_lower.endswith(ext) for ext in ('.mp4', '.webm', '.avi', '.mov', '.mkv')): + import audioread + video = audioread.ffdec.FFmpegAudioFile(video) video = librosa.load(video, sr=self.sampling_rate)[0] if self.mode != 'vllm': inputs.audios.insert(inputs.audio_idx, (video, 'video'))