diff --git a/swift/template/templates/qwen.py b/swift/template/templates/qwen.py index a12466b67c..7fab666fa3 100644 --- a/swift/template/templates/qwen.py +++ b/swift/template/templates/qwen.py @@ -660,9 +660,15 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int inputs.videos[index] = _video if self.use_audio_in_video: import librosa + # Librosa/soundfile do not support video containers (e.g. .mp4). Decode via ffmpeg (gh#8332). if video.startswith('http://') or video.startswith('https://'): import audioread video = audioread.ffdec.FFmpegAudioFile(video) + elif isinstance(video, str): + path_lower = video.lower() + if any(path_lower.endswith(ext) for ext in ('.mp4', '.webm', '.avi', '.mov', '.mkv')): + import audioread + video = audioread.ffdec.FFmpegAudioFile(video) video = librosa.load(video, sr=self.sampling_rate)[0] if self.mode != 'vllm': inputs.audios.insert(inputs.audio_idx, (video, 'video'))