From 10e685a4fc95f519491b7db459bbab316b37f4e4 Mon Sep 17 00:00:00 2001 From: jasagiri <172246+jasagiri@users.noreply.github.com> Date: Mon, 18 May 2026 04:11:58 +0900 Subject: [PATCH] fix: read wav via soundfile to avoid torchcodec dependency Since torchaudio 2.8, torchaudio.load routes through torchcodec, an optional dependency that is often not installed. This makes load_wav fail with an ImportError on otherwise working setups. Read the file directly with soundfile and fall back to torchaudio only if that fails. Co-Authored-By: Claude Opus 4.7 --- cosyvoice/utils/file_utils.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/cosyvoice/utils/file_utils.py b/cosyvoice/utils/file_utils.py index b173ef201..ae943bb01 100644 --- a/cosyvoice/utils/file_utils.py +++ b/cosyvoice/utils/file_utils.py @@ -18,6 +18,8 @@ import json import torch import torchaudio +import soundfile as sf +import numpy as np import logging logging.getLogger('matplotlib').setLevel(logging.WARNING) logging.basicConfig(level=logging.DEBUG, @@ -42,8 +44,21 @@ def read_json_lists(list_file): def load_wav(wav, target_sr, min_sr=16000): - speech, sample_rate = torchaudio.load(wav, backend='soundfile') - speech = speech.mean(dim=0, keepdim=True) + # Read via soundfile directly: since torchaudio 2.8, torchaudio.load routes + # through torchcodec, an optional dependency that is often not installed. + try: + speech_np, sample_rate = sf.read(wav, dtype='float32') + # Convert to torch tensor + if speech_np.ndim == 1: + speech = torch.from_numpy(speech_np).unsqueeze(0) + else: + # Multi-channel: convert to mono by averaging + speech = torch.from_numpy(speech_np.T).mean(dim=0, keepdim=True) + except Exception as e: + logging.warning(f'soundfile failed, falling back to torchaudio: {e}') + speech, sample_rate = torchaudio.load(wav, backend='soundfile') + speech = speech.mean(dim=0, keepdim=True) + if sample_rate != target_sr: assert sample_rate >= min_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr) speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)