From 10e685a4fc95f519491b7db459bbab316b37f4e4 Mon Sep 17 00:00:00 2001
From: jasagiri <172246+jasagiri@users.noreply.github.com>
Date: Mon, 18 May 2026 04:11:58 +0900
Subject: [PATCH] fix: read wav via soundfile to avoid torchcodec dependency

Since torchaudio 2.9 (announced by a deprecation warning in 2.8),
torchaudio.load routes through torchcodec, an optional dependency that
is often not installed. This makes load_wav fail with an ImportError on
otherwise working setups.

Read the file directly with soundfile and fall back to torchaudio
only if that fails.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 cosyvoice/utils/file_utils.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/cosyvoice/utils/file_utils.py b/cosyvoice/utils/file_utils.py
index b173ef201..ae943bb01 100644
--- a/cosyvoice/utils/file_utils.py
+++ b/cosyvoice/utils/file_utils.py
@@ -18,6 +18,8 @@
 import json
 import torch
 import torchaudio
+import soundfile as sf
+import numpy as np
 import logging
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
 logging.basicConfig(level=logging.DEBUG,
@@ -42,8 +44,21 @@ def read_json_lists(list_file):
 
 
 def load_wav(wav, target_sr, min_sr=16000):
-    speech, sample_rate = torchaudio.load(wav, backend='soundfile')
-    speech = speech.mean(dim=0, keepdim=True)
+    # Read via soundfile directly: since torchaudio 2.9, torchaudio.load routes
+    # through torchcodec, an optional dependency that is often not installed.
+    try:
+        speech_np, sample_rate = sf.read(wav, dtype='float32')
+        # Convert to torch tensor
+        if speech_np.ndim == 1:
+            speech = torch.from_numpy(speech_np).unsqueeze(0)
+        else:
+            # Multi-channel: convert to mono by averaging
+            speech = torch.from_numpy(speech_np.T).mean(dim=0, keepdim=True)
+    except Exception as e:
+        logging.warning(f'soundfile failed, falling back to torchaudio: {e}')
+        speech, sample_rate = torchaudio.load(wav, backend='soundfile')
+        speech = speech.mean(dim=0, keepdim=True)
+    
     if sample_rate != target_sr:
         assert sample_rate >= min_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
         speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)