FunAudioLLM · petercai · Dec 14, 2025
diff --git a/asset/zero_shot_prompt.txt b/asset/zero_shot_prompt.txt
@@ -0,0 +1 @@
+希望你以后能够做的比我还好呦。
diff --git a/tools/sft_en_test.py b/tools/sft_en_test.py
@@ -0,0 +1,33 @@
+"""Synthesize one English sentence with each English SFT speaker of CosyVoice-300M-SFT.
+
+One ``sft_en_<spk_id>.wav`` file per speaker is written to the current working directory.
+"""
+import os
+import sys
+
+# Resolve everything against the repository root (the parent of tools/) so the
+# script does not depend on the directory it is launched from.
+REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
+sys.path.insert(0, REPO_ROOT)
+sys.path.append(os.path.join(REPO_ROOT, 'third_party/Matcha-TTS'))
+import torch
+import torchaudio
+from cosyvoice.cli.cosyvoice import CosyVoice
+
+
+cosyvoice = CosyVoice(os.path.join(REPO_ROOT, 'pretrained_models/CosyVoice-300M-SFT'), load_jit=False, load_trt=False, fp16=False)
+
+en_spk_ids = ['英文女', '英文男']
+
+tts_text = 'Maintaining your ability to learn translates into increased marketability, improved career options and higher salaries.'
+
+for spk_id in en_spk_ids:
+    # inference_sft yields dicts; collect every 'tts_speech' tensor and join them along dim 1.
+    chunks = [out['tts_speech'] for out in cosyvoice.inference_sft(tts_text, spk_id, stream=False)]
+    if not chunks:
+        print('no audio generated for', spk_id)
+        continue
+    speech = torch.cat(chunks, dim=1)
+    filename = f'sft_en_{spk_id}.wav'
+    torchaudio.save(filename, speech, cosyvoice.sample_rate)
+    print('saved', filename)
diff --git a/tools/sft_speak_id.py b/tools/sft_speak_id.py
@@ -0,0 +1,18 @@
+"""Print the speaker ids available in the CosyVoice-300M-SFT model."""
+import os
+import sys
+
+# Resolve everything against the repository root (the parent of tools/) so the
+# script does not depend on the directory it is launched from.
+REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
+sys.path.insert(0, REPO_ROOT)
+sys.path.append(os.path.join(REPO_ROOT, 'third_party/Matcha-TTS'))
+from cosyvoice.cli.cosyvoice import CosyVoice
+
+
+cosyvoice = CosyVoice(os.path.join(REPO_ROOT, 'pretrained_models/CosyVoice-300M-SFT'), load_jit=False, load_trt=False, fp16=False)
+
+
+# List the speaker ids the loaded SFT model exposes; nothing is synthesized here.
+spk_ids = cosyvoice.list_available_spks()
+print('available spk_id:', spk_ids)
diff --git a/tools/sft_test.py b/tools/sft_test.py
@@ -0,0 +1,33 @@
+"""Synthesize one Chinese sentence with several SFT speakers of CosyVoice-300M-SFT.
+
+One ``sft_<spk_id>.wav`` file per speaker is written to the current working directory.
+"""
+import os
+import sys
+
+# Resolve everything against the repository root (the parent of tools/) so the
+# script does not depend on the directory it is launched from.
+REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
+sys.path.insert(0, REPO_ROOT)
+sys.path.append(os.path.join(REPO_ROOT, 'third_party/Matcha-TTS'))
+import torch
+import torchaudio
+from cosyvoice.cli.cosyvoice import CosyVoice
+
+
+cosyvoice = CosyVoice(os.path.join(REPO_ROOT, 'pretrained_models/CosyVoice-300M-SFT'), load_jit=False, load_trt=False, fp16=False)
+
+tts_text = '收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。'
+
+spk_ids = ['中文女', '中文男', '粤语女']
+
+for spk_id in spk_ids:
+    # inference_sft yields dicts; collect every 'tts_speech' tensor and join them along dim 1.
+    chunks = [out['tts_speech'] for out in cosyvoice.inference_sft(tts_text, spk_id, stream=False)]
+    if not chunks:
+        print('no audio generated for', spk_id)
+        continue
+    speech = torch.cat(chunks, dim=1)
+    filename = f'sft_{spk_id}.wav'
+    torchaudio.save(filename, speech, cosyvoice.sample_rate)
+    print('saved', filename)
diff --git a/tools/zero_short_speak_id.py b/tools/zero_short_speak_id.py
@@ -0,0 +1,41 @@
+"""Register a zero-shot speaker with CosyVoice2 and synthesize using the saved speaker id.
+
+The prompt audio is asset/zero_shot_prompt.wav and its transcript is read from the
+.txt file next to it. Output is written as ``speaker_id_<i>.wav`` in the current
+working directory.
+"""
+import os
+from pathlib import Path
+import sys
+
+# Resolve everything against the repository root (the parent of tools/) so the
+# script does not depend on the directory it is launched from.
+REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
+sys.path.insert(0, REPO_ROOT)
+sys.path.append(os.path.join(REPO_ROOT, 'third_party/Matcha-TTS'))
+import torchaudio
+from cosyvoice.cli.cosyvoice import CosyVoice2
+from cosyvoice.utils.file_utils import load_wav
+
+
+cosyvoice = CosyVoice2(os.path.join(REPO_ROOT, 'pretrained_models/CosyVoice2-0.5B'), load_jit=False, load_trt=False, load_vllm=False, fp16=False)
+
+# NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
+# zero_shot usage
+voice_file = os.path.join(REPO_ROOT, 'asset/zero_shot_prompt.wav')
+prompt_speech_16k = load_wav(voice_file, 16000)
+prompt_text_file = Path(voice_file).with_suffix(".txt")
+# Fail early with a clear message instead of hitting a NameError on prompt_text below.
+if not prompt_text_file.exists():
+    raise FileNotFoundError(f'prompt transcript not found: {prompt_text_file}')
+prompt_text = prompt_text_file.read_text(encoding='utf-8').strip()
+
+# save zero_shot spk as a voice_id for future usage
+# Call it outside of an assert: asserts are stripped under `python -O`, which would skip the registration.
+if cosyvoice.add_zero_shot_spk(prompt_text, prompt_speech_16k, 'my_zero_shot_spk') is not True:
+    raise RuntimeError('failed to register zero-shot speaker my_zero_shot_spk')
+# pass the registered "my_zero_shot_spk" id instead of the prompt text and "prompt_speech_16k"
+for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '', '', zero_shot_spk_id='my_zero_shot_spk', stream=False)):
+    torchaudio.save('speaker_id_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+# this step is optional unless you want to persist "my_zero_shot_spk" into "model_dir/spk2info.pt" for a new session
+cosyvoice.save_spkinfo()
diff --git a/tools/zero_short_test.py b/tools/zero_short_test.py
@@ -0,0 +1,34 @@
+"""Zero-shot synthesis with CosyVoice2-0.5B from a prompt recording and its transcript.
+
+The prompt audio is asset/zero_shot_prompt.wav and its transcript is read from the
+.txt file next to it. Output is written as ``zero_shot_<i>.wav`` in the current
+working directory.
+"""
+import os
+from pathlib import Path
+import sys
+
+# Resolve everything against the repository root (the parent of tools/) so the
+# script does not depend on the directory it is launched from.
+REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
+sys.path.insert(0, REPO_ROOT)
+sys.path.append(os.path.join(REPO_ROOT, 'third_party/Matcha-TTS'))
+import torchaudio
+from cosyvoice.cli.cosyvoice import CosyVoice2
+from cosyvoice.utils.file_utils import load_wav
+
+
+cosyvoice = CosyVoice2(os.path.join(REPO_ROOT, 'pretrained_models/CosyVoice2-0.5B'), load_jit=False, load_trt=False, load_vllm=False, fp16=True)
+
+# NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
+# zero_shot usage
+voice_file = os.path.join(REPO_ROOT, 'asset/zero_shot_prompt.wav')
+prompt_speech_16k = load_wav(voice_file, 16000)
+prompt_text_file = Path(voice_file).with_suffix(".txt")
+# Fail early with a clear message instead of hitting a NameError on prompt_text below.
+if not prompt_text_file.exists():
+    raise FileNotFoundError(f'prompt transcript not found: {prompt_text_file}')
+prompt_text = prompt_text_file.read_text(encoding='utf-8').strip()
+for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', prompt_text, prompt_speech_16k, stream=False)):
+    filename = 'zero_shot_{}.wav'.format(i)
+    torchaudio.save(filename, j['tts_speech'], cosyvoice.sample_rate)