diff --git a/asset/zero_shot_prompt.txt b/asset/zero_shot_prompt.txt
new file mode 100644
index 000000000..4ab681347
--- /dev/null
+++ b/asset/zero_shot_prompt.txt
@@ -0,0 +1 @@
+希望你以后能够做的比我还好呦。
\ No newline at end of file
diff --git a/tools/sft_en_test.py b/tools/sft_en_test.py
new file mode 100644
index 000000000..b81168435
--- /dev/null
+++ b/tools/sft_en_test.py
@@ -0,0 +1,25 @@
+import os
+import sys
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
+sys.path.append('third_party/Matcha-TTS')
+import torch
+import torchaudio
+from cosyvoice.cli.cosyvoice import CosyVoice
+
+
+cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_trt=False, fp16=False)
+
+en_spk_ids = ['英文女', '英文男']
+
+tts_text = 'Maintaining your ability to learn translates into increased marketability, improved career options and higher salaries.'
+
+for spk_id in en_spk_ids:
+    chunks = []
+    for out in cosyvoice.inference_sft(tts_text, spk_id, stream=False):
+        chunks.append(out['tts_speech'])
+    if len(chunks) == 0:
+        continue
+    speech = torch.cat(chunks, dim=1)
+    filename = f'sft_en_{spk_id}.wav'
+    torchaudio.save(filename, speech, cosyvoice.sample_rate)
+    print('saved', filename)
diff --git a/tools/sft_speak_id.py b/tools/sft_speak_id.py
new file mode 100644
index 000000000..903bb3f06
--- /dev/null
+++ b/tools/sft_speak_id.py
@@ -0,0 +1,13 @@
+import os
+import sys
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
+sys.path.append('third_party/Matcha-TTS')
+from cosyvoice.cli.cosyvoice import CosyVoice
+
+
+cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_trt=False, fp16=False)
+
+
+# Using the SFT model, list every speaker id built into the checkpoint
+spk_ids = cosyvoice.list_available_spks()
+print('available spk_id:', spk_ids)
diff --git a/tools/sft_test.py b/tools/sft_test.py
new file mode 100644
index 000000000..fe645079f
--- /dev/null
+++ b/tools/sft_test.py
@@ -0,0 +1,26 @@
+import os
+import sys
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
+sys.path.append('third_party/Matcha-TTS')
+import torch
+import torchaudio
+from cosyvoice.cli.cosyvoice import CosyVoice
+
+
+cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_trt=False, fp16=False)
+
+tts_text = '收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。'
+
+spk_ids = ['中文女', '中文男', '粤语女']
+# print('available spk_id:', spk_ids)
+
+for spk_id in spk_ids:
+    chunks = []
+    for out in cosyvoice.inference_sft(tts_text, spk_id, stream=False):
+        chunks.append(out['tts_speech'])
+    if len(chunks) == 0:
+        continue
+    speech = torch.cat(chunks, dim=1)
+    filename = f'sft_{spk_id}.wav'
+    torchaudio.save(filename, speech, cosyvoice.sample_rate)
+    print('saved', filename)
diff --git a/tools/zero_short_speak_id.py b/tools/zero_short_speak_id.py
new file mode 100644
index 000000000..8ba0804a4
--- /dev/null
+++ b/tools/zero_short_speak_id.py
@@ -0,0 +1,29 @@
+import os
+from pathlib import Path
+import sys
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
+sys.path.append('third_party/Matcha-TTS')
+import torchaudio
+from cosyvoice.cli.cosyvoice import CosyVoice2
+from cosyvoice.utils.file_utils import load_wav
+
+
+cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, load_vllm=False, fp16=False)
+
+# NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
+# zero_shot usage
+voice_file = 'asset/zero_shot_prompt.wav'
+prompt_speech_16k = load_wav(voice_file, 16000)
+prompt_text_file = Path(voice_file).with_suffix(".txt")
+prompt_text = ''  # default so the add_zero_shot_spk call below cannot hit a NameError
+if prompt_text_file.exists():
+    prompt_text = prompt_text_file.read_text(encoding='utf-8').strip()
+
+# save zero_shot spk as a voice_id for future usage
+assert cosyvoice.add_zero_shot_spk(prompt_text, prompt_speech_16k, 'my_zero_shot_spk') is True
+for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '', '', zero_shot_spk_id='my_zero_shot_spk', stream=False)):
+    torchaudio.save('speaker_id_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+# continue to use "my_zero_shot_spk" instead of "prompt_speech_16k"
+...
+# this step is optional unless you want to persist "my_zero_shot_spk" into "model_dir/spk2info.pt" for a new session
+cosyvoice.save_spkinfo()
diff --git a/tools/zero_short_test.py b/tools/zero_short_test.py
new file mode 100644
index 000000000..fc2973cc3
--- /dev/null
+++ b/tools/zero_short_test.py
@@ -0,0 +1,24 @@
+import os
+from pathlib import Path
+import sys
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
+sys.path.append('third_party/Matcha-TTS')
+import torchaudio
+from cosyvoice.cli.cosyvoice import CosyVoice2
+from cosyvoice.utils.file_utils import load_wav
+
+
+cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, load_vllm=False, fp16=True)
+
+# NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
+# zero_shot usage
+voice_file = 'asset/zero_shot_prompt.wav'
+prompt_speech_16k = load_wav(voice_file, 16000)
+prompt_text_file = Path(voice_file).with_suffix(".txt")
+prompt_text = ''  # fallback; NOTE(review): empty prompt_text degrades zero-shot cloning quality — confirm the .txt ships with the .wav
+if prompt_text_file.exists():
+    prompt_text = prompt_text_file.read_text(encoding='utf-8').strip()
+for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', prompt_text, prompt_speech_16k, stream=False)):
+    filename = 'zero_shot_{}.wav'.format(i)
+    src_ = j['tts_speech']
+    torchaudio.save(filename, src_, cosyvoice.sample_rate)