Skip to content

Commit 5ec1e54

Browse files
Copilot and wannaphong committed
Add VachanaTTS2 support with wrapper, documentation, and dependency
Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>
1 parent ec3ffdb commit 5ec1e54

4 files changed

Lines changed: 128 additions & 5 deletions

File tree

README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,26 @@ file = tts.tts("ภาษาไทย ง่าย มาก มาก", filenam
2424
wave = tts.tts("ภาษาไทย ง่าย มาก มาก",return_type="waveform") # It will get waveform.
2525
```
2626

27+
### Using Different TTS Models
28+
29+
PyThaiTTS supports multiple TTS models. You can specify which model to use:
30+
31+
```python
32+
from pythaitts import TTS
33+
34+
# Use VachanaTTS (default voices: th_f_1, th_m_1, th_f_2, th_m_2)
35+
tts = TTS(pretrained="vachana")
36+
file = tts.tts("สวัสดีครับ", speaker_idx="th_f_1", filename="output.wav")
37+
38+
# Use Lunarlist ONNX (default)
39+
tts = TTS(pretrained="lunarlist_onnx")
40+
file = tts.tts("ภาษาไทย ง่าย มาก", filename="output.wav")
41+
42+
# Use KhanomTan
43+
tts = TTS(pretrained="khanomtan")
44+
file = tts.tts("ภาษาไทย", speaker_idx="Linda", filename="output.wav")
45+
```
46+
2747
### Text Preprocessing
2848

2949
PyThaiTTS includes automatic text preprocessing to improve TTS quality:

pythaitts/__init__.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@
1010
class TTS:
1111
def __init__(self, pretrained="lunarlist_onnx", mode="last_checkpoint", version="1.0", device:str="cpu") -> None:
1212
"""
13-
:param str pretrained: TTS pretrained (lunarlist_onnx, khanomtan, lunarlist)
14-
:param str mode: pretrained mode (lunarlist_onnx don't support)
13+
:param str pretrained: TTS pretrained (lunarlist_onnx, khanomtan, lunarlist, vachana)
14+
:param str mode: pretrained mode (lunarlist_onnx and vachana don't support)
1515
:param str version: model version (default is 1.0 or 1.1)
16-
:param str device: device for running model. (lunarlist_onnx support CPU only.)
16+
:param str device: device for running model. (lunarlist_onnx and vachana support CPU only.)
1717
1818
**Options for mode**
1919
* *last_checkpoint* (default) - last checkpoint of model
@@ -28,6 +28,8 @@ def __init__(self, pretrained="lunarlist_onnx", mode="last_checkpoint", version=
2828
For lunarlist_onnx tts model, \
2929
You can see more about lunarlist tts at `https://github.com/PyThaiNLP/thaitts-onnx <https://github.com/PyThaiNLP/thaitts-onnx>`_
3030
31+
For vachana tts model, \
32+
You can see more about vachana tts at `https://github.com/VYNCX/VachanaTTS2 <https://github.com/VYNCX/VachanaTTS2>`_
3133
3234
3335
"""
@@ -49,6 +51,9 @@ def load_pretrained(self,version):
4951
elif self.pretrained == "lunarlist":
5052
from pythaitts.pretrained.lunarlist_model import LunarlistModel
5153
self.model = LunarlistModel(mode=self.mode, device=self.device)
54+
elif self.pretrained == "vachana":
55+
from pythaitts.pretrained.vachana_tts import VachanaTTS
56+
self.model = VachanaTTS()
5257
else:
5358
raise NotImplemented(
5459
"PyThaiTTS doesn't support %s pretrained." % self.pretrained
@@ -59,7 +64,7 @@ def tts(self, text: str, speaker_idx: str = "Linda", language_idx: str = "th-th"
5964
speech synthesis
6065
6166
:param str text: text
62-
:param str speaker_idx: speaker (default is Linda)
67+
:param str speaker_idx: speaker (default is Linda for khanomtan, th_f_1 for vachana)
6368
:param str language_idx: language (default is th-th)
6469
:param str return_type: return type (default is file)
6570
:param str filename: path filename for save wav file if return_type is file.
@@ -72,6 +77,8 @@ def tts(self, text: str, speaker_idx: str = "Linda", language_idx: str = "th-th"
7277

7378
if self.pretrained == "lunarlist" or self.pretrained == "lunarlist_onnx":
7479
return self.model(text=text,return_type=return_type,filename=filename)
80+
elif self.pretrained == "vachana":
81+
return self.model(text=text,speaker_idx=speaker_idx,return_type=return_type,filename=filename)
7582
return self.model(
7683
text=text,
7784
speaker_idx=speaker_idx,
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
VachanaTTS2 model
4+
5+
VachanaTTS2 is a Thai text-to-speech model built on VITS architecture.
6+
It supports multiple Thai voices and is optimized for both CPU and GPU usage.
7+
8+
See more: https://github.com/VYNCX/VachanaTTS2
9+
"""
10+
import tempfile
11+
12+
13+
class VachanaTTS:
    def __init__(self) -> None:
        """
        Initialize the VachanaTTS wrapper.

        The underlying model is downloaded automatically from HuggingFace on
        first use by the ``vachanatts`` package.

        :raises ImportError: if the ``vachanatts`` package is not installed.
        """
        try:
            from vachanatts import TTS as VachanaTTS_TTS
            self.tts_func = VachanaTTS_TTS
        except ImportError:
            raise ImportError(
                "vachanatts is not installed. Please install it with: pip install vachanatts"
            )

    def __call__(self, text: str, speaker_idx: str = "th_f_1", return_type: str = "file", filename: str = None, **kwargs):
        """
        Generate speech from text using VachanaTTS.

        :param str text: Input text to synthesize
        :param str speaker_idx: Voice to use (th_f_1, th_m_1, th_f_2, th_m_2). Default is "th_f_1"
        :param str return_type: Return type ("file" or "waveform")
        :param str filename: Output filename for the generated audio; a temp file is created if None
        :param kwargs: Additional parameters (volume, speed, noise_scale, noise_w_scale)
        :return: File path if return_type is "file", otherwise a numpy integer array of PCM samples
        """
        # Synthesis tuning knobs, with the library's documented defaults.
        volume = kwargs.get('volume', 1.0)
        speed = kwargs.get('speed', 1.0)
        noise_scale = kwargs.get('noise_scale', 0.667)
        noise_w_scale = kwargs.get('noise_w_scale', 0.8)

        if return_type == "waveform":
            # vachanatts only writes files, so synthesize to a temp file
            # and decode it back into a numpy array.
            import os
            import wave
            import numpy as np

            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
                temp_filename = fp.name
            try:
                self.tts_func(
                    text,
                    voice=speaker_idx,
                    output=temp_filename,
                    volume=volume,
                    speed=speed,
                    noise_scale=noise_scale,
                    noise_w_scale=noise_w_scale,
                )
                with wave.open(temp_filename, 'rb') as wav_file:
                    audio_data = wav_file.readframes(wav_file.getnframes())
                    sample_width = wav_file.getsampwidth()
                # Map the PCM sample width (bytes) to the matching numpy dtype.
                if sample_width == 2:
                    dtype = np.int16
                elif sample_width == 4:
                    dtype = np.int32
                else:
                    dtype = np.int8
                return np.frombuffer(audio_data, dtype=dtype)
            finally:
                # Always remove the temp file, even when synthesis or
                # decoding raised — otherwise every failure leaks a file.
                os.unlink(temp_filename)
        else:
            # File output: create a persistent temp file when no name is given.
            if filename is None:
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
                    filename = fp.name

            self.tts_func(
                text,
                voice=speaker_idx,
                output=filename,
                volume=volume,
                speed=speed,
                noise_scale=noise_scale,
                noise_w_scale=noise_w_scale
            )

            return filename

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
huggingface_hub
22
numpy>=1.22
3-
onnxruntime
3+
onnxruntime
4+
vachanatts

0 commit comments

Comments
 (0)