|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +""" |
| 3 | +VachanaTTS2 model |
| 4 | +
|
| 5 | +VachanaTTS2 is a Thai text-to-speech model built on VITS architecture. |
| 6 | +It supports multiple Thai voices and is optimized for both CPU and GPU usage. |
| 7 | +
|
| 8 | +See more: https://github.com/VYNCX/VachanaTTS2 |
| 9 | +""" |
| 10 | +import tempfile |
| 11 | + |
| 12 | + |
class VachanaTTS:
    """Thai text-to-speech wrapper around the VachanaTTS (VITS-based) engine."""

    def __init__(self) -> None:
        """
        Initialize VachanaTTS model.

        The model will be automatically downloaded from HuggingFace on first
        use by the underlying ``vachanatts`` package.

        :raises ImportError: if the optional ``vachanatts`` dependency is not
            installed.
        """
        try:
            from vachanatts import TTS as VachanaTTS_TTS
            self.tts_func = VachanaTTS_TTS
        except ImportError as exc:
            # Chain the original ImportError so the real failure is visible.
            raise ImportError(
                "vachanatts is not installed. Please install it with: pip install vachanatts"
            ) from exc

    def _synthesize(self, text, speaker_idx, output, volume, speed,
                    noise_scale, noise_w_scale):
        """Run the underlying engine once, writing a WAV file to *output*."""
        self.tts_func(
            text,
            voice=speaker_idx,
            output=output,
            volume=volume,
            speed=speed,
            noise_scale=noise_scale,
            noise_w_scale=noise_w_scale
        )

    def __call__(self, text: str, speaker_idx: str = "th_f_1", return_type: str = "file", filename: Optional[str] = None, **kwargs):
        """
        Generate speech from text using VachanaTTS.

        :param str text: Input text to synthesize
        :param str speaker_idx: Voice to use (th_f_1, th_m_1, th_f_2, th_m_2). Default is "th_f_1"
        :param str return_type: Return type ("file" or "waveform")
        :param str filename: Output filename for the generated audio; a
            temporary ``.wav`` file is created when omitted
        :param kwargs: Additional parameters (volume, speed, noise_scale, noise_w_scale)
        :return: File path if return_type is "file", otherwise a 1-D numpy
            array of raw PCM samples
        """
        # Engine tuning knobs with the upstream defaults.
        volume = kwargs.get('volume', 1.0)
        speed = kwargs.get('speed', 1.0)
        noise_scale = kwargs.get('noise_scale', 0.667)
        noise_w_scale = kwargs.get('noise_w_scale', 0.8)

        if return_type == "waveform":
            # numpy/wave are imported lazily so the plain file path stays
            # usable even where numpy is not installed.
            import os
            import wave
            import numpy as np

            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
                temp_filename = fp.name
            try:
                # Generate the audio into the temp file, then read it back.
                self._synthesize(text, speaker_idx, temp_filename, volume,
                                 speed, noise_scale, noise_w_scale)
                with wave.open(temp_filename, 'rb') as wav_file:
                    sample_width = wav_file.getsampwidth()
                    audio_data = wav_file.readframes(wav_file.getnframes())
            finally:
                # Always remove the temp file, even if synthesis/reading fails.
                os.unlink(temp_filename)

            # Map PCM sample width to dtype. Per the WAV spec, 8-bit PCM is
            # unsigned; 16/32-bit are signed little-endian.
            dtype = {1: np.uint8, 2: np.int16, 4: np.int32}.get(sample_width, np.int16)
            return np.frombuffer(audio_data, dtype=dtype)
        else:
            # File output: fall back to a temporary .wav path when the caller
            # did not supply one.
            if filename is None:
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
                    filename = fp.name

            self._synthesize(text, speaker_idx, filename, volume, speed,
                             noise_scale, noise_w_scale)
            return filename
0 commit comments