Skip to content

Commit fbbdd96

Browse files
authored
Merge pull request #13 from PyThaiNLP/copilot/add-support-for-vachanatts2
Add VachanaTTS2 model support
2 parents 15f5801 + 5be47c2 commit fbbdd96

5 files changed

Lines changed: 258 additions & 6 deletions

File tree

README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,26 @@ file = tts.tts("ภาษาไทย ง่าย มาก มาก", filenam
2424
wave = tts.tts("ภาษาไทย ง่าย มาก มาก",return_type="waveform") # It will get waveform.
2525
```
2626

27+
### Using Different TTS Models
28+
29+
PyThaiTTS supports multiple TTS models. You can specify which model to use:
30+
31+
```python
32+
from pythaitts import TTS
33+
34+
# Use VachanaTTS (default voices: th_f_1, th_m_1, th_f_2, th_m_2)
35+
tts = TTS(pretrained="vachana")
36+
file = tts.tts("สวัสดีครับ", speaker_idx="th_f_1", filename="output.wav")
37+
38+
# Use Lunarlist ONNX (default)
39+
tts = TTS(pretrained="lunarlist_onnx")
40+
file = tts.tts("ภาษาไทย ง่าย มาก", filename="output.wav")
41+
42+
# Use KhanomTan
43+
tts = TTS(pretrained="khanomtan")
44+
file = tts.tts("ภาษาไทย", speaker_idx="Linda", filename="output.wav")
45+
```
46+
2747
### Text Preprocessing
2848

2949
PyThaiTTS includes automatic text preprocessing to improve TTS quality:

pythaitts/__init__.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@
1010
class TTS:
1111
def __init__(self, pretrained="lunarlist_onnx", mode="last_checkpoint", version="1.0", device:str="cpu") -> None:
1212
"""
13-
:param str pretrained: TTS pretrained (lunarlist_onnx, khanomtan, lunarlist)
14-
:param str mode: pretrained mode (lunarlist_onnx don't support)
13+
:param str pretrained: TTS pretrained (lunarlist_onnx, khanomtan, lunarlist, vachana)
14+
:param str mode: pretrained mode (lunarlist_onnx and vachana don't support)
1515
:param str version: model version (default is 1.0 or 1.1)
16-
:param str device: device for running model. (lunarlist_onnx support CPU only.)
16+
:param str device: device for running model. (lunarlist_onnx and vachana support CPU only.)
1717
1818
**Options for mode**
1919
* *last_checkpoint* (default) - last checkpoint of model
@@ -28,6 +28,8 @@ def __init__(self, pretrained="lunarlist_onnx", mode="last_checkpoint", version=
2828
For lunarlist_onnx tts model, \
2929
You can see more about lunarlist tts at `https://github.com/PyThaiNLP/thaitts-onnx <https://github.com/PyThaiNLP/thaitts-onnx>`_
3030
31+
For vachana tts model, \
32+
You can see more about vachana tts at `https://github.com/VYNCX/VachanaTTS2 <https://github.com/VYNCX/VachanaTTS2>`_
3133
3234
3335
"""
@@ -49,8 +51,11 @@ def load_pretrained(self,version):
4951
elif self.pretrained == "lunarlist":
5052
from pythaitts.pretrained.lunarlist_model import LunarlistModel
5153
self.model = LunarlistModel(mode=self.mode, device=self.device)
54+
elif self.pretrained == "vachana":
55+
from pythaitts.pretrained.vachana_tts import VachanaTTS
56+
self.model = VachanaTTS()
5257
else:
53-
raise NotImplemented(
58+
raise NotImplementedError(
5459
"PyThaiTTS doesn't support %s pretrained." % self.pretrained
5560
)
5661

@@ -59,7 +64,7 @@ def tts(self, text: str, speaker_idx: str = "Linda", language_idx: str = "th-th"
5964
speech synthesis
6065
6166
:param str text: text
62-
:param str speaker_idx: speaker (default is Linda)
67+
:param str speaker_idx: speaker (default is Linda for khanomtan, th_f_1 for vachana)
6368
:param str language_idx: language (default is th-th)
6469
:param str return_type: return type (default is file)
6570
:param str filename: path filename for save wav file if return_type is file.
@@ -72,6 +77,8 @@ def tts(self, text: str, speaker_idx: str = "Linda", language_idx: str = "th-th"
7277

7378
if self.pretrained == "lunarlist" or self.pretrained == "lunarlist_onnx":
7479
return self.model(text=text,return_type=return_type,filename=filename)
80+
elif self.pretrained == "vachana":
81+
return self.model(text=text,speaker_idx=speaker_idx,return_type=return_type,filename=filename)
7582
return self.model(
7683
text=text,
7784
speaker_idx=speaker_idx,
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
VachanaTTS2 model
4+
5+
VachanaTTS2 is a Thai text-to-speech model built on VITS architecture.
6+
It supports multiple Thai voices and is optimized for both CPU and GPU usage.
7+
8+
See more: https://github.com/VYNCX/VachanaTTS2
9+
"""
10+
import tempfile
11+
import wave
12+
import numpy as np
13+
import os
14+
15+
16+
class VachanaTTS:
    """Wrapper around the VachanaTTS2 Thai text-to-speech engine.

    VachanaTTS2 is a VITS-based Thai TTS model with several built-in voices.
    See https://github.com/VYNCX/VachanaTTS2 for the upstream project.
    """

    # Voice identifiers accepted by the upstream ``vachanatts`` package.
    SUPPORTED_VOICES = ["th_f_1", "th_m_1", "th_f_2", "th_m_2"]

    def __init__(self) -> None:
        """
        Initialize VachanaTTS model.

        The model weights are downloaded automatically from HuggingFace by the
        ``vachanatts`` package on first use.

        :raises ImportError: if the optional ``vachanatts`` package is not installed.
        """
        try:
            from vachanatts import TTS as VachanaTTS_TTS
            self.tts_func = VachanaTTS_TTS
        except ImportError as err:
            # Chain the original exception so the real import failure stays visible.
            raise ImportError(
                "vachanatts is not installed. Please install it with: pip install vachanatts"
            ) from err

    def _synthesize(self, text, speaker_idx, output, volume, speed, noise_scale, noise_w_scale):
        """Run the underlying engine once, writing a wav file to *output*."""
        self.tts_func(
            text,
            voice=speaker_idx,
            output=output,
            volume=volume,
            speed=speed,
            noise_scale=noise_scale,
            noise_w_scale=noise_w_scale
        )

    @staticmethod
    def _read_waveform(path):
        """Read a PCM wav file at *path* into a 1-D numpy array of raw samples.

        :raises ValueError: if the file's sample width is not 1, 2 or 4 bytes.
        """
        with wave.open(path, 'rb') as wav_file:
            n_frames = wav_file.getnframes()
            audio_data = wav_file.readframes(n_frames)
            sample_width = wav_file.getsampwidth()

        # Map bytes-per-sample to the matching signed integer dtype.
        dtypes = {1: np.int8, 2: np.int16, 4: np.int32}
        if sample_width not in dtypes:
            raise ValueError(f"Unsupported sample width: {sample_width} bytes")
        return np.frombuffer(audio_data, dtype=dtypes[sample_width])

    def __call__(self, text: str, speaker_idx: str = "th_f_1", return_type: str = "file", filename: str = None, **kwargs):
        """
        Generate speech from text using VachanaTTS.

        :param str text: Input text to synthesize
        :param str speaker_idx: Voice to use (th_f_1, th_m_1, th_f_2, th_m_2). Default is "th_f_1"
        :param str return_type: Return type ("file" or "waveform"); any other
            value falls through to file output, matching historical behavior.
        :param str filename: Output filename for the generated audio; a temp
            file is created when omitted.
        :param kwargs: Additional parameters (volume, speed, noise_scale, noise_w_scale)
        :return: File path if return_type is "file", otherwise audio waveform data
        :raises ValueError: if *speaker_idx* is not a supported voice.
        """
        # Fail fast on an unknown voice before touching the filesystem.
        if speaker_idx not in self.SUPPORTED_VOICES:
            raise ValueError(
                f"Unsupported voice '{speaker_idx}'. Supported voices are: {', '.join(self.SUPPORTED_VOICES)}"
            )

        # Optional engine tuning parameters with upstream defaults.
        volume = kwargs.get('volume', 1.0)
        speed = kwargs.get('speed', 1.0)
        noise_scale = kwargs.get('noise_scale', 0.667)
        noise_w_scale = kwargs.get('noise_w_scale', 0.8)

        if return_type == "waveform":
            # The engine only writes files, so synthesize to a temp wav,
            # load it back, and always clean the temp file up afterwards.
            temp_filename = None
            try:
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
                    temp_filename = fp.name
                self._synthesize(text, speaker_idx, temp_filename,
                                 volume, speed, noise_scale, noise_w_scale)
                return self._read_waveform(temp_filename)
            finally:
                if temp_filename and os.path.exists(temp_filename):
                    os.unlink(temp_filename)

        # File output (default). Generate a temp path when none was given.
        if filename is None:
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
                filename = fp.name
        self._synthesize(text, speaker_idx, filename,
                         volume, speed, noise_scale, noise_w_scale)
        return filename

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
huggingface_hub
22
numpy>=1.22
3-
onnxruntime
3+
onnxruntime
4+
vachanatts

tests/test_vachana.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Unit tests for VachanaTTS integration
4+
"""
5+
import unittest
6+
from unittest.mock import Mock, patch, MagicMock
7+
import numpy as np
8+
from pythaitts import TTS
9+
10+
11+
class TestVachanaIntegration(unittest.TestCase):
    """Test VachanaTTS integration"""

    @staticmethod
    def _stub_backend(mock_cls, result="/tmp/output.wav"):
        """Wire the patched VachanaTTS class to return a Mock backend.

        The backend mock records how the TTS facade calls it and hands
        back *result* from every call.
        """
        backend = Mock()
        backend.return_value = result
        mock_cls.return_value = backend
        return backend

    @patch('pythaitts.pretrained.vachana_tts.VachanaTTS')
    def test_vachana_model_initialization(self, mock_vachana):
        """Test that VachanaTTS model can be initialized"""
        engine = TTS(pretrained="vachana")

        # The facade should remember the backend name and hold a model object.
        self.assertEqual(engine.pretrained, "vachana")
        self.assertIsNotNone(engine.model)

    @patch('pythaitts.pretrained.vachana_tts.VachanaTTS')
    def test_vachana_tts_call(self, mock_vachana_class):
        """Test calling tts method with vachana model"""
        backend = self._stub_backend(mock_vachana_class)
        engine = TTS(pretrained="vachana")

        engine.tts("สวัสดีครับ", speaker_idx="th_f_1", filename="/tmp/test.wav")

        # All user-facing arguments must be forwarded to the backend.
        backend.assert_called_once()
        forwarded = backend.call_args.kwargs
        self.assertEqual(forwarded['text'], "สวัสดีครับ")
        self.assertEqual(forwarded['speaker_idx'], "th_f_1")
        self.assertEqual(forwarded['filename'], "/tmp/test.wav")
        self.assertEqual(forwarded['return_type'], "file")

    @patch('pythaitts.pretrained.vachana_tts.VachanaTTS')
    def test_vachana_with_preprocessing(self, mock_vachana_class):
        """Test that preprocessing works with vachana model"""
        backend = self._stub_backend(mock_vachana_class)
        engine = TTS(pretrained="vachana")

        engine.tts("มี 5 คนๆ", speaker_idx="th_f_1", preprocess=True)

        backend.assert_called_once()
        processed = backend.call_args.kwargs['text']

        # Digits should be spelled out and the repeat mark (ๆ) expanded.
        self.assertIn("ห้า", processed)
        self.assertIn("คนคน", processed)
        self.assertNotIn("5", processed)
        self.assertNotIn("ๆ", processed)

    @patch('pythaitts.pretrained.vachana_tts.VachanaTTS')
    def test_vachana_all_supported_voices(self, mock_vachana_class):
        """Test that all supported voices work correctly"""
        backend = self._stub_backend(mock_vachana_class)
        engine = TTS(pretrained="vachana")

        # Every documented voice id must be forwarded unchanged.
        for voice in ("th_f_1", "th_m_1", "th_f_2", "th_m_2"):
            backend.reset_mock()
            engine.tts("สวัสดี", speaker_idx=voice)
            self.assertEqual(backend.call_args.kwargs['speaker_idx'], voice)

    @patch('pythaitts.pretrained.vachana_tts.VachanaTTS')
    def test_vachana_waveform_return(self, mock_vachana_class):
        """Test waveform return type functionality"""
        backend = self._stub_backend(
            mock_vachana_class, result=np.array([0.1, 0.2, 0.3, 0.4])
        )
        engine = TTS(pretrained="vachana")

        engine.tts("สวัสดี", speaker_idx="th_f_1", return_type="waveform")

        self.assertEqual(backend.call_args.kwargs['return_type'], "waveform")


if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)