long text handling

hamees-sayed · hamees-sayed · commit 0302a04e3982 · 2024-12-03T10:04:26.000Z
diff --git a/README.md b/README.md
@@ -75,7 +75,20 @@ if __name__ == "__main__":
 - `speed`: Speech speed multiplier (default: 1.0)
 - `add_wav_header`: Include WAV header in output (default: True)
 - `transliterate`: Enable text transliteration (default: False)
-- `remove_extra_silence`: Remove additional silence (default: True)
+- `remove_extra_silence`: Remove additional silence (default: True)  
+
+These parameters are part of the `Smallest` instance. They can be set when creating the instance (as shown above). However, the `synthesize` function also accepts kwargs, allowing you to override these parameters for a specific synthesis request.
+
+For example, you can modify the speech speed and sample rate just for a particular synthesis call:  
+```py
+client.synthesize(
+    "Hello, this is a test for sync synthesis function.",
+    save_as="sync_synthesize.wav",
+    speed=1.5,  # Overrides default speed
+    sample_rate=16000  # Overrides default sample rate
+)
+```
+
 
 ### Async   
 Asynchronous text-to-speech synthesis client.    
@@ -107,7 +120,18 @@ if __name__ == "__main__":
 - `speed`: Speech speed multiplier (default: 1.0)
 - `add_wav_header`: Include WAV header in output (default: True)
 - `transliterate`: Enable text transliteration (default: False)
-- `remove_extra_silence`: Remove additional silence (default: True)
+- `remove_extra_silence`: Remove additional silence (default: False)
+
+These parameters are part of the `AsyncSmallest` instance. They can be set when creating the instance (as shown above). However, the `synthesize` function also accepts kwargs, allowing you to override any of these parameters on a per-request basis.
+
+For example, you can modify the speech speed and sample rate just for a particular synthesis request:  
+```py
+audio_bytes = await tts.synthesize(
+    "Hello, this is a test of the async synthesis function.",
+    speed=1.5,  # Overrides default speed
+    sample_rate=16000  # Overrides default sample rate
+)
+```
 
 ### LLM to Speech    
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "smallestai"
-version = "1.2.0"
+version = "1.3.0"
 description = "Official Python client for the Smallest AI API"
 authors = [
     {name = "Smallest", email = "info@smallest.ai"},
diff --git a/smallest/async_tts.py b/smallest/async_tts.py
@@ -7,20 +7,20 @@
 from .models import TTSModels, TTSVoices
 from .exceptions import TTSError, APIError
 from .utils import (TTSOptions, validate_input, preprocess_text, add_wav_header,
-                     get_smallest_languages, get_smallest_voices, get_smallest_models, API_BASE_URL)
+                     get_smallest_languages, get_smallest_voices, get_smallest_models, SENTENCE_END_REGEX, API_BASE_URL)
 
 
 class AsyncSmallest:
     def __init__(
-            self,
-            api_key: Optional[str] = None,
-            model: TTSModels = "lightning",
-            sample_rate: int = 24000,
-            voice: TTSVoices = "emily",
-            speed: Optional[float] = 1.0,
-            add_wav_header: Optional[bool] = True,
-            transliterate: Optional[bool] = False,
-            remove_extra_silence: Optional[bool] = False
+        self,
+        api_key: Optional[str] = None,
+        model: TTSModels = "lightning",
+        sample_rate: int = 24000,
+        voice: TTSVoices = "emily",
+        speed: Optional[float] = 1.0,
+        add_wav_header: Optional[bool] = True,
+        transliterate: Optional[bool] = False,
+        remove_extra_silence: Optional[bool] = False
     ) -> None:
         """
         AsyncSmallest Instance for asynchronous text-to-speech synthesis.
@@ -48,6 +48,7 @@ def __init__(
         self.api_key = api_key or os.environ.get("SMALLEST_API_KEY")
         if not self.api_key:
             raise TTSError("API key is required")
+        self.chunk_size = 250
         
         self.opts = TTSOptions(
             model=model,
@@ -70,6 +71,48 @@ async def __aexit__(self, exc_type, exc_val, exc_tb):
         if self.session:
             await self.session.close()
 
+    def _split_into_chunks(self, text: str) -> List[str]:
+        """
+        Splits the input text into chunks based on sentence boundaries and the maximum chunk size.
+        """
+        chunks = []
+        current_chunk = ""
+        last_break_index = 0
+
+        i = 0
+        while i < len(text):
+            current_chunk += text[i]
+
+            if text[i] in ".,":
+                last_break_index = i
+
+            if len(current_chunk) >= self.chunk_size:
+                if last_break_index > 0:
+                    chunk = text[:last_break_index + 1].strip()
+                    chunk = chunk.replace("—", " ")
+                    chunks.append(chunk)
+                
+                    text = text[last_break_index + 1:]
+                    i = -1
+                    current_chunk = ""
+                    last_break_index = 0
+                else:
+                    # No break point found, split at max length
+                    current_chunk = current_chunk.replace("—", " ")
+                    chunks.append(current_chunk.strip())
+                    text = text[self.chunk_size:]
+                    i = -1
+                    current_chunk = ""
+
+            i += 1
+
+        if text:
+            text = text.replace("—", " ")
+            chunks.append(text.strip())
+
+        return chunks
+
+
     def get_languages(self) -> List[str]:
         """Returns a list of available languages."""
         return get_smallest_languages()
@@ -110,42 +153,45 @@ async def synthesize(
             setattr(opts, key, value)
 
         validate_input(text, opts.voice, opts.model, opts.sample_rate, opts.speed)
-
-        payload = {
-            "text": preprocess_text(text),
-            "sample_rate": opts.sample_rate,
-            "voice_id": opts.voice,
-            "add_wav_header": opts.add_wav_header,
-            "speed": opts.speed,
-            "model": opts.model,
-            "transliterate": opts.transliterate,
-            "remove_extra_silence": opts.remove_extra_silence
-        }
-
-        headers = {
-            "Authorization": f"Bearer {self.api_key}",
-            "Content-Type": "application/json",
-        }
-
-        if not self.session:
-            self.session = aiohttp.ClientSession()
+        chunks = self._split_into_chunks(text)
+        audio_content = b""
+
+        for chunk in chunks:
+            payload = {
+                "text": preprocess_text(chunk),
+                "sample_rate": opts.sample_rate,
+                "voice_id": opts.voice,
+                "add_wav_header": False,
+                "speed": opts.speed,
+                "model": opts.model,
+                "transliterate": opts.transliterate,
+                "remove_extra_silence": opts.remove_extra_silence
+            }
+
+            headers = {
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json",
+            }
+
+            if not self.session:
+                self.session = aiohttp.ClientSession()
         
-        async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
-            if res.status != 200:
-                raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
+            async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
+                if res.status != 200:
+                    raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
             
-            audio_content = await res.read()
+                audio_content += await res.read()
 
         if save_as:
             if not save_as.endswith(".wav"):
                 raise TTSError("Invalid file name. Extension must be .wav")
             
-            if self.opts.add_wav_header:
-                async with aiofiles.open(save_as, mode='wb') as f:
-                    await f.write(audio_content)
-            else:
-                async with aiofiles.open(save_as, mode='wb') as f:
-                    await f.write(add_wav_header(audio_content, self.opts.sample_rate))
+            async with aiofiles.open(save_as, mode='wb') as f:
+                await f.write(add_wav_header(audio_content, self.opts.sample_rate))
+
             return None
 
+        if opts.add_wav_header:
+            return add_wav_header(audio_content, self.opts.sample_rate)
+        
         return audio_content
diff --git a/smallest/stream_tts.py b/smallest/stream_tts.py
@@ -34,13 +34,14 @@ def __init__(
             max_retries: Number of retry attempts for failed synthesis (default: 3)
         """
         self.tts_instance = tts_instance
+        self.tts_instance.opts.add_wav_header = False
+
         self.sentence_end_regex = SENTENCE_END_REGEX
         self.queue_timeout = queue_timeout
         self.max_retries = max_retries
         self.queue = Queue()
         self.buffer_size = 250
         self.stop_flag = False
-        self.tts_instance.opts.add_wav_header = False
 
 
     async def _stream_llm_output(self, llm_output: AsyncGenerator[str, None]) -> None:
@@ -53,7 +54,7 @@ async def _stream_llm_output(self, llm_output: AsyncGenerator[str, None]) -> Non
         buffer = ""
         async for chunk in llm_output:
             buffer += chunk
-            if self.sentence_end_regex.match(buffer) or self.buffer_size > 600:
+            if self.sentence_end_regex.match(buffer) or len(buffer) > self.buffer_size:
                 self.queue.put(buffer)
                 buffer = ""
 
diff --git a/smallest/tts.py b/smallest/tts.py
diff --git a/smallest/utils.py b/smallest/utils.py