Skip to content

Commit 0302a04

Browse files
committed
long text handling
1 parent 2e46e66 commit 0302a04

6 files changed

Lines changed: 207 additions & 83 deletions

File tree

README.md

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,20 @@ if __name__ == "__main__":
7575
- `speed`: Speech speed multiplier (default: 1.0)
7676
- `add_wav_header`: Include WAV header in output (default: True)
7777
- `transliterate`: Enable text transliteration (default: False)
78-
- `remove_extra_silence`: Remove additional silence (default: True)
78+
- `remove_extra_silence`: Remove additional silence (default: True)
79+
80+
These parameters are part of the `Smallest` instance. They can be set when creating the instance (as shown above). However, the `synthesize` function also accepts kwargs, allowing you to override these parameters for a specific synthesis request.
81+
82+
For example, you can modify the speech speed and sample rate just for a particular synthesis call:
83+
```py
84+
client.synthesize(
85+
"Hello, this is a test for sync synthesis function.",
86+
save_as="sync_synthesize.wav",
87+
speed=1.5, # Overrides default speed
88+
sample_rate=16000 # Overrides default sample rate
89+
)
90+
```
91+
7992

8093
### Async
8194
Asynchronous text-to-speech synthesis client.
@@ -107,7 +120,18 @@ if __name__ == "__main__":
107120
- `speed`: Speech speed multiplier (default: 1.0)
108121
- `add_wav_header`: Include WAV header in output (default: True)
109122
- `transliterate`: Enable text transliteration (default: False)
110-
- `remove_extra_silence`: Remove additional silence (default: True)
123+
- `remove_extra_silence`: Remove additional silence (default: False)
124+
125+
These parameters are part of the `AsyncSmallest` instance. They can be set when creating the instance (as shown above). However, the `synthesize` function also accepts keyword arguments (kwargs), allowing you to override any of these parameters on a per-request basis.
126+
127+
For example, you can modify the speech speed and sample rate just for a particular synthesis request:
128+
```py
129+
audio_bytes = await tts.synthesize(
130+
"Hello, this is a test of the async synthesis function.",
131+
speed=1.5, # Overrides default speed
132+
sample_rate=16000 # Overrides default sample rate
133+
)
134+
```
111135

112136
### LLM to Speech
113137

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "smallestai"
3-
version = "1.2.0"
3+
version = "1.3.0"
44
description = "Official Python client for the Smallest AI API"
55
authors = [
66
{name = "Smallest", email = "info@smallest.ai"},

smallest/async_tts.py

Lines changed: 85 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,20 @@
77
from .models import TTSModels, TTSVoices
88
from .exceptions import TTSError, APIError
99
from .utils import (TTSOptions, validate_input, preprocess_text, add_wav_header,
10-
get_smallest_languages, get_smallest_voices, get_smallest_models, API_BASE_URL)
10+
get_smallest_languages, get_smallest_voices, get_smallest_models, SENTENCE_END_REGEX, API_BASE_URL)
1111

1212

1313
class AsyncSmallest:
1414
def __init__(
15-
self,
16-
api_key: Optional[str] = None,
17-
model: TTSModels = "lightning",
18-
sample_rate: int = 24000,
19-
voice: TTSVoices = "emily",
20-
speed: Optional[float] = 1.0,
21-
add_wav_header: Optional[bool] = True,
22-
transliterate: Optional[bool] = False,
23-
remove_extra_silence: Optional[bool] = False
15+
self,
16+
api_key: Optional[str] = None,
17+
model: TTSModels = "lightning",
18+
sample_rate: int = 24000,
19+
voice: TTSVoices = "emily",
20+
speed: Optional[float] = 1.0,
21+
add_wav_header: Optional[bool] = True,
22+
transliterate: Optional[bool] = False,
23+
remove_extra_silence: Optional[bool] = False
2424
) -> None:
2525
"""
2626
AsyncSmallest Instance for asynchronous text-to-speech synthesis.
@@ -48,6 +48,7 @@ def __init__(
4848
self.api_key = api_key or os.environ.get("SMALLEST_API_KEY")
4949
if not self.api_key:
5050
raise TTSError("API key is required")
51+
self.chunk_size = 250
5152

5253
self.opts = TTSOptions(
5354
model=model,
@@ -70,6 +71,48 @@ async def __aexit__(self, exc_type, exc_val, exc_tb):
7071
if self.session:
7172
await self.session.close()
7273

74+
def _split_into_chunks(self, text: str) -> List[str]:
    """
    Split ``text`` into pieces of at most ``self.chunk_size`` characters.

    Each piece is cut at the best break point found inside the current
    window, in this order of preference:

    1. the last ``.`` or ``,`` in the window (the punctuation stays with
       the piece that precedes it),
    2. the last space, newline or tab in the window, so a word is not cut
       in half when there is no punctuation,
    3. a hard cut at ``self.chunk_size`` characters.

    Em dashes are replaced by a space and every piece is stripped of
    surrounding whitespace. Pieces that end up empty are dropped so that no
    empty text is returned to the caller.

    Args:
        text: The text to split.

    Returns:
        The non-empty pieces, in their original order. A text that already
        fits in ``self.chunk_size`` characters is returned as a single piece;
        an empty or whitespace-only text yields an empty list.
    """
    # Guard against a non-positive size, which would never advance the cursor.
    limit = max(1, self.chunk_size)
    chunks = []
    start = 0
    total = len(text)

    while total - start > limit:
        window = text[start:start + limit]

        # A break at index 0 would produce a punctuation-only piece, so only
        # positions > 0 count as usable break points.
        cut = max(window.rfind(mark) for mark in ".,")
        if cut <= 0:
            cut = max(window.rfind(space) for space in " \n\t")
        if cut <= 0:
            # No usable break point: split at the maximum length.
            cut = limit - 1

        end = start + cut + 1
        # Replace first, then strip, so a leading/trailing em dash does not
        # leave stray whitespace behind.
        piece = text[start:end].replace("—", " ").strip()
        if piece:
            chunks.append(piece)
        start = end

    tail = text[start:].replace("—", " ").strip()
    if tail:
        chunks.append(tail)

    return chunks
114+
115+
73116
def get_languages(self) -> List[str]:
74117
"""Returns a list of available languages."""
75118
return get_smallest_languages()
@@ -110,42 +153,45 @@ async def synthesize(
110153
setattr(opts, key, value)
111154

112155
validate_input(text, opts.voice, opts.model, opts.sample_rate, opts.speed)
113-
114-
payload = {
115-
"text": preprocess_text(text),
116-
"sample_rate": opts.sample_rate,
117-
"voice_id": opts.voice,
118-
"add_wav_header": opts.add_wav_header,
119-
"speed": opts.speed,
120-
"model": opts.model,
121-
"transliterate": opts.transliterate,
122-
"remove_extra_silence": opts.remove_extra_silence
123-
}
124-
125-
headers = {
126-
"Authorization": f"Bearer {self.api_key}",
127-
"Content-Type": "application/json",
128-
}
129-
130-
if not self.session:
131-
self.session = aiohttp.ClientSession()
156+
chunks = self._split_into_chunks(text)
157+
audio_content = b""
158+
159+
for chunk in chunks:
160+
payload = {
161+
"text": preprocess_text(chunk),
162+
"sample_rate": opts.sample_rate,
163+
"voice_id": opts.voice,
164+
"add_wav_header": False,
165+
"speed": opts.speed,
166+
"model": opts.model,
167+
"transliterate": opts.transliterate,
168+
"remove_extra_silence": opts.remove_extra_silence
169+
}
170+
171+
headers = {
172+
"Authorization": f"Bearer {self.api_key}",
173+
"Content-Type": "application/json",
174+
}
175+
176+
if not self.session:
177+
self.session = aiohttp.ClientSession()
132178

133-
async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
134-
if res.status != 200:
135-
raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
179+
async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
180+
if res.status != 200:
181+
raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
136182

137-
audio_content = await res.read()
183+
audio_content += await res.read()
138184

139185
if save_as:
140186
if not save_as.endswith(".wav"):
141187
raise TTSError("Invalid file name. Extension must be .wav")
142188

143-
if self.opts.add_wav_header:
144-
async with aiofiles.open(save_as, mode='wb') as f:
145-
await f.write(audio_content)
146-
else:
147-
async with aiofiles.open(save_as, mode='wb') as f:
148-
await f.write(add_wav_header(audio_content, self.opts.sample_rate))
189+
async with aiofiles.open(save_as, mode='wb') as f:
190+
await f.write(add_wav_header(audio_content, self.opts.sample_rate))
191+
149192
return None
150193

194+
if opts.add_wav_header:
195+
return add_wav_header(audio_content, self.opts.sample_rate)
196+
151197
return audio_content

smallest/stream_tts.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,14 @@ def __init__(
3434
max_retries: Number of retry attempts for failed synthesis (default: 3)
3535
"""
3636
self.tts_instance = tts_instance
37+
self.tts_instance.opts.add_wav_header = False
38+
3739
self.sentence_end_regex = SENTENCE_END_REGEX
3840
self.queue_timeout = queue_timeout
3941
self.max_retries = max_retries
4042
self.queue = Queue()
4143
self.buffer_size = 250
4244
self.stop_flag = False
43-
self.tts_instance.opts.add_wav_header = False
4445

4546

4647
async def _stream_llm_output(self, llm_output: AsyncGenerator[str, None]) -> None:
@@ -53,7 +54,7 @@ async def _stream_llm_output(self, llm_output: AsyncGenerator[str, None]) -> Non
5354
buffer = ""
5455
async for chunk in llm_output:
5556
buffer += chunk
56-
if self.sentence_end_regex.match(buffer) or self.buffer_size > 600:
57+
if self.sentence_end_regex.match(buffer) or len(buffer) > self.buffer_size:
5758
self.queue.put(buffer)
5859
buffer = ""
5960

0 commit comments

Comments
 (0)