1212
1313API_BASE_URL = "https://waves-api.smallest.ai/api/v1"
1414SENTENCE_END_REGEX = re .compile (r'.*[-.—!?;:…\n]$' )
15+ CHUNK_SIZE = 250
1516SAMPLE_WIDTH = 2
1617CHANNELS = 1
1718
@@ -59,7 +60,7 @@ def preprocess_text(text: str) -> str:
5960 text = mpn .normalize (text )
6061 return text .strip ()
6162
62- def split_into_chunks (self , text : str ) -> List [str ]:
63+ def split_into_chunks (text : str ) -> List [str ]:
6364 """
6465 Splits the input text into chunks based on sentence boundaries
6566 defined by SENTENCE_END_REGEX and the maximum chunk size.
@@ -76,7 +77,7 @@ def split_into_chunks(self, text: str) -> List[str]:
7677 if SENTENCE_END_REGEX .match (current_chunk ):
7778 last_break_index = i
7879
79- if len (current_chunk ) >= self . chunk_size :
80+ if len (current_chunk ) >= CHUNK_SIZE :
8081 if last_break_index > 0 :
8182 # Split at the last valid sentence boundary
8283 chunk = text [:last_break_index + 1 ].strip ()
@@ -91,7 +92,7 @@ def split_into_chunks(self, text: str) -> List[str]:
9192 # No sentence boundary found, split at max length
9293 current_chunk = current_chunk .replace ("—" , " " )
9394 chunks .append (current_chunk .strip ())
94- text = text [self . chunk_size :]
95+ text = text [CHUNK_SIZE :]
9596 i = - 1 # Reset index to process the remaining text
9697 current_chunk = ""
9798
0 commit comments