@@ -99,13 +99,10 @@ std::vector<float> Kokoro::generate(std::string text, float speed) {
9999 size_t pauseMs = params::kPauseValues .contains (lastPhoneme)
100100 ? params::kPauseValues .at (lastPhoneme)
101101 : params::kDefaultPause ;
102- std::vector<float > pause (pauseMs * constants::kSamplesPerMilisecond , 0 .F );
103102
104- // Add audio part and pause to the main audio vector
105- audio.insert (audio.end (), std::make_move_iterator (audioPart.begin ()),
106- std::make_move_iterator (audioPart.end ()));
107- audio.insert (audio.end (), std::make_move_iterator (pause.begin ()),
108- std::make_move_iterator (pause.end ()));
103+ // Add audio part and silence pause to the main audio vector
104+ audio.insert (audio.end (), audioPart.begin (), audioPart.end ());
105+ audio.resize (audio.size () + pauseMs * constants::kSamplesPerMilisecond , 0 .F );
109106 }
110107
111108 return audio;
@@ -119,12 +116,13 @@ void Kokoro::stream(std::string text, float speed,
119116 }
120117
121118 // Build a full callback function
122- auto nativeCallback = [this , callback](const std::vector<float > & audioVec) {
119+ auto nativeCallback = [this , callback](std::vector<float > audioVec) {
123120 if (this ->isStreaming_ ) {
124- this ->callInvoker_ ->invokeAsync ([callback, audioVec](jsi::Runtime &rt) {
125- callback->call (rt,
126- rnexecutorch::jsi_conversion::getJsiValue (audioVec, rt));
127- });
121+ this ->callInvoker_ ->invokeAsync (
122+ [callback, audioVec = std::move (audioVec)](jsi::Runtime &rt) {
123+ callback->call (
124+ rt, rnexecutorch::jsi_conversion::getJsiValue (audioVec, rt));
125+ });
128126 }
129127 };
130128
@@ -167,14 +165,12 @@ void Kokoro::stream(std::string text, float speed,
167165 size_t pauseMs = params::kPauseValues .contains (lastPhoneme)
168166 ? params::kPauseValues .at (lastPhoneme)
169167 : params::kDefaultPause ;
170- std::vector<float > pause (pauseMs * constants::kSamplesPerMilisecond , 0 .F );
171168
172- // Add pause to the audio vector
173- audioPart.insert (audioPart.end (), std::make_move_iterator (pause.begin ()),
174- std::make_move_iterator (pause.end ()));
169+ // Append silence pause directly
170+ audioPart.resize (audioPart.size () + pauseMs * constants::kSamplesPerMilisecond , 0 .F );
175171
176172 // Push the audio right away to the JS side
177- nativeCallback (audioPart);
173+ nativeCallback (std::move ( audioPart) );
178174 }
179175
180176 // Mark the end of the streaming process
@@ -232,16 +228,25 @@ std::vector<float> Kokoro::synthesize(const std::u32string &phonemes,
232228 // Pad indices to the maximum duration limit
233229 indices.resize (context_.inputDurationLimit , 0 );
234230
235- // Pad d tensor: DP output is [1, dpTokens, 640], Synth expects [1, synthTokens, 640]
236- std::vector<float > durPadded (synthTokens * dCols, 0 .0f );
237- std::copy_n (d.const_data_ptr <float >(), dpTokens * dCols, durPadded.data ());
231+ // Prepare duration data for Synthesizer.
232+ // When sizes match, pass the DP tensor directly to avoid a 320KB copy.
233+ size_t durSize = synthTokens * dCols;
234+ std::vector<float > durPadded;
235+ float *durPtr;
236+ if (synthTokens == dpTokens) {
237+ durPtr = d.mutable_data_ptr <float >();
238+ } else {
239+ durPadded.resize (durSize, 0 .0f );
240+ std::copy_n (d.const_data_ptr <float >(), dpTokens * dCols, durPadded.data ());
241+ durPtr = durPadded.data ();
242+ }
238243
239244 // Inference 2 - Synthesizer
240245 auto decoding = synthesizer_.generate (
241246 std::span (tokens),
242247 std::span (reinterpret_cast <bool *>(textMask.data ()), textMask.size ()),
243248 std::span (indices),
244- std::span<float >(durPadded. data (), synthTokens * dCols ),
249+ std::span<float >(durPtr, durSize ),
245250 std::span (voice));
246251 auto audioTensor = decoding->at (0 ).toTensor ();
247252
0 commit comments