Skip to content

Commit 241cae9

Browse files
committed
perf: eliminate redundant allocations in synthesize/stream
- Skip 320KB durPadded allocation when dpTokens == synthTokens (common case)
- Replace temporary pause vectors with direct resize (avoid heap alloc)
- Capture audio by move in streaming callback (avoid full buffer copy)
- Remove pointless make_move_iterator on float vectors
1 parent 9ae3244 commit 241cae9

File tree

1 file changed

+25
-20
lines changed
  • packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro

1 file changed

+25
-20
lines changed

packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -99,13 +99,10 @@ std::vector<float> Kokoro::generate(std::string text, float speed) {
9999
size_t pauseMs = params::kPauseValues.contains(lastPhoneme)
100100
? params::kPauseValues.at(lastPhoneme)
101101
: params::kDefaultPause;
102-
std::vector<float> pause(pauseMs * constants::kSamplesPerMilisecond, 0.F);
103102

104-
// Add audio part and pause to the main audio vector
105-
audio.insert(audio.end(), std::make_move_iterator(audioPart.begin()),
106-
std::make_move_iterator(audioPart.end()));
107-
audio.insert(audio.end(), std::make_move_iterator(pause.begin()),
108-
std::make_move_iterator(pause.end()));
103+
// Add audio part and silence pause to the main audio vector
104+
audio.insert(audio.end(), audioPart.begin(), audioPart.end());
105+
audio.resize(audio.size() + pauseMs * constants::kSamplesPerMilisecond, 0.F);
109106
}
110107

111108
return audio;
@@ -119,12 +116,13 @@ void Kokoro::stream(std::string text, float speed,
119116
}
120117

121118
// Build a full callback function
122-
auto nativeCallback = [this, callback](const std::vector<float> &audioVec) {
119+
auto nativeCallback = [this, callback](std::vector<float> audioVec) {
123120
if (this->isStreaming_) {
124-
this->callInvoker_->invokeAsync([callback, audioVec](jsi::Runtime &rt) {
125-
callback->call(rt,
126-
rnexecutorch::jsi_conversion::getJsiValue(audioVec, rt));
127-
});
121+
this->callInvoker_->invokeAsync(
122+
[callback, audioVec = std::move(audioVec)](jsi::Runtime &rt) {
123+
callback->call(
124+
rt, rnexecutorch::jsi_conversion::getJsiValue(audioVec, rt));
125+
});
128126
}
129127
};
130128

@@ -167,14 +165,12 @@ void Kokoro::stream(std::string text, float speed,
167165
size_t pauseMs = params::kPauseValues.contains(lastPhoneme)
168166
? params::kPauseValues.at(lastPhoneme)
169167
: params::kDefaultPause;
170-
std::vector<float> pause(pauseMs * constants::kSamplesPerMilisecond, 0.F);
171168

172-
// Add pause to the audio vector
173-
audioPart.insert(audioPart.end(), std::make_move_iterator(pause.begin()),
174-
std::make_move_iterator(pause.end()));
169+
// Append silence pause directly
170+
audioPart.resize(audioPart.size() + pauseMs * constants::kSamplesPerMilisecond, 0.F);
175171

176172
// Push the audio right away to the JS side
177-
nativeCallback(audioPart);
173+
nativeCallback(std::move(audioPart));
178174
}
179175

180176
// Mark the end of the streaming process
@@ -232,16 +228,25 @@ std::vector<float> Kokoro::synthesize(const std::u32string &phonemes,
232228
// Pad indices to the maximum duration limit
233229
indices.resize(context_.inputDurationLimit, 0);
234230

235-
// Pad d tensor: DP output is [1, dpTokens, 640], Synth expects [1, synthTokens, 640]
236-
std::vector<float> durPadded(synthTokens * dCols, 0.0f);
237-
std::copy_n(d.const_data_ptr<float>(), dpTokens * dCols, durPadded.data());
231+
// Prepare duration data for Synthesizer.
232+
// When sizes match, pass the DP tensor directly to avoid a 320KB copy.
233+
size_t durSize = synthTokens * dCols;
234+
std::vector<float> durPadded;
235+
float *durPtr;
236+
if (synthTokens == dpTokens) {
237+
durPtr = d.mutable_data_ptr<float>();
238+
} else {
239+
durPadded.resize(durSize, 0.0f);
240+
std::copy_n(d.const_data_ptr<float>(), dpTokens * dCols, durPadded.data());
241+
durPtr = durPadded.data();
242+
}
238243

239244
// Inference 2 - Synthesizer
240245
auto decoding = synthesizer_.generate(
241246
std::span(tokens),
242247
std::span(reinterpret_cast<bool *>(textMask.data()), textMask.size()),
243248
std::span(indices),
244-
std::span<float>(durPadded.data(), synthTokens * dCols),
249+
std::span<float>(durPtr, durSize),
245250
std::span(voice));
246251
auto audioTensor = decoding->at(0).toTensor();
247252

0 commit comments

Comments
 (0)