perf: eliminate redundant allocations in generate/stream/synthesize

yocontra · yocontra · commit 241cae9da035 · 2026-03-06T20:50:25.000-08:00
- Skip the 320KB durPadded allocation and copy when dpTokens == synthTokens (common case)
- Replace temporary pause vectors with a direct resize of the destination vector (avoids a separate temporary heap allocation)
- Capture audio by move in streaming callback (avoid full buffer copy)
- Remove pointless make_move_iterator on float vectors (moving a float is just a copy)
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp
@@ -99,13 +99,10 @@ std::vector<float> Kokoro::generate(std::string text, float speed) {
     size_t pauseMs = params::kPauseValues.contains(lastPhoneme)
                          ? params::kPauseValues.at(lastPhoneme)
                          : params::kDefaultPause;
-    std::vector<float> pause(pauseMs * constants::kSamplesPerMilisecond, 0.F);
 
-    // Add audio part and pause to the main audio vector
-    audio.insert(audio.end(), std::make_move_iterator(audioPart.begin()),
-                 std::make_move_iterator(audioPart.end()));
-    audio.insert(audio.end(), std::make_move_iterator(pause.begin()),
-                 std::make_move_iterator(pause.end()));
+    // Add audio part and silence pause to the main audio vector
+    audio.insert(audio.end(), audioPart.begin(), audioPart.end());
+    audio.resize(audio.size() + pauseMs * constants::kSamplesPerMilisecond, 0.F);
   }
 
   return audio;
@@ -119,12 +116,13 @@ void Kokoro::stream(std::string text, float speed,
   }
 
   // Build a full callback function
-  auto nativeCallback = [this, callback](const std::vector<float> &audioVec) {
+  auto nativeCallback = [this, callback](std::vector<float> audioVec) {
     if (this->isStreaming_) {
-      this->callInvoker_->invokeAsync([callback, audioVec](jsi::Runtime &rt) {
-        callback->call(rt,
-                       rnexecutorch::jsi_conversion::getJsiValue(audioVec, rt));
-      });
+      this->callInvoker_->invokeAsync(
+          [callback, audioVec = std::move(audioVec)](jsi::Runtime &rt) {
+            callback->call(
+                rt, rnexecutorch::jsi_conversion::getJsiValue(audioVec, rt));
+          });
     }
   };
 
@@ -167,14 +165,12 @@ void Kokoro::stream(std::string text, float speed,
     size_t pauseMs = params::kPauseValues.contains(lastPhoneme)
                          ? params::kPauseValues.at(lastPhoneme)
                          : params::kDefaultPause;
-    std::vector<float> pause(pauseMs * constants::kSamplesPerMilisecond, 0.F);
 
-    // Add pause to the audio vector
-    audioPart.insert(audioPart.end(), std::make_move_iterator(pause.begin()),
-                     std::make_move_iterator(pause.end()));
+    // Append silence pause directly
+    audioPart.resize(audioPart.size() + pauseMs * constants::kSamplesPerMilisecond, 0.F);
 
     // Push the audio right away to the JS side
-    nativeCallback(audioPart);
+    nativeCallback(std::move(audioPart));
   }
 
   // Mark the end of the streaming process
@@ -232,16 +228,25 @@ std::vector<float> Kokoro::synthesize(const std::u32string &phonemes,
   // Pad indices to the maximum duration limit
   indices.resize(context_.inputDurationLimit, 0);
 
-  // Pad d tensor: DP output is [1, dpTokens, 640], Synth expects [1, synthTokens, 640]
-  std::vector<float> durPadded(synthTokens * dCols, 0.0f);
-  std::copy_n(d.const_data_ptr<float>(), dpTokens * dCols, durPadded.data());
+  // Prepare duration data for Synthesizer.
+  // When sizes match, pass the DP tensor directly to avoid a 320KB copy.
+  size_t durSize = synthTokens * dCols;
+  std::vector<float> durPadded;
+  float *durPtr;
+  if (synthTokens == dpTokens) {
+    durPtr = d.mutable_data_ptr<float>();
+  } else {
+    durPadded.resize(durSize, 0.0f);
+    std::copy_n(d.const_data_ptr<float>(), dpTokens * dCols, durPadded.data());
+    durPtr = durPadded.data();
+  }
 
   // Inference 2 - Synthesizer
   auto decoding = synthesizer_.generate(
       std::span(tokens),
       std::span(reinterpret_cast<bool *>(textMask.data()), textMask.size()),
       std::span(indices),
-      std::span<float>(durPadded.data(), synthTokens * dCols),
+      std::span<float>(durPtr, durSize),
       std::span(voice));
   auto audioTensor = decoding->at(0).toTensor();