fix(kokoro): cap token limit to prevent speed-up, preserve phoneme order

yocontra · yocontra · commit e7b1c1b964d8 · 2026-03-09T08:36:35.000-07:00
The Synthesizer's attention drifts on longer sequences (60+ tokens),
causing later phonemes to be spoken progressively faster. Cap
inputTokensLimit at 60 (the Duration Predictor's own limit still applies
when it is lower) so the Partitioner splits text into shorter chunks
that stay faithful to the Duration Predictor's timing.

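Also switch tokenize()'s std::partition to std::stable_partition.
std::partition does not guarantee the relative order of the elements it
keeps, so valid phoneme tokens could be reordered when invalid tokens
are filtered out; std::stable_partition preserves their order.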
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp
@@ -36,6 +36,14 @@ Kokoro::Kokoro(const std::string &lang, const std::string &taggerDataSource,
 
   context_.inputTokensLimit = durationPredictor_.getTokensLimit();
   context_.inputDurationLimit = synthesizer_.getDurationLimit();
+
+  // Cap effective token limit to prevent the Synthesizer's attention from
+  // drifting on longer sequences, which manifests as progressive speed-up
+  // in the generated audio.  Shorter chunks keep timing faithful to the
+  // Duration Predictor's output.
+  static constexpr size_t kSafeTokensLimit = 60;
+  context_.inputTokensLimit =
+      std::min(context_.inputTokensLimit, kSafeTokensLimit);
 }
 
 void Kokoro::loadVoice(const std::string &voiceSource) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.cpp
@@ -85,7 +85,7 @@ std::vector<Token> tokenize(const std::u32string &phonemes,
                               ? constants::kVocab.at(p)
                               : constants::kInvalidToken;
                  });
-  auto validSeqEnd = std::partition(
+  auto validSeqEnd = std::stable_partition(
       tokens.begin() + 1, tokens.begin() + effNoTokens + 1,
       [](Token t) -> bool { return t != constants::kInvalidToken; });
   std::fill(validSeqEnd, tokens.begin() + effNoTokens + 1,