Skip to content

Commit 59a6aca

Browse files
committed
fix(kokoro): remove indices/dur tensor repadding, guard scaleDurations correction loop
The indices padding to inputDurationLimit (and the cascading synthTokens repadding, dur tensor copying, getMethodTokenCount helpers) introduced subtle audio quality regressions — robotic pacing, clipping at chunk boundaries, and timing drift. Root cause: padding indices with zeros changed the Synthesizer's input semantics, and the dur tensor repadding discarded valid DP output rows. Additionally, the scaleDurations min-1 clamp was defeated by the correction loop immediately subtracting values back to 0. Changes: - Revert synthesize() to pass DP outputs directly to the Synthesizer (same sizes, no repadding) - Remove getMethodTokenCount() from DurationPredictor.h and Synthesizer.h - Guard scaleDurations correction loop against driving durations below 1
1 parent eb42742 commit 59a6aca

File tree

6 files changed

+21
-69
lines changed

6 files changed

+21
-69
lines changed

packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -174,9 +174,8 @@ void DurationPredictor::scaleDurations(Tensor &durations, size_t nTokens,
174174
float remainder =
175175
shrinking ? std::ceil(scaled) - scaled : scaled - std::floor(scaled);
176176

177-
durationsPtr[i] = std::max(1LL,
178-
static_cast<int64_t>(shrinking ? std::ceil(scaled)
179-
: std::floor(scaled)));
177+
durationsPtr[i] = static_cast<int64_t>(shrinking ? std::ceil(scaled)
178+
: std::floor(scaled));
180179
scaledSum += durationsPtr[i];
181180

182181
// Keeps the entries sorted by the remainders
@@ -194,4 +193,4 @@ void DurationPredictor::scaleDurations(Tensor &durations, size_t nTokens,
194193
}
195194
}
196195

197-
} // namespace rnexecutorch::models::text_to_speech::kokoro
196+
} // namespace rnexecutorch::models::text_to_speech::kokoro

packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.h

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#pragma once
22

3-
#include <algorithm>
43
#include <memory>
54
#include <span>
65
#include <string>
@@ -45,14 +44,6 @@ class DurationPredictor : public BaseModel {
4544
// Returns maximum supported amount of input tokens.
4645
size_t getTokensLimit() const;
4746

48-
// Returns the token count of the forward method that would be selected
49-
// for a given input size. E.g., input 37 -> returns 64 (forward_64).
50-
size_t getMethodTokenCount(size_t inputSize) const {
51-
auto it = std::ranges::find_if(forwardMethods_,
52-
[inputSize](const auto &e) { return e.second >= inputSize; });
53-
return (it != forwardMethods_.end()) ? it->second : forwardMethods_.back().second;
54-
}
55-
5647
private:
5748
// Helper function - duration scaling
5849
// Performs integer scaling on the durations tensor to ensure the sum of

packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp

Lines changed: 16 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,6 @@ Kokoro::Kokoro(const std::string &lang, const std::string &taggerDataSource,
3636

3737
context_.inputTokensLimit = durationPredictor_.getTokensLimit();
3838
context_.inputDurationLimit = synthesizer_.getDurationLimit();
39-
40-
// Cap effective token limit to prevent the Synthesizer's attention from
41-
// drifting on longer sequences, which manifests as progressive speed-up
42-
// in the generated audio. Shorter chunks keep timing faithful to the
43-
// Duration Predictor's output.
44-
static constexpr size_t kSafeTokensLimit = 60;
45-
context_.inputTokensLimit =
46-
std::min(context_.inputTokensLimit, kSafeTokensLimit);
4739
}
4840

4941
void Kokoro::loadVoice(const std::string &voiceSource) {
@@ -101,7 +93,6 @@ Kokoro::generateFromPhonemesImpl(const std::u32string &phonemes, float speed) {
10193
size_t pauseMs = params::kPauseValues.contains(lastPhoneme)
10294
? params::kPauseValues.at(lastPhoneme)
10395
: params::kDefaultPause;
104-
10596
// Add audio part and silence pause to the main audio vector
10697
audio.insert(audio.end(), std::make_move_iterator(audioPart.begin()),
10798
std::make_move_iterator(audioPart.end()));
@@ -219,62 +210,42 @@ std::vector<float> Kokoro::synthesize(const std::u32string &phonemes,
219210
return {};
220211
}
221212

222-
// Clamp token count: phonemes + 2 padding tokens (leading + trailing zero)
223-
size_t dpTokens = std::clamp(phonemes.size() + 2,
224-
constants::kMinInputTokens,
213+
// Clamp the input to not go beyond number of input token limits
214+
// Note that 2 tokens are always reserved for pre- and post-fix padding,
215+
// so we effectively take at most (maxNoInputTokens_ - 2) tokens.
216+
size_t noTokens = std::clamp(phonemes.size() + 2, constants::kMinInputTokens,
225217
context_.inputTokensLimit);
226218

227-
// Map phonemes to tokens, padded to dpTokens
228-
auto tokens = utils::tokenize(phonemes, {dpTokens});
219+
// Map phonemes to tokens
220+
const auto tokens = utils::tokenize(phonemes, {noTokens});
229221

230222
// Select the appropriate voice vector
231-
size_t voiceID = std::min({phonemes.size() - 1, dpTokens - 1,
223+
size_t voiceID = std::min({phonemes.size() - 1, noTokens - 1,
232224
voice_.size() - 1});
233225
auto &voice = voice_[voiceID];
234226

235-
// Initialize text mask for DP
236-
size_t realInputLength = std::min(phonemes.size() + 2, dpTokens);
237-
std::vector<uint8_t> textMask(dpTokens, false);
227+
// Initialize text mask
228+
// Exclude all the paddings apart from first and last one.
229+
size_t realInputLength = std::min(phonemes.size() + 2, noTokens);
230+
std::vector<uint8_t> textMask(noTokens, false);
238231
std::fill(textMask.begin(), textMask.begin() + realInputLength, true);
239232

240233
// Inference 1 - DurationPredictor
234+
// The resulting duration vector is already scaled at this point
241235
auto [d, indices, effectiveDuration] = durationPredictor_.generate(
242236
std::span(tokens),
243237
std::span(reinterpret_cast<bool *>(textMask.data()), textMask.size()),
244238
std::span(voice).last(constants::kVoiceRefHalfSize), speed);
245239

246-
// --- Synthesizer phase ---
247-
// The Synthesizer may have different method sizes than the DP.
248-
// Pad all inputs to the Synthesizer's selected method size.
249-
size_t synthTokens = synthesizer_.getMethodTokenCount(dpTokens);
250-
size_t dCols = d.sizes().back(); // 640
251-
252-
// Pad tokens and textMask to synthTokens (no-op when synthTokens == dpTokens)
253-
tokens.resize(synthTokens, 0);
254-
textMask.resize(synthTokens, false);
255-
256-
// Pad indices to the maximum duration limit
257-
indices.resize(context_.inputDurationLimit, 0);
258-
259-
// Prepare duration data for Synthesizer.
260-
// When sizes match, pass the DP tensor directly to avoid a 320KB copy.
261-
size_t durSize = synthTokens * dCols;
262-
std::vector<float> durPadded;
263-
float *durPtr;
264-
if (synthTokens == dpTokens) {
265-
durPtr = d.mutable_data_ptr<float>();
266-
} else {
267-
durPadded.resize(durSize, 0.0f);
268-
std::copy_n(d.const_data_ptr<float>(), dpTokens * dCols, durPadded.data());
269-
durPtr = durPadded.data();
270-
}
271-
272240
// Inference 2 - Synthesizer
273241
auto decoding = synthesizer_.generate(
274242
std::span(tokens),
275243
std::span(reinterpret_cast<bool *>(textMask.data()), textMask.size()),
276244
std::span(indices),
277-
std::span<float>(durPtr, durSize),
245+
// Note that we reduce the size of d tensor to match the initial number of
246+
// input tokens
247+
std::span<float>(d.mutable_data_ptr<float>(),
248+
noTokens * d.sizes().back()),
278249
std::span(voice));
279250
auto audioTensor = decoding->at(0).toTensor();
280251

packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,4 @@ class Kokoro {
8888
REGISTER_CONSTRUCTOR(models::text_to_speech::kokoro::Kokoro, std::string,
8989
std::string, std::string, std::string, std::string,
9090
std::string, std::shared_ptr<react::CallInvoker>);
91-
} // namespace rnexecutorch
91+
} // namespace rnexecutorch

packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.h

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#pragma once
22

3-
#include <algorithm>
43
#include <memory>
54
#include <span>
65
#include <string>
@@ -50,14 +49,6 @@ class Synthesizer : public BaseModel {
5049
size_t getTokensLimit() const;
5150
size_t getDurationLimit() const;
5251

53-
// Returns the token count of the forward method that would be selected
54-
// for a given input size. E.g., input 37 -> returns 64 (forward_64).
55-
size_t getMethodTokenCount(size_t inputSize) const {
56-
auto it = std::ranges::find_if(forwardMethods_,
57-
[inputSize](const auto &e) { return e.second >= inputSize; });
58-
return (it != forwardMethods_.end()) ? it->second : forwardMethods_.back().second;
59-
}
60-
6152
private:
6253
// Forward methods discovered at construction (e.g. forward_8, forward_64, forward_128)
6354
std::vector<std::pair<std::string, size_t>> forwardMethods_;

packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,4 +94,4 @@ std::vector<Token> tokenize(const std::u32string &phonemes,
9494
return tokens;
9595
}
9696

97-
} // namespace rnexecutorch::models::text_to_speech::kokoro::utils
97+
} // namespace rnexecutorch::models::text_to_speech::kokoro::utils

0 commit comments

Comments (0)