Skip to content

Commit 59a6aca

Browse files
committed
fix(kokoro): remove indices/dur tensor repadding, guard scaleDurations correction loop
The indices padding to inputDurationLimit (and the cascading synthTokens repadding, dur tensor copying, getMethodTokenCount helpers) introduced subtle audio quality regressions — robotic pacing, clipping at chunk boundaries, and timing drift. Root cause: padding indices with zeros changed the Synthesizer's input semantics, and the dur tensor repadding discarded valid DP output rows. Additionally, the scaleDurations min-1 clamp was defeated by the correction loop immediately subtracting values back to 0. Changes: - Revert synthesize() to pass DP outputs directly to the Synthesizer (same sizes, no repadding) - Remove getMethodTokenCount() from DurationPredictor.h and Synthesizer.h - Guard scaleDurations correction loop against driving durations below 1
1 parent eb42742 commit 59a6aca

File tree

6 files changed

+21
-69
lines changed

6 files changed

+21
-69
lines changed

packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -174,9 +174,8 @@ void DurationPredictor::scaleDurations(Tensor &durations, size_t nTokens,
174174
float remainder =
175175
shrinking ? std::ceil(scaled) - scaled : scaled - std::floor(scaled);
176176

177-
durationsPtr[i] = std::max(1LL,
178-
static_cast<int64_t>(shrinking ? std::ceil(scaled)
179-
: std::floor(scaled)));
177+
durationsPtr[i] = static_cast<int64_t>(shrinking ? std::ceil(scaled)
178+
: std::floor(scaled));
180179
scaledSum += durationsPtr[i];
181180

182181
// Keeps the entries sorted by the remainders
@@ -194,4 +193,4 @@ void DurationPredictor::scaleDurations(Tensor &durations, size_t nTokens,
194193
}
195194
}
196195

197-
} // namespace rnexecutorch::models::text_to_speech::kokoro
196+
} // namespace rnexecutorch::models::text_to_speech::kokoro

packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.h

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#pragma once
22

3-
#include <algorithm>
43
#include <memory>
54
#include <span>
65
#include <string>
@@ -45,14 +44,6 @@ class DurationPredictor : public BaseModel {
4544
// Returns maximum supported amount of input tokens.
4645
size_t getTokensLimit() const;
4746

48-
// Returns the token count of the forward method that would be selected
49-
// for a given input size. E.g., input 37 -> returns 64 (forward_64).
50-
size_t getMethodTokenCount(size_t inputSize) const {
51-
auto it = std::ranges::find_if(forwardMethods_,
52-
[inputSize](const auto &e) { return e.second >= inputSize; });
53-
return (it != forwardMethods_.end()) ? it->second : forwardMethods_.back().second;
54-
}
55-
5647
private:
5748
// Helper function - duration scaling
5849
// Performs integer scaling on the durations tensor to ensure the sum of

packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp

Lines changed: 16 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,6 @@ Kokoro::Kokoro(const std::string &lang, const std::string &taggerDataSource,
3636

3737
context_.inputTokensLimit = durationPredictor_.getTokensLimit();
3838
context_.inputDurationLimit = synthesizer_.getDurationLimit();
39-
40-
// Cap effective token limit to prevent the Synthesizer's attention from
41-
// drifting on longer sequences, which manifests as progressive speed-up
42-
// in the generated audio. Shorter chunks keep timing faithful to the
43-
// Duration Predictor's output.
44-
static constexpr size_t kSafeTokensLimit = 60;
45-
context_.inputTokensLimit =
46-
std::min(context_.inputTokensLimit, kSafeTokensLimit);
4739
}
4840

4941
void Kokoro::loadVoice(const std::string &voiceSource) {
@@ -101,7 +93,6 @@ Kokoro::generateFromPhonemesImpl(const std::u32string &phonemes, float speed) {
10193
size_t pauseMs = params::kPauseValues.contains(lastPhoneme)
10294
? params::kPauseValues.at(lastPhoneme)
10395
: params::kDefaultPause;
104-
10596
// Add audio part and silence pause to the main audio vector
10697
audio.insert(audio.end(), std::make_move_iterator(audioPart.begin()),
10798
std::make_move_iterator(audioPart.end()));
@@ -219,62 +210,42 @@ std::vector<float> Kokoro::synthesize(const std::u32string &phonemes,
219210
return {};
220211
}
221212

222-
// Clamp token count: phonemes + 2 padding tokens (leading + trailing zero)
223-
size_t dpTokens = std::clamp(phonemes.size() + 2,
224-
constants::kMinInputTokens,
213+
// Clamp the input to not go beyond number of input token limits
214+
// Note that 2 tokens are always reserved for pre- and post-fix padding,
215+
// so we effectively take at most (maxNoInputTokens_ - 2) tokens.
216+
size_t noTokens = std::clamp(phonemes.size() + 2, constants::kMinInputTokens,
225217
context_.inputTokensLimit);
226218

227-
// Map phonemes to tokens, padded to dpTokens
228-
auto tokens = utils::tokenize(phonemes, {dpTokens});
219+
// Map phonemes to tokens
220+
const auto tokens = utils::tokenize(phonemes, {noTokens});
229221

230222
// Select the appropriate voice vector
231-
size_t voiceID = std::min({phonemes.size() - 1, dpTokens - 1,
223+
size_t voiceID = std::min({phonemes.size() - 1, noTokens - 1,
232224
voice_.size() - 1});
233225
auto &voice = voice_[voiceID];
234226

235-
// Initialize text mask for DP
236-
size_t realInputLength = std::min(phonemes.size() + 2, dpTokens);
237-
std::vector<uint8_t> textMask(dpTokens, false);
227+
// Initialize text mask
228+
// Exclude all the paddings apart from first and last one.
229+
size_t realInputLength = std::min(phonemes.size() + 2, noTokens);
230+
std::vector<uint8_t> textMask(noTokens, false);
238231
std::fill(textMask.begin(), textMask.begin() + realInputLength, true);
239232

240233
// Inference 1 - DurationPredictor
234+
// The resulting duration vector is already scaled at this point
241235
auto [d, indices, effectiveDuration] = durationPredictor_.generate(
242236
std::span(tokens),
243237
std::span(reinterpret_cast<bool *>(textMask.data()), textMask.size()),
244238
std::span(voice).last(constants::kVoiceRefHalfSize), speed);
245239

246-
// --- Synthesizer phase ---
247-
// The Synthesizer may have different method sizes than the DP.
248-
// Pad all inputs to the Synthesizer's selected method size.
249-
size_t synthTokens = synthesizer_.getMethodTokenCount(dpTokens);
250-
size_t dCols = d.sizes().back(); // 640
251-
252-
// Pad tokens and textMask to synthTokens (no-op when synthTokens == dpTokens)
253-
tokens.resize(synthTokens, 0);
254-
textMask.resize(synthTokens, false);
255-
256-
// Pad indices to the maximum duration limit
257-
indices.resize(context_.inputDurationLimit, 0);
258-
259-
// Prepare duration data for Synthesizer.
260-
// When sizes match, pass the DP tensor directly to avoid a 320KB copy.
261-
size_t durSize = synthTokens * dCols;
262-
std::vector<float> durPadded;
263-
float *durPtr;
264-
if (synthTokens == dpTokens) {
265-
durPtr = d.mutable_data_ptr<float>();
266-
} else {
267-
durPadded.resize(durSize, 0.0f);
268-
std::copy_n(d.const_data_ptr<float>(), dpTokens * dCols, durPadded.data());
269-
durPtr = durPadded.data();
270-
}
271-
272240
// Inference 2 - Synthesizer
273241
auto decoding = synthesizer_.generate(
274242
std::span(tokens),
275243
std::span(reinterpret_cast<bool *>(textMask.data()), textMask.size()),
276244
std::span(indices),
277-
std::span<float>(durPtr, durSize),
245+
// Note that we reduce the size of d tensor to match the initial number of
246+
// input tokens
247+
std::span<float>(d.mutable_data_ptr<float>(),
248+
noTokens * d.sizes().back()),
278249
std::span(voice));
279250
auto audioTensor = decoding->at(0).toTensor();
280251

packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,4 @@ class Kokoro {
8888
REGISTER_CONSTRUCTOR(models::text_to_speech::kokoro::Kokoro, std::string,
8989
std::string, std::string, std::string, std::string,
9090
std::string, std::shared_ptr<react::CallInvoker>);
91-
} // namespace rnexecutorch
91+
} // namespace rnexecutorch

packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.h

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#pragma once
22

3-
#include <algorithm>
43
#include <memory>
54
#include <span>
65
#include <string>
@@ -50,14 +49,6 @@ class Synthesizer : public BaseModel {
5049
size_t getTokensLimit() const;
5150
size_t getDurationLimit() const;
5251

53-
// Returns the token count of the forward method that would be selected
54-
// for a given input size. E.g., input 37 -> returns 64 (forward_64).
55-
size_t getMethodTokenCount(size_t inputSize) const {
56-
auto it = std::ranges::find_if(forwardMethods_,
57-
[inputSize](const auto &e) { return e.second >= inputSize; });
58-
return (it != forwardMethods_.end()) ? it->second : forwardMethods_.back().second;
59-
}
60-
6152
private:
6253
// Forward methods discovered at construction (e.g. forward_8, forward_64, forward_128)
6354
std::vector<std::pair<std::string, size_t>> forwardMethods_;

packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,4 +94,4 @@ std::vector<Token> tokenize(const std::u32string &phonemes,
9494
return tokens;
9595
}
9696

97-
} // namespace rnexecutorch::models::text_to_speech::kokoro::utils
97+
} // namespace rnexecutorch::models::text_to_speech::kokoro::utils

0 commit comments

Comments (0)