Skip to content

Commit c7c8aa9

Browse files
committed
Address PR review feedback from IgorSwat and msluszniak
- Revert scaleDurations min-1 clamp to avoid exceeding 296 duration cap
- Remove kSafeTokensLimit=60 cap that cut text mid-sentence
- Use std::ranges::find_if in Synthesizer.cpp
1 parent b1e70d8 commit c7c8aa9

File tree

3 files changed

+4
-18
lines changed

3 files changed

+4
-18
lines changed

packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.cpp

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -174,9 +174,8 @@ void DurationPredictor::scaleDurations(Tensor &durations, size_t nTokens,
174174
float remainder =
175175
shrinking ? std::ceil(scaled) - scaled : scaled - std::floor(scaled);
176176

177-
durationsPtr[i] = std::max(static_cast<int64_t>(1),
178-
static_cast<int64_t>(shrinking ? std::ceil(scaled)
179-
: std::floor(scaled)));
177+
durationsPtr[i] = static_cast<int64_t>(shrinking ? std::ceil(scaled)
178+
: std::floor(scaled));
180179
scaledSum += durationsPtr[i];
181180

182181
// Keeps the entries sorted by the remainders
@@ -189,13 +188,8 @@ void DurationPredictor::scaleDurations(Tensor &durations, size_t nTokens,
189188
int32_t diff = std::abs(targetDuration - scaledSum);
190189
for (uint32_t i = 0; i < diff; i++) {
191190
auto [remainder, idx] = remainders.top();
192-
remainders.pop();
193-
// Never drive a duration below 1 — the min-1 clamp above prevents
194-
// phoneme deletion, so the correction loop must respect it too.
195-
if (shrinking && durationsPtr[idx] <= 1) {
196-
continue;
197-
}
198191
durationsPtr[idx] += shrinking ? -1 : 1;
192+
remainders.pop();
199193
}
200194
}
201195

packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,6 @@ Kokoro::Kokoro(const std::string &lang, const std::string &taggerDataSource,
3535

3636
context_.inputTokensLimit = durationPredictor_.getTokensLimit();
3737
context_.inputDurationLimit = synthesizer_.getDurationLimit();
38-
39-
// Cap effective token limit to prevent the Synthesizer's attention from
40-
// drifting on longer sequences, which manifests as progressive speed-up
41-
// in the generated audio. Shorter chunks keep timing faithful to the
42-
// Duration Predictor's output.
43-
static constexpr size_t kSafeTokensLimit = 60;
44-
context_.inputTokensLimit =
45-
std::min(context_.inputTokensLimit, kSafeTokensLimit);
4638
}
4739

4840
void Kokoro::loadVoice(const std::string &voiceSource) {

packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ Result<std::vector<EValue>> Synthesizer::generate(std::span<const Token> tokens,
7171
ref_s.data(), ScalarType::Float);
7272

7373
// Select appropriate forward method based on token count
74-
auto it = std::find_if(forwardMethods_.begin(), forwardMethods_.end(),
74+
auto it = std::ranges::find_if(forwardMethods_,
7575
[noTokens](const auto &entry) { return static_cast<int32_t>(entry.second) >= noTokens; });
7676
std::string selectedMethod = (it != forwardMethods_.end()) ? it->first : forwardMethods_.back().first;
7777

0 commit comments

Comments
 (0)