Skip to content

Commit f9fcb04

Browse files
mkopcins and Mateusz Kopciński authored
bug: fix to corrupted utf-8 encoding for special characters in transcription (#652)
## Description

Fix for issue #651. The problem was the serialization of a C++ string into a jsi::Value and then back to a string, which produced invalid characters and bytes. Now the C++ side returns an array of bytes and the decoding is done on the JS side.

### Introduces a breaking change?

- [ ] Yes
- [x] No

### Type of change

- [x] Bug fix (change which fixes an issue)
- [ ] New feature (change which adds functionality)
- [ ] Documentation update (improves or adds clarity to existing documentation)
- [ ] Other (chores, tests, code style improvements etc.)

### Tested on

- [x] iOS
- [x] Android

### Testing instructions

<!-- Provide step-by-step instructions on how to test your changes. Include setup details if necessary. -->

### Screenshots

<!-- Add screenshots here, if applicable -->

### Related issues

<!-- Link related issues here using #issue-number -->

### Checklist

- [ ] I have performed a self-review of my code
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have updated the documentation accordingly
- [ ] My changes generate no new warnings

### Additional notes

<!-- Include any additional information, assumptions, or context that reviewers might need to understand this PR. -->

---------

Co-authored-by: Mateusz Kopciński <mateusz.kopcinski@swmansnion.com>
1 parent 0cfc2e5 commit f9fcb04

File tree

4 files changed

+60
-28
lines changed

4 files changed

+60
-28
lines changed

packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,8 @@ inline JSTensorViewIn getValue<JSTensorViewIn>(const jsi::Value &val,
6666
tensorView.sizes.reserve(numShapeDims);
6767

6868
for (size_t i = 0; i < numShapeDims; ++i) {
69-
int32_t dim = getValue<int32_t>(shapeArray.getValueAtIndex(runtime, i), runtime);
69+
int32_t dim =
70+
getValue<int32_t>(shapeArray.getValueAtIndex(runtime, i), runtime);
7071
tensorView.sizes.push_back(dim);
7172
}
7273

@@ -173,23 +174,24 @@ inline std::vector<T> getArrayAsVector(const jsi::Value &val,
173174
return result;
174175
}
175176

176-
177177
// Template specializations for std::vector<T> types
178178
template <>
179-
inline std::vector<JSTensorViewIn> getValue<std::vector<JSTensorViewIn>>(const jsi::Value &val,
180-
jsi::Runtime &runtime) {
179+
inline std::vector<JSTensorViewIn>
180+
getValue<std::vector<JSTensorViewIn>>(const jsi::Value &val,
181+
jsi::Runtime &runtime) {
181182
return getArrayAsVector<JSTensorViewIn>(val, runtime);
182183
}
183184

184185
template <>
185-
inline std::vector<std::string> getValue<std::vector<std::string>>(const jsi::Value &val,
186-
jsi::Runtime &runtime) {
186+
inline std::vector<std::string>
187+
getValue<std::vector<std::string>>(const jsi::Value &val,
188+
jsi::Runtime &runtime) {
187189
return getArrayAsVector<std::string>(val, runtime);
188190
}
189191

190192
template <>
191-
inline std::vector<int32_t> getValue<std::vector<int32_t>>(const jsi::Value &val,
192-
jsi::Runtime &runtime) {
193+
inline std::vector<int32_t>
194+
getValue<std::vector<int32_t>>(const jsi::Value &val, jsi::Runtime &runtime) {
193195
return getArrayAsVector<int32_t>(val, runtime);
194196
}
195197

@@ -280,6 +282,15 @@ inline jsi::Value getJsiValue(const std::vector<int32_t> &vec,
280282
return {runtime, array};
281283
}
282284

285+
/// Converts a raw byte buffer into a JS array of numbers, one per byte.
///
/// Each element is emitted as an unsigned byte value in the range 0-255.
/// Whether plain `char` is signed is implementation-defined, so every byte is
/// normalised through `unsigned char` first; otherwise bytes >= 0x80 (i.e. all
/// bytes of multi-byte UTF-8 sequences) would reach JS as negative numbers on
/// targets where `char` is signed.
///
/// @param vec     Bytes to expose to JS.
/// @param runtime JSI runtime in which the array is created.
/// @return A jsi::Value wrapping a JS array with `vec.size()` numeric entries.
inline jsi::Value getJsiValue(const std::vector<char> &vec,
                              jsi::Runtime &runtime) {
  jsi::Array array(runtime, vec.size());
  for (size_t i = 0; i < vec.size(); ++i) {
    // Go through unsigned char so the value is platform-independent (0-255).
    const int byteValue = static_cast<unsigned char>(vec[i]);
    array.setValueAtIndex(runtime, i, jsi::Value(byteValue));
  }
  return {runtime, array};
}
293+
283294
inline jsi::Value getJsiValue(int val, jsi::Runtime &runtime) {
284295
return {runtime, val};
285296
}

packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ SpeechToText::decode(std::span<int32_t> tokens,
4141
return std::make_shared<OwningArrayBuffer>(decoderOutput);
4242
}
4343

44-
std::string SpeechToText::transcribe(std::span<float> waveform,
45-
std::string languageOption) const {
44+
std::vector<char> SpeechToText::transcribe(std::span<float> waveform,
45+
std::string languageOption) const {
4646
std::vector<Segment> segments =
4747
this->asr->transcribe(waveform, DecodingOptions(languageOption));
4848
std::string transcription;
@@ -60,7 +60,8 @@ std::string SpeechToText::transcribe(std::span<float> waveform,
6060
transcription += word.content;
6161
}
6262
}
63-
return transcription;
63+
64+
return {transcription.begin(), transcription.end()};
6465
}
6566

6667
size_t SpeechToText::getMemoryLowerBound() const noexcept {
@@ -74,16 +75,17 @@ void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
7475
throw std::runtime_error("Streaming is already in progress");
7576
}
7677

77-
auto nativeCallback = [this, callback](const std::string &committed,
78-
const std::string &nonCommitted,
79-
bool isDone) {
80-
this->callInvoker->invokeAsync(
81-
[callback, committed, nonCommitted, isDone](jsi::Runtime &rt) {
82-
callback->call(rt, jsi::String::createFromUtf8(rt, committed),
83-
jsi::String::createFromUtf8(rt, nonCommitted),
84-
jsi::Value(isDone));
78+
auto nativeCallback =
79+
[this, callback](const std::vector<char> &committedVec,
80+
const std::vector<char> &nonCommittedVec, bool isDone) {
81+
this->callInvoker->invokeAsync([callback, committedVec, nonCommittedVec,
82+
isDone](jsi::Runtime &rt) {
83+
callback->call(
84+
rt, rnexecutorch::jsi_conversion::getJsiValue(committedVec, rt),
85+
rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, rt),
86+
jsi::Value(isDone));
8587
});
86-
};
88+
};
8789

8890
this->isStreaming = true;
8991
while (this->isStreaming) {
@@ -94,12 +96,15 @@ void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
9496
}
9597
ProcessResult res =
9698
this->processor->processIter(DecodingOptions(languageOption));
97-
nativeCallback(res.committed, res.nonCommitted, false);
99+
100+
nativeCallback({res.committed.begin(), res.committed.end()},
101+
{res.nonCommitted.begin(), res.nonCommitted.end()}, false);
98102
this->readyToProcess = false;
99103
}
100104

101105
std::string committed = this->processor->finish();
102-
nativeCallback(committed, "", true);
106+
107+
nativeCallback({committed.begin(), committed.end()}, {}, true);
103108

104109
this->resetStreamState();
105110
}

packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
#pragma once
22

33
#include "rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h"
4+
#include <span>
5+
#include <string>
6+
#include <vector>
47

58
namespace rnexecutorch {
69

@@ -17,8 +20,8 @@ class SpeechToText {
1720
std::shared_ptr<OwningArrayBuffer> encode(std::span<float> waveform) const;
1821
std::shared_ptr<OwningArrayBuffer>
1922
decode(std::span<int32_t> tokens, std::span<float> encoderOutput) const;
20-
std::string transcribe(std::span<float> waveform,
21-
std::string languageOption) const;
23+
std::vector<char> transcribe(std::span<float> waveform,
24+
std::string languageOption) const;
2225

2326
size_t getMemoryLowerBound() const noexcept;
2427

packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ export class SpeechToTextModule {
77

88
private modelConfig!: SpeechToTextModelConfig;
99

10+
private textDecoder = new TextDecoder('utf-8', {
11+
fatal: false,
12+
ignoreBOM: true,
13+
});
14+
1015
public async load(
1116
model: SpeechToTextModelConfig,
1217
onDownloadProgressCallback: (progress: number) => void = () => {}
@@ -87,8 +92,11 @@ export class SpeechToTextModule {
8792
);
8893
waveform = new Float32Array(waveform);
8994
}
90-
91-
return this.nativeModule.transcribe(waveform, options.language || '');
95+
const transcriptionBytes = await this.nativeModule.transcribe(
96+
waveform,
97+
options.language || ''
98+
);
99+
return this.textDecoder.decode(new Uint8Array(transcriptionBytes));
92100
}
93101

94102
public async *stream(
@@ -109,8 +117,13 @@ export class SpeechToTextModule {
109117
(async () => {
110118
try {
111119
await this.nativeModule.stream(
112-
(committed: string, nonCommitted: string, isDone: boolean) => {
113-
queue.push({ committed, nonCommitted });
120+
(committed: number[], nonCommitted: number[], isDone: boolean) => {
121+
queue.push({
122+
committed: this.textDecoder.decode(new Uint8Array(committed)),
123+
nonCommitted: this.textDecoder.decode(
124+
new Uint8Array(nonCommitted)
125+
),
126+
});
114127
if (isDone) {
115128
finished = true;
116129
}

0 commit comments

Comments
 (0)