bug: fix corrupted UTF-8 encoding of special characters in transcription (#652)

mkopcins · Mateusz Kopciński · web-flow · commit f9fcb04d253e · 2025-10-17T15:54:17.000+02:00
## Description Fix for issue #651. The problem was that the C++ string was serialized into a jsi::Value and then converted back to a string, which produced invalid characters and bytes. Now the C++ side returns an array of bytes and the decoding is done on the JS side. ### Introduces a breaking change? - [ ] Yes - [x] No ### Type of change - [x] Bug fix (change which fixes an issue) - [ ] New feature (change which adds functionality) - [ ] Documentation update (improves or adds clarity to existing documentation) - [ ] Other (chores, tests, code style improvements etc.) ### Tested on - [x] iOS - [x] Android ### Testing instructions  ### Screenshots  ### Related issues  ### Checklist - [ ] I have performed a self-review of my code - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] I have updated the documentation accordingly - [ ] My changes generate no new warnings ### Additional notes  --------- Co-authored-by: Mateusz Kopciński <mateusz.kopcinski@swmansnion.com>
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -66,7 +66,8 @@ inline JSTensorViewIn getValue<JSTensorViewIn>(const jsi::Value &val,
   tensorView.sizes.reserve(numShapeDims);
 
   for (size_t i = 0; i < numShapeDims; ++i) {
-    int32_t dim = getValue<int32_t>(shapeArray.getValueAtIndex(runtime, i), runtime);
+    int32_t dim =
+        getValue<int32_t>(shapeArray.getValueAtIndex(runtime, i), runtime);
     tensorView.sizes.push_back(dim);
   }
 
@@ -173,23 +174,24 @@ inline std::vector<T> getArrayAsVector(const jsi::Value &val,
   return result;
 }
 
-
 // Template specializations for std::vector<T> types
 template <>
-inline std::vector<JSTensorViewIn> getValue<std::vector<JSTensorViewIn>>(const jsi::Value &val,
-                                                       jsi::Runtime &runtime) {
+inline std::vector<JSTensorViewIn>
+getValue<std::vector<JSTensorViewIn>>(const jsi::Value &val,
+                                      jsi::Runtime &runtime) {
   return getArrayAsVector<JSTensorViewIn>(val, runtime);
 }
 
 template <>
-inline std::vector<std::string> getValue<std::vector<std::string>>(const jsi::Value &val,
-                                                       jsi::Runtime &runtime) {
+inline std::vector<std::string>
+getValue<std::vector<std::string>>(const jsi::Value &val,
+                                   jsi::Runtime &runtime) {
   return getArrayAsVector<std::string>(val, runtime);
 }
 
 template <>
-inline std::vector<int32_t> getValue<std::vector<int32_t>>(const jsi::Value &val,
-                                                       jsi::Runtime &runtime) {
+inline std::vector<int32_t>
+getValue<std::vector<int32_t>>(const jsi::Value &val, jsi::Runtime &runtime) {
   return getArrayAsVector<int32_t>(val, runtime);
 }
 
@@ -280,6 +282,21 @@ inline jsi::Value getJsiValue(const std::vector<int32_t> &vec,
   return {runtime, array};
 }
 
+// Converts a raw byte buffer into a JS array of numbers, one element per byte
+// and in the same order as `vec`.
+// Each byte is widened through `unsigned char`, so every element is in the
+// range 0..255 regardless of whether plain `char` is signed on the target.
+inline jsi::Value getJsiValue(const std::vector<char> &vec,
+                              jsi::Runtime &runtime) {
+  jsi::Array array(runtime, vec.size());
+  for (size_t i = 0; i < vec.size(); i++) {
+    // Without the unsigned step, bytes >= 0x80 would sign-extend to negatives.
+    const int byte = static_cast<int>(static_cast<unsigned char>(vec[i]));
+    array.setValueAtIndex(runtime, i, jsi::Value(byte));
+  }
+  return {runtime, array};
+}
+
 inline jsi::Value getJsiValue(int val, jsi::Runtime &runtime) {
   return {runtime, val};
 }
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
@@ -41,8 +41,8 @@ SpeechToText::decode(std::span<int32_t> tokens,
   return std::make_shared<OwningArrayBuffer>(decoderOutput);
 }
 
-std::string SpeechToText::transcribe(std::span<float> waveform,
-                                     std::string languageOption) const {
+std::vector<char> SpeechToText::transcribe(std::span<float> waveform,
+                                           std::string languageOption) const {
   std::vector<Segment> segments =
       this->asr->transcribe(waveform, DecodingOptions(languageOption));
   std::string transcription;
@@ -60,7 +60,8 @@ std::string SpeechToText::transcribe(std::span<float> waveform,
       transcription += word.content;
     }
   }
-  return transcription;
+
+  return {transcription.begin(), transcription.end()};
 }
 
 size_t SpeechToText::getMemoryLowerBound() const noexcept {
@@ -74,16 +75,17 @@ void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
     throw std::runtime_error("Streaming is already in progress");
   }
 
-  auto nativeCallback = [this, callback](const std::string &committed,
-                                         const std::string &nonCommitted,
-                                         bool isDone) {
-    this->callInvoker->invokeAsync(
-        [callback, committed, nonCommitted, isDone](jsi::Runtime &rt) {
-          callback->call(rt, jsi::String::createFromUtf8(rt, committed),
-                         jsi::String::createFromUtf8(rt, nonCommitted),
-                         jsi::Value(isDone));
+  auto nativeCallback =
+      [this, callback](const std::vector<char> &committedVec,
+                       const std::vector<char> &nonCommittedVec, bool isDone) {
+        this->callInvoker->invokeAsync([callback, committedVec, nonCommittedVec,
+                                        isDone](jsi::Runtime &rt) {
+          callback->call(
+              rt, rnexecutorch::jsi_conversion::getJsiValue(committedVec, rt),
+              rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, rt),
+              jsi::Value(isDone));
         });
-  };
+      };
 
   this->isStreaming = true;
   while (this->isStreaming) {
@@ -94,12 +96,15 @@ void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
     }
     ProcessResult res =
         this->processor->processIter(DecodingOptions(languageOption));
-    nativeCallback(res.committed, res.nonCommitted, false);
+
+    nativeCallback({res.committed.begin(), res.committed.end()},
+                   {res.nonCommitted.begin(), res.nonCommitted.end()}, false);
     this->readyToProcess = false;
   }
 
   std::string committed = this->processor->finish();
-  nativeCallback(committed, "", true);
+
+  nativeCallback({committed.begin(), committed.end()}, {}, true);
 
   this->resetStreamState();
 }
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
@@ -1,6 +1,9 @@
 #pragma once
 
 #include "rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h"
+#include <span>
+#include <string>
+#include <vector>
 
 namespace rnexecutorch {
 
@@ -17,8 +20,8 @@ class SpeechToText {
   std::shared_ptr<OwningArrayBuffer> encode(std::span<float> waveform) const;
   std::shared_ptr<OwningArrayBuffer>
   decode(std::span<int32_t> tokens, std::span<float> encoderOutput) const;
-  std::string transcribe(std::span<float> waveform,
-                         std::string languageOption) const;
+  std::vector<char> transcribe(std::span<float> waveform,
+                               std::string languageOption) const;
 
   size_t getMemoryLowerBound() const noexcept;
 
diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
@@ -7,6 +7,11 @@ export class SpeechToTextModule {
 
   private modelConfig!: SpeechToTextModelConfig;
 
+  private textDecoder = new TextDecoder('utf-8', {
+    fatal: false,
+    ignoreBOM: true,
+  });
+
   public async load(
     model: SpeechToTextModelConfig,
     onDownloadProgressCallback: (progress: number) => void = () => {}
@@ -87,8 +92,11 @@ export class SpeechToTextModule {
       );
       waveform = new Float32Array(waveform);
     }
-
-    return this.nativeModule.transcribe(waveform, options.language || '');
+    const transcriptionBytes = await this.nativeModule.transcribe(
+      waveform,
+      options.language || ''
+    );
+    return this.textDecoder.decode(new Uint8Array(transcriptionBytes));
   }
 
   public async *stream(
@@ -109,8 +117,13 @@ export class SpeechToTextModule {
     (async () => {
       try {
         await this.nativeModule.stream(
-          (committed: string, nonCommitted: string, isDone: boolean) => {
-            queue.push({ committed, nonCommitted });
+          (committed: number[], nonCommitted: number[], isDone: boolean) => {
+            queue.push({
+              committed: this.textDecoder.decode(new Uint8Array(committed)),
+              nonCommitted: this.textDecoder.decode(
+                new Uint8Array(nonCommitted)
+              ),
+            });
             if (isDone) {
               finished = true;
             }