Skip to content

Commit b62201a

Browse files
chmjkb and claude committed
chore: migrate SpeechToTextModule to factory pattern, add SpeechToTextModelName type
- Add SpeechToTextModelName union type
- Add modelName to SpeechToTextModelConfig
- SpeechToTextModule: private constructor, fromModelName, fromCustomModel
- useSpeechToText: use factory, add model.modelName to deps

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 19e9a17 commit b62201a

3 files changed

Lines changed: 176 additions & 67 deletions

File tree

packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts

Lines changed: 71 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -24,41 +24,53 @@ export const useSpeechToText = ({
2424
const [isReady, setIsReady] = useState(false);
2525
const [isGenerating, setIsGenerating] = useState(false);
2626
const [downloadProgress, setDownloadProgress] = useState(0);
27-
28-
const [moduleInstance, _] = useState(() => new SpeechToTextModule());
27+
const [moduleInstance, setModuleInstance] =
28+
useState<SpeechToTextModule | null>(null);
2929

3030
useEffect(() => {
3131
if (preventLoad) return;
32-
let isMounted = true;
3332

34-
(async () => {
35-
setDownloadProgress(0);
36-
setError(null);
37-
try {
38-
setIsReady(false);
39-
await moduleInstance.load(
40-
{
41-
isMultilingual: model.isMultilingual,
42-
encoderSource: model.encoderSource,
43-
decoderSource: model.decoderSource,
44-
tokenizerSource: model.tokenizerSource,
45-
},
46-
(progress) => {
47-
if (isMounted) setDownloadProgress(progress);
48-
}
49-
);
50-
if (isMounted) setIsReady(true);
51-
} catch (err) {
52-
if (isMounted) setError(parseUnknownError(err));
33+
let active = true;
34+
setDownloadProgress(0);
35+
setError(null);
36+
setIsReady(false);
37+
38+
SpeechToTextModule.fromModelName(
39+
{
40+
modelName: model.modelName,
41+
isMultilingual: model.isMultilingual,
42+
encoderSource: model.encoderSource,
43+
decoderSource: model.decoderSource,
44+
tokenizerSource: model.tokenizerSource,
45+
},
46+
(p) => {
47+
if (active) setDownloadProgress(p);
5348
}
54-
})();
49+
)
50+
.then((mod) => {
51+
if (!active) {
52+
mod.delete();
53+
return;
54+
}
55+
setModuleInstance((prev) => {
56+
prev?.delete();
57+
return mod;
58+
});
59+
setIsReady(true);
60+
})
61+
.catch((err) => {
62+
if (active) setError(parseUnknownError(err));
63+
});
5564

5665
return () => {
57-
isMounted = false;
58-
moduleInstance.delete();
66+
active = false;
67+
setModuleInstance((prev) => {
68+
prev?.delete();
69+
return null;
70+
});
5971
};
6072
}, [
61-
moduleInstance,
73+
model.modelName,
6274
model.isMultilingual,
6375
model.encoderSource,
6476
model.decoderSource,
@@ -71,7 +83,7 @@ export const useSpeechToText = ({
7183
waveform: Float32Array,
7284
options: DecodingOptions = {}
7385
): Promise<TranscriptionResult> => {
74-
if (!isReady) {
86+
if (!isReady || !moduleInstance) {
7587
throw new RnExecutorchError(
7688
RnExecutorchErrorCode.ModuleNotLoaded,
7789
'The model is currently not loaded. Please load the model before calling this function.'
@@ -103,7 +115,7 @@ export const useSpeechToText = ({
103115
void,
104116
unknown
105117
> {
106-
if (!isReady) {
118+
if (!isReady || !moduleInstance) {
107119
throw new RnExecutorchError(
108120
RnExecutorchErrorCode.ModuleNotLoaded,
109121
'The model is currently not loaded. Please load the model before calling this function.'
@@ -131,17 +143,44 @@ export const useSpeechToText = ({
131143

132144
const streamInsert = useCallback(
133145
(waveform: Float32Array) => {
134-
if (!isReady) return;
146+
if (!isReady || !moduleInstance) return;
135147
moduleInstance.streamInsert(waveform);
136148
},
137149
[isReady, moduleInstance]
138150
);
139151

140152
const streamStop = useCallback(() => {
141-
if (!isReady) return;
153+
if (!isReady || !moduleInstance) return;
142154
moduleInstance.streamStop();
143155
}, [isReady, moduleInstance]);
144156

157+
const encode = useCallback(
158+
(waveform: Float32Array): Promise<Float32Array> => {
159+
if (!moduleInstance)
160+
throw new RnExecutorchError(
161+
RnExecutorchErrorCode.ModuleNotLoaded,
162+
'The model is currently not loaded. Please load the model before calling this function.'
163+
);
164+
return moduleInstance.encode(waveform);
165+
},
166+
[moduleInstance]
167+
);
168+
169+
const decode = useCallback(
170+
(
171+
tokens: Int32Array,
172+
encoderOutput: Float32Array
173+
): Promise<Float32Array> => {
174+
if (!moduleInstance)
175+
throw new RnExecutorchError(
176+
RnExecutorchErrorCode.ModuleNotLoaded,
177+
'The model is currently not loaded. Please load the model before calling this function.'
178+
);
179+
return moduleInstance.decode(tokens, encoderOutput);
180+
},
181+
[moduleInstance]
182+
);
183+
145184
return {
146185
error,
147186
isReady,
@@ -151,7 +190,7 @@ export const useSpeechToText = ({
151190
stream,
152191
streamInsert,
153192
streamStop,
154-
encode: moduleInstance.encode.bind(moduleInstance),
155-
decode: moduleInstance.decode.bind(moduleInstance),
193+
encode,
194+
decode,
156195
};
157196
};

packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts

Lines changed: 85 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import {
22
DecodingOptions,
33
SpeechToTextModelConfig,
4+
SpeechToTextModelName,
45
TranscriptionResult,
56
} from '../../types/stt';
67
import { ResourceFetcher } from '../../utils/ResourceFetcher';
8+
import { ResourceSource } from '../../types/common';
79
import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
810
import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
911
import { Logger } from '../../common/Logger';
@@ -17,50 +19,98 @@ export class SpeechToTextModule {
1719
private nativeModule: any;
1820
private modelConfig!: SpeechToTextModelConfig;
1921

22+
private constructor() {}
23+
2024
/**
21-
* Loads the model specified by the config object.
22-
* `onDownloadProgressCallback` allows you to monitor the current progress of the model download.
25+
* Creates a Speech to Text instance for a built-in model.
26+
*
27+
* @param namedSources - Configuration object containing model name, sources, and multilingual flag.
28+
* @param onDownloadProgress - Optional callback to monitor download progress, receiving a value between 0 and 1.
29+
* @returns A Promise resolving to a `SpeechToTextModule` instance.
2330
*
24-
* @param model - Configuration object containing model sources.
25-
* @param onDownloadProgressCallback - Optional callback to monitor download progress.
31+
* @example
32+
* ```ts
33+
* import { SpeechToTextModule, WHISPER_TINY_EN } from 'react-native-executorch';
34+
* const stt = await SpeechToTextModule.fromModelName(WHISPER_TINY_EN);
35+
* ```
2636
*/
27-
public async load(
28-
model: SpeechToTextModelConfig,
29-
onDownloadProgressCallback: (progress: number) => void = () => {}
30-
) {
37+
static async fromModelName(
38+
namedSources: SpeechToTextModelConfig,
39+
onDownloadProgress: (progress: number) => void = () => {}
40+
): Promise<SpeechToTextModule> {
41+
const instance = new SpeechToTextModule();
3142
try {
32-
this.modelConfig = model;
43+
await instance.internalLoad(namedSources, onDownloadProgress);
44+
return instance;
45+
} catch (error) {
46+
Logger.error('Load failed:', error);
47+
throw parseUnknownError(error);
48+
}
49+
}
3350

34-
const tokenizerLoadPromise = ResourceFetcher.fetch(
35-
undefined,
36-
model.tokenizerSource
37-
);
38-
const encoderDecoderPromise = ResourceFetcher.fetch(
39-
onDownloadProgressCallback,
40-
model.encoderSource,
41-
model.decoderSource
42-
);
43-
const [tokenizerSources, encoderDecoderResults] = await Promise.all([
44-
tokenizerLoadPromise,
45-
encoderDecoderPromise,
46-
]);
47-
const encoderSource = encoderDecoderResults?.[0];
48-
const decoderSource = encoderDecoderResults?.[1];
49-
if (!encoderSource || !decoderSource || !tokenizerSources) {
50-
throw new RnExecutorchError(
51-
RnExecutorchErrorCode.DownloadInterrupted,
52-
'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.'
53-
);
54-
}
55-
this.nativeModule = await global.loadSpeechToText(
51+
/**
52+
* Creates a Speech to Text instance with user-provided model binaries.
53+
* Use this when working with a custom-exported STT model.
54+
* Internally uses `'custom'` as the model name for telemetry.
55+
*
56+
* @param encoderSource - A fetchable resource pointing to the encoder model binary.
57+
* @param decoderSource - A fetchable resource pointing to the decoder model binary.
58+
* @param tokenizerSource - A fetchable resource pointing to the tokenizer file.
59+
* @param isMultilingual - Whether the model supports multiple languages.
60+
* @param onDownloadProgress - Optional callback to monitor download progress, receiving a value between 0 and 1.
61+
* @returns A Promise resolving to a `SpeechToTextModule` instance.
62+
*/
63+
static fromCustomModel(
64+
encoderSource: ResourceSource,
65+
decoderSource: ResourceSource,
66+
tokenizerSource: ResourceSource,
67+
isMultilingual: boolean,
68+
onDownloadProgress: (progress: number) => void = () => {}
69+
): Promise<SpeechToTextModule> {
70+
return SpeechToTextModule.fromModelName(
71+
{
72+
modelName: 'custom' as SpeechToTextModelName,
5673
encoderSource,
5774
decoderSource,
58-
tokenizerSources[0]!
75+
tokenizerSource,
76+
isMultilingual,
77+
},
78+
onDownloadProgress
79+
);
80+
}
81+
82+
private async internalLoad(
83+
model: SpeechToTextModelConfig,
84+
onDownloadProgressCallback: (progress: number) => void = () => {}
85+
) {
86+
this.modelConfig = model;
87+
88+
const tokenizerLoadPromise = ResourceFetcher.fetch(
89+
undefined,
90+
model.tokenizerSource
91+
);
92+
const encoderDecoderPromise = ResourceFetcher.fetch(
93+
onDownloadProgressCallback,
94+
model.encoderSource,
95+
model.decoderSource
96+
);
97+
const [tokenizerSources, encoderDecoderResults] = await Promise.all([
98+
tokenizerLoadPromise,
99+
encoderDecoderPromise,
100+
]);
101+
const encoderSource = encoderDecoderResults?.[0];
102+
const decoderSource = encoderDecoderResults?.[1];
103+
if (!encoderSource || !decoderSource || !tokenizerSources) {
104+
throw new RnExecutorchError(
105+
RnExecutorchErrorCode.DownloadInterrupted,
106+
'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.'
59107
);
60-
} catch (error) {
61-
Logger.error('Load failed:', error);
62-
throw parseUnknownError(error);
63108
}
109+
this.nativeModule = await global.loadSpeechToText(
110+
encoderSource,
111+
decoderSource,
112+
tokenizerSources[0]!
113+
);
64114
}
65115

66116
/**

packages/react-native-executorch/src/types/stt.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,20 @@
11
import { ResourceSource } from './common';
22
import { RnExecutorchError } from '../errors/errorUtils';
33

4+
/**
5+
* Union of all built-in Speech-to-Text model names.
6+
*
7+
* @category Types
8+
*/
9+
export type SpeechToTextModelName =
10+
| 'whisper-tiny-en'
11+
| 'whisper-tiny-en-quantized'
12+
| 'whisper-base-en'
13+
| 'whisper-small-en'
14+
| 'whisper-tiny'
15+
| 'whisper-base'
16+
| 'whisper-small';
17+
418
/**
519
* Configuration for Speech to Text model.
620
*
@@ -261,6 +275,12 @@ export interface TranscriptionResult {
261275
* @category Types
262276
*/
263277
export interface SpeechToTextModelConfig {
278+
/**
279+
* The built-in model name (e.g. `'whisper-tiny-en'`). Used for telemetry and hook reload triggers.
280+
* Pass one of the pre-built STT constants (e.g. `WHISPER_TINY_EN`) to populate all required fields.
281+
*/
282+
modelName: SpeechToTextModelName;
283+
264284
/**
265285
* A boolean flag indicating whether the model supports multiple languages.
266286
*/

0 commit comments

Comments (0)