feat(computer-vision)!: TextToImage returns file URI instead of base64 (#1180)

msluszniak · web-flow · commit 7b592c294dc1 · 2026-05-25T11:40:30.000+02:00
## Description Closes #888. Moves PNG encoding for `TextToImage` from JS (`pngjs`) into the native side via the existing `image_processing::saveToTempFile` helper, mirroring how `StyleTransfer`'s `url` output mode already works. `TextToImageModule.forward` (and the `useTextToImage` `generate` hook) now resolves to a `file://` URI pointing to a PNG on disk instead of a base64-encoded payload. Also tightens the `StyleTransferModule.forward` JSDoc to document the `'pixelData'` / `'url'` output modes — that was the doc-correction half of #888. The `pngjs` dependency is no longer needed and is dropped from `packages/react-native-executorch/package.json` and the `apps/computer-vision` example. ### Introduces a breaking change? - [x] Yes - [ ] No `TextToImageModule.forward` / `useTextToImage.generate` now resolves to a `file://` URI instead of a base64-encoded PNG string. Callers should switch from `data:image/png;base64,${image}` to using the URI directly: ```tsx <Image source={{ uri: image }} /> ``` ### Type of change - [x] Bug fix (change which fixes an issue) - [ ] New feature (change which adds functionality) - [x] Documentation update (improves or adds clarity to existing documentation) - [ ] Other (chores, tests, code style improvements etc.) ### Tested on - [ ] iOS - [x] Android ### Testing instructions - Run the computer-vision example app -> Text to Image screen, generate an image with each supported model. Image should render correctly from the returned URI. - Verify interrupt still works mid-generation (returned URI string is empty). - Verify Style Transfer's `forward(..., 'url')` mode still returns a `file://` URI (no behavior change, doc-only). 
### Screenshots  ### Related issues Closes #888 ### Checklist - [x] I have performed a self-review of my code - [x] I have commented my code, particularly in hard-to-understand areas - [x] I have updated the documentation accordingly - [x] My changes generate no new warnings ### Additional notes The C++ integration tests covering `generate` are still `GTEST_SKIP`-ed pending the existing UNet emulator issue, but I refreshed them so they exercise the new URI return shape when re-enabled.
diff --git a/apps/computer-vision/app/text_to_image/index.tsx b/apps/computer-vision/app/text_to_image/index.tsx
@@ -143,7 +143,7 @@ export default function TextToImageScreen() {
             <Image
               style={styles.image}
               resizeMode="contain"
-              source={{ uri: `data:image/png;base64,${image}` }}
+              source={{ uri: image }}
             />
           ) : (
             <View style={styles.infoContainer}>
diff --git a/apps/computer-vision/package.json b/apps/computer-vision/package.json
@@ -45,7 +45,6 @@
   },
   "devDependencies": {
     "@babel/core": "^7.29.0",
-    "@types/pngjs": "^6.0.5",
     "@types/react": "~19.2.0",
     "@types/react-refresh": "^0",
     "babel-preset-expo": "~55.0.16",
diff --git a/docs/docs/03-hooks/02-computer-vision/useTextToImage.md b/docs/docs/03-hooks/02-computer-vision/useTextToImage.md
@@ -82,7 +82,8 @@ function App() {
   }
   //...
 
-  return <Image source={{ uri: `data:image/png;base64,${image}` }} />;
+  // `generate` returns a `file://` URI to the PNG saved on disk.
+  return <Image source={{ uri: image }} />;
 }
 ```
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.cpp
@@ -5,8 +5,10 @@
 #include <span>
 
 #include <executorch/extension/tensor/tensor.h>
+#include <opencv2/opencv.hpp>
 
 #include <rnexecutorch/Log.h>
+#include <rnexecutorch/data_processing/ImageProcessing.h>
 #include <rnexecutorch/models/text_to_image/Constants.h>
 
 #include <rnexecutorch/Error.h>
@@ -54,10 +56,9 @@ void TextToImage::setSeed(int32_t &seed) {
   seed = rd();
 }
 
-std::shared_ptr<OwningArrayBuffer>
-TextToImage::generate(std::string input, int32_t imageSize,
-                      size_t numInferenceSteps, int32_t seed,
-                      std::shared_ptr<jsi::Function> callback) {
+std::string TextToImage::generate(std::string input, int32_t imageSize,
+                                  size_t numInferenceSteps, int32_t seed,
+                                  std::shared_ptr<jsi::Function> callback) {
   std::scoped_lock lock(inference_mutex_);
   setImageSize(imageSize);
   setSeed(seed);
@@ -105,7 +106,7 @@ TextToImage::generate(std::string input, int32_t imageSize,
   }
   if (interrupted) {
     interrupted = false;
-    return std::make_shared<OwningArrayBuffer>(0);
+    return "";
   }
 
   for (auto &val : latents) {
@@ -116,18 +117,20 @@ TextToImage::generate(std::string input, int32_t imageSize,
   return postprocess(output);
 }
 
-std::shared_ptr<OwningArrayBuffer>
-TextToImage::postprocess(const std::vector<float> &output) const {
-  // Convert RGB to RGBA
-  int32_t imagePixelCount = imageSize * imageSize;
-  std::vector<uint8_t> outputRgba(imagePixelCount * 4);
-  for (int32_t i = 0; i < imagePixelCount; i++) {
-    outputRgba[i * 4 + 0] = output[i * 3 + 0];
-    outputRgba[i * 4 + 1] = output[i * 3 + 1];
-    outputRgba[i * 4 + 2] = output[i * 3 + 2];
-    outputRgba[i * 4 + 3] = 255;
+std::string TextToImage::postprocess(const std::vector<float> &output) const {
+  // Decoder output is HWC float RGB (values already in [0..255]). cv::imwrite
+  // expects a BGR matrix, so pack the channels in BGR order here.
+  cv::Mat bgr(imageSize, imageSize, CV_8UC3);
+  for (int32_t y = 0; y < imageSize; ++y) {
+    auto *row = bgr.ptr<cv::Vec3b>(y);
+    for (int32_t x = 0; x < imageSize; ++x) {
+      const int32_t idx = (y * imageSize + x) * 3;
+      row[x] = cv::Vec3b(static_cast<uint8_t>(output[idx + 2]),
+                         static_cast<uint8_t>(output[idx + 1]),
+                         static_cast<uint8_t>(output[idx + 0]));
+    }
   }
-  return std::make_shared<OwningArrayBuffer>(outputRgba);
+  return image_processing::saveToTempFile(bgr);
 }
 
 void TextToImage::interrupt() noexcept { interrupted = true; }
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.h
@@ -8,7 +8,6 @@
 #include <ReactCommon/CallInvoker.h>
 #include <jsi/jsi.h>
 
-#include <rnexecutorch/jsi/OwningArrayBuffer.h>
 #include <rnexecutorch/metaprogramming/ConstructorHelpers.h>
 
 #include <rnexecutorch/models/text_to_image/Decoder.h>
@@ -30,18 +29,17 @@ class TextToImage final {
                        int32_t schedulerNumTrainTimesteps,
                        int32_t schedulerStepsOffset,
                        std::shared_ptr<react::CallInvoker> callInvoker);
-  std::shared_ptr<OwningArrayBuffer>
-  generate(std::string input, int32_t imageSize, size_t numInferenceSteps,
-           int32_t seed, std::shared_ptr<jsi::Function> callback);
+  std::string generate(std::string input, int32_t imageSize,
+                       size_t numInferenceSteps, int32_t seed,
+                       std::shared_ptr<jsi::Function> callback);
   void interrupt() noexcept;
   size_t getMemoryLowerBound() const noexcept;
   void unload() noexcept;
 
 private:
   void setImageSize(int32_t imageSize);
   void setSeed(int32_t &seed);
-  std::shared_ptr<OwningArrayBuffer>
-  postprocess(const std::vector<float> &output) const;
+  std::string postprocess(const std::vector<float> &output) const;
 
   size_t memorySizeLowerBound;
   int32_t imageSize;
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextToImageTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextToImageTest.cpp
@@ -1,8 +1,10 @@
 #include "BaseModelTests.h"
 #include <gtest/gtest.h>
+#include <opencv2/opencv.hpp>
 #include <rnexecutorch/Error.h>
 #include <rnexecutorch/models/text_to_image/TextToImage.h>
 #include <string>
+#include <string_view>
 
 using namespace rnexecutorch;
 using namespace rnexecutorch::models::text_to_image;
@@ -111,7 +113,7 @@ TEST(TextToImageGenerateTests, ZeroStepsThrows) {
                RnExecutorchError);
 }
 
-TEST(TextToImageGenerateTests, GenerateReturnsNonNull) {
+TEST(TextToImageGenerateTests, GenerateReturnsFileUri) {
   // TODO: Investigate source of the issue
   GTEST_SKIP() << "Skipping TextToImage generation test in emulator "
                   "environment due to UNet forward call throwing error no. 1";
@@ -120,22 +122,8 @@ TEST(TextToImageGenerateTests, GenerateReturnsNonNull) {
                     kSchedulerNumTrainTimesteps, kSchedulerStepsOffset,
                     createMockCallInvoker());
   auto result = model.generate("a cat", 128, 1, 42, nullptr);
-  EXPECT_NE(result, nullptr);
-}
-
-TEST(TextToImageGenerateTests, GenerateReturnsCorrectSize) {
-  // TODO: Investigate source of the issue
-  GTEST_SKIP() << "Skipping TextToImage generation test in emulator "
-                  "environment due to UNet forward call throwing error no. 1";
-  TextToImage model(kValidTokenizerPath, kValidEncoderPath, kValidUnetPath,
-                    kValidDecoderPath, kSchedulerBetaStart, kSchedulerBetaEnd,
-                    kSchedulerNumTrainTimesteps, kSchedulerStepsOffset,
-                    createMockCallInvoker());
-  int32_t imageSize = 128;
-  auto result = model.generate("a cat", imageSize, 1, 42, nullptr);
-  ASSERT_NE(result, nullptr);
-  size_t expectedSize = imageSize * imageSize * 4;
-  EXPECT_EQ(result->size(), expectedSize);
+  EXPECT_FALSE(result.empty());
+  EXPECT_TRUE(result.starts_with("file://"));
 }
 
 TEST(TextToImageGenerateTests, SameSeedProducesSameResult) {
@@ -146,15 +134,20 @@ TEST(TextToImageGenerateTests, SameSeedProducesSameResult) {
                     kValidDecoderPath, kSchedulerBetaStart, kSchedulerBetaEnd,
                     kSchedulerNumTrainTimesteps, kSchedulerStepsOffset,
                     createMockCallInvoker());
-  auto result1 = model.generate("a cat", 128, 1, 42, nullptr);
-  auto result2 = model.generate("a cat", 128, 1, 42, nullptr);
-  ASSERT_NE(result1, nullptr);
-  ASSERT_NE(result2, nullptr);
-  ASSERT_EQ(result1->size(), result2->size());
-
-  auto data1 = static_cast<uint8_t *>(result1->data());
-  auto data2 = static_cast<uint8_t *>(result2->data());
-  for (size_t i = 0; i < result1->size(); i++) {
-    EXPECT_EQ(data1[i], data2[i]) << "at index: " << i;
-  }
+  auto path1 = model.generate("a cat", 128, 1, 42, nullptr);
+  auto path2 = model.generate("a cat", 128, 1, 42, nullptr);
+  ASSERT_FALSE(path1.empty());
+  ASSERT_FALSE(path2.empty());
+
+  const std::string kScheme = "file://";
+  auto stripScheme = [&kScheme](const std::string &uri) {
+    return uri.starts_with(kScheme) ? uri.substr(kScheme.size()) : uri;
+  };
+  cv::Mat img1 = cv::imread(stripScheme(path1), cv::IMREAD_UNCHANGED);
+  cv::Mat img2 = cv::imread(stripScheme(path2), cv::IMREAD_UNCHANGED);
+  ASSERT_FALSE(img1.empty());
+  ASSERT_FALSE(img2.empty());
+  ASSERT_EQ(img1.size(), img2.size());
+  ASSERT_EQ(img1.type(), img2.type());
+  EXPECT_EQ(cv::countNonZero(img1 != img2), 0);
 }
diff --git a/packages/react-native-executorch/package.json b/packages/react-native-executorch/package.json
@@ -124,7 +124,6 @@
     "@huggingface/jinja": "^0.5.0",
     "jsonrepair": "^3.12.0",
     "jsonschema": "^1.5.0",
-    "pngjs": "^7.0.0",
     "zod": "^4.3.6"
   }
 }
diff --git a/packages/react-native-executorch/src/modules/computer_vision/StyleTransferModule.ts b/packages/react-native-executorch/src/modules/computer_vision/StyleTransferModule.ts
@@ -64,6 +64,16 @@ export class StyleTransferModule extends VisionModule<PixelData | string> {
     );
   }
 
+  /**
+   * Executes style transfer on the provided image.
+   * @param input - Image source (string path/URI or `PixelData` from a frame library).
+   * @param outputType - Controls the output format. Defaults to `'pixelData'`, which
+   *   returns raw RGBA pixels suitable for direct rendering. Pass `'url'` to
+   *   have the stylized image saved to a temporary PNG on the device and
+   *   receive a `file://` URI string instead.
+   * @returns A Promise resolving to either a `PixelData` object or a `file://` URI string,
+   *   depending on `outputType`.
+   */
   async forward<O extends 'pixelData' | 'url' = 'pixelData'>(
     input: string | PixelData,
     outputType?: O
diff --git a/packages/react-native-executorch/src/modules/computer_vision/TextToImageModule.ts b/packages/react-native-executorch/src/modules/computer_vision/TextToImageModule.ts
@@ -3,7 +3,6 @@ import { ResourceSource } from '../../types/common';
 import { TextToImageModelName } from '../../types/tti';
 import { BaseModule } from '../BaseModule';
 
-import { PNG } from 'pngjs/browser';
 import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
 import { parseUnknownError, RnExecutorchError } from '../../errors/errorUtils';
 import { Logger } from '../../common/Logger';
@@ -147,40 +146,27 @@ export class TextToImageModule extends BaseModule {
 
   /**
    * Runs the model to generate an image described by `input`, and conditioned by `seed`, performing `numSteps` inference steps.
-   * The resulting image, with dimensions `imageSize`×`imageSize` pixels, is returned as a base64-encoded string.
+   * The resulting image, with dimensions `imageSize`×`imageSize` pixels, is saved as a PNG on the device and returned as a `file://` URI.
+   * If generation is interrupted before completion, an empty string is returned.
    * @param input - The text prompt to generate the image from.
    * @param imageSize - The desired width and height of the output image in pixels.
    * @param numSteps - The number of inference steps to perform.
    * @param seed - An optional seed for random number generation to ensure reproducibility.
-   * @returns A Base64-encoded string representing the generated PNG image.
+   * @returns A `file://` URI pointing to the generated PNG, or an empty string if generation was interrupted.
    */
   async forward(
     input: string,
     imageSize: number = 512,
     numSteps: number = 5,
     seed?: number
   ): Promise<string> {
-    const output = await this.nativeModule.generate(
+    return await this.nativeModule.generate(
       input,
       imageSize,
       numSteps,
       seed ? seed : -1,
       this.inferenceCallback
     );
-    const outputArray = new Uint8Array(output);
-    if (!outputArray.length) {
-      return '';
-    }
-    const png = new PNG({ width: imageSize, height: imageSize });
-    png.data = outputArray as unknown as Buffer;
-    const pngBuffer = PNG.sync.write(png, { colorType: 6 });
-    const pngArray = new Uint8Array(pngBuffer as unknown as ArrayBufferLike);
-    let binary = '';
-    const chunkSize = 8192;
-    for (let i = 0; i < pngArray.length; i += chunkSize) {
-      binary += String.fromCharCode(...pngArray.subarray(i, i + chunkSize));
-    }
-    return btoa(binary);
   }
 
   /**
diff --git a/packages/react-native-executorch/src/types/tti.ts b/packages/react-native-executorch/src/types/tti.ts
@@ -81,7 +81,7 @@ export interface TextToImageType {
    * @param [imageSize] - Optional. The target width and height of the generated image (e.g., 512 for 512x512). Defaults to the model's standard size if omitted.
    * @param [numSteps] - Optional. The number of denoising steps for the diffusion process. More steps generally yield higher quality at the cost of generation time.
    * @param [seed] - Optional. A random seed for reproducible generation. Should be a positive integer.
-   * @returns A Promise that resolves to a string representing the generated image (e.g., base64 string or file URI).
+   * @returns A Promise that resolves to a `file://` URI pointing to the generated PNG on the device, or an empty string if generation was interrupted.
    * @throws {RnExecutorchError} If the model is not loaded or is currently generating another image.
    */
   generate: (
diff --git a/yarn.lock b/yarn.lock
@@ -5986,15 +5986,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"@types/pngjs@npm:^6.0.5":
-  version: 6.0.5
-  resolution: "@types/pngjs@npm:6.0.5"
-  dependencies:
-    "@types/node": "npm:*"
-  checksum: 10/132fce25817d47a784ed48aa678332521b0f7e6edbaa76f3fa4e9ca1228078788ae712f99ad4d1a324d9ba0b14829958677eabf3ebef1fb6e120816f433f0cd8
-  languageName: node
-  linkType: hard
-
 "@types/react-refresh@npm:^0":
   version: 0.14.7
   resolution: "@types/react-refresh@npm:0.14.7"
@@ -7918,7 +7909,6 @@ __metadata:
     "@react-navigation/drawer": "npm:^7.9.4"
     "@react-navigation/native": "npm:^7.2.2"
     "@shopify/react-native-skia": "npm:2.6.2"
-    "@types/pngjs": "npm:^6.0.5"
     "@types/react": "npm:~19.2.0"
     "@types/react-refresh": "npm:^0"
     babel-preset-expo: "npm:~55.0.16"
@@ -14965,13 +14955,6 @@ __metadata:
   languageName: node
   linkType: hard
 
-"pngjs@npm:^7.0.0":
-  version: 7.0.0
-  resolution: "pngjs@npm:7.0.0"
-  checksum: 10/e843ebbb0df092ee0f3a3e7dbd91ff87a239a4e4c4198fff202916bfb33b67622f4b83b3c29f3ccae94fcb97180c289df06068624554f61686fe6b9a4811f7db
-  languageName: node
-  linkType: hard
-
 "possible-typed-array-names@npm:^1.0.0":
   version: 1.1.0
   resolution: "possible-typed-array-names@npm:1.1.0"
@@ -15468,7 +15451,6 @@ __metadata:
     jsonrepair: "npm:^3.12.0"
     jsonschema: "npm:^1.5.0"
     metro-react-native-babel-preset: "npm:^0.77.0"
-    pngjs: "npm:^7.0.0"
     react: "npm:19.1.0"
     react-native: "npm:0.81.5"
     react-native-builder-bob: "npm:^0.40.12"

Original file line number	Diff line number	Diff line change
`@@ -82,7 +82,8 @@ function App() {`
`82`	`82`	`}`
`83`	`83`	`//...`
`84`	`84`
`85`		- return <Image source={{ uri: `data:image/png;base64,${image}` }} />;
	`85`	+ // `generate` returns a `file://` URI to the PNG saved on disk.
	`86`	`+ return <Image source={{ uri: image }} />;`
`86`	`87`	`}`
`87`	`88`	```
`88`	`89`
Original file line number	Diff line number	Diff line change
`@@ -124,7 +124,6 @@`
`124`	`124`	`"@huggingface/jinja": "^0.5.0",`
`125`	`125`	`"jsonrepair": "^3.12.0",`
`126`	`126`	`"jsonschema": "^1.5.0",`
`127`		`- "pngjs": "^7.0.0",`
`128`	`127`	`"zod": "^4.3.6"`
`129`	`128`	`}`
`130`	`129`	`}`