feat: use TensorPtrish type for Pixel data input

NorbertKlockiewicz · NorbertKlockiewicz · commit 933b63bf53eb · 2026-02-19T17:32:03.000+01:00
diff --git a/apps/computer-vision/app/object_detection/index.tsx b/apps/computer-vision/app/object_detection/index.tsx
@@ -4,69 +4,15 @@ import {
   Detection,
   useObjectDetection,
   SSDLITE_320_MOBILENET_V3_LARGE,
+  ScalarType,
+  PixelData,
 } from 'react-native-executorch';
 import { View, StyleSheet, Image, TouchableOpacity, Text } from 'react-native';
 import ImageWithBboxes from '../../components/ImageWithBboxes';
 import React, { useContext, useEffect, useState } from 'react';
 import { GeneratingContext } from '../../context';
 import ScreenWrapper from '../../ScreenWrapper';
 import ColorPalette from '../../colors';
-import { Images } from 'react-native-nitro-image';
-
-// Helper function to convert BGRA to RGB
-function convertBGRAtoRGB(
-  buffer: ArrayBuffer,
-  width: number,
-  height: number
-): ArrayBuffer {
-  const source = new Uint8Array(buffer);
-  const rgb = new Uint8Array(width * height * 3);
-
-  for (let i = 0; i < width * height; i++) {
-    // BGRA format: [B, G, R, A] → RGB: [R, G, B]
-    rgb[i * 3 + 0] = source[i * 4 + 2]; // R
-    rgb[i * 3 + 1] = source[i * 4 + 1]; // G
-    rgb[i * 3 + 2] = source[i * 4 + 0]; // B
-  }
-
-  return rgb.buffer;
-}
-
-// Helper function to convert image URI to raw RGB pixel data
-async function imageUriToPixelData(
-  uri: string,
-  targetWidth: number,
-  targetHeight: number
-): Promise<{
-  data: ArrayBuffer;
-  width: number;
-  height: number;
-  channels: number;
-}> {
-  try {
-    // Load image and resize to target dimensions
-    const image = await Images.loadFromFileAsync(uri);
-    const resized = image.resize(targetWidth, targetHeight);
-
-    // Get pixel data as ArrayBuffer (BGRA format from NitroImage)
-    const rawPixelData = resized.toRawPixelData();
-    const buffer =
-      rawPixelData instanceof ArrayBuffer ? rawPixelData : rawPixelData.buffer;
-
-    // Convert BGRA to RGB as required by the native API
-    const rgbBuffer = convertBGRAtoRGB(buffer, targetWidth, targetHeight);
-
-    return {
-      data: rgbBuffer,
-      width: targetWidth,
-      height: targetHeight,
-      channels: 3, // RGB
-    };
-  } catch (error) {
-    console.error('Error loading image with NitroImage:', error);
-    throw error;
-  }
-}
 
 export default function ObjectDetectionScreen() {
   const [imageUri, setImageUri] = useState('');
@@ -109,30 +55,45 @@ export default function ObjectDetectionScreen() {
   };
 
   const runForwardPixels = async () => {
-    if (imageUri && imageDimensions) {
-      try {
-        console.log('Converting image to pixel data...');
-        // Use original dimensions - let the model resize internally
-        const pixelData = await imageUriToPixelData(
-          imageUri,
-          imageDimensions.width,
-          imageDimensions.height
-        );
-
-        console.log('Running forward with pixel data...', {
-          width: pixelData.width,
-          height: pixelData.height,
-          channels: pixelData.channels,
-          dataSize: pixelData.data.byteLength,
-        });
-
-        // Run inference using unified forward() API
-        const output = await ssdLite.forward(pixelData, 0.3);
-        console.log('Pixel data result:', output.length, 'detections');
-        setResults(output);
-      } catch (e) {
-        console.error('Error in runForwardPixels:', e);
+    try {
+      console.log('Testing with hardcoded pixel data...');
+
+      // Create a simple 320x320 test image (all zeros - black image)
+      // In a real scenario, you would load actual image pixel data here
+      const width = 320;
+      const height = 320;
+      const channels = 3; // RGB
+
+      // Create a black image (you can replace this with actual pixel data)
+      const rgbData = new Uint8Array(width * height * channels);
+
+      // Draw a white 120x120 square in the center as a test pattern
+      for (let y = 100; y < 220; y++) {
+        for (let x = 100; x < 220; x++) {
+          const idx = (y * width + x) * 3;
+          rgbData[idx + 0] = 255; // R
+          rgbData[idx + 1] = 255; // G
+          rgbData[idx + 2] = 255; // B
+        }
       }
+
+      const pixelData: PixelData = {
+        dataPtr: rgbData,
+        sizes: [height, width, channels],
+        scalarType: ScalarType.BYTE,
+      };
+
+      console.log('Running forward with hardcoded pixel data...', {
+        sizes: pixelData.sizes,
+        dataSize: pixelData.dataPtr.byteLength,
+      });
+
+      // Run inference using unified forward() API
+      const output = await ssdLite.forward(pixelData, 0.3);
+      console.log('Pixel data result:', output.length, 'detections');
+      setResults(output);
+    } catch (e) {
+      console.error('Error in runForwardPixels:', e);
     }
   };
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/VisionModel.cpp b/packages/react-native-executorch/common/rnexecutorch/models/VisionModel.cpp
@@ -2,8 +2,6 @@
 #include <rnexecutorch/Error.h>
 #include <rnexecutorch/ErrorCodes.h>
 #include <rnexecutorch/Log.h>
-#include <rnexecutorch/host_objects/JSTensorViewIn.h>
-#include <rnexecutorch/host_objects/JsiConversions.h>
 #include <rnexecutorch/utils/FrameProcessor.h>
 
 namespace rnexecutorch {
@@ -21,12 +19,7 @@ cv::Mat VisionModel::extractFromFrame(jsi::Runtime &runtime,
   return preprocessFrame(frame);
 }
 
-cv::Mat VisionModel::extractFromPixels(jsi::Runtime &runtime,
-                                       const jsi::Object &pixelData) const {
-  // PixelData follows TensorPtr structure (dataPtr, sizes, scalarType)
-  // Use JSI conversion helper to extract the data
-  auto tensorView = jsi::fromHostObject<JSTensorViewIn>(runtime, pixelData);
-
+cv::Mat VisionModel::extractFromPixels(const JSTensorViewIn &tensorView) const {
   // Validate dimensions: sizes must be [height, width, channels]
   if (tensorView.sizes.size() != 3) {
     char errorMessage[100];
@@ -59,11 +52,11 @@ cv::Mat VisionModel::extractFromPixels(jsi::Runtime &runtime,
   }
 
   // Create cv::Mat directly from dataPtr (zero-copy view)
+  // The Mat borrows dataPtr (no copy); valid only while the caller's buffer is alive
   uint8_t *dataPtr = static_cast<uint8_t *>(tensorView.dataPtr);
   cv::Mat image(height, width, CV_8UC3, dataPtr);
 
-  // Clone to own the data, since JS memory may be GC'd
-  return image.clone();
+  return image;
 }
 
 } // namespace models
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/VisionModel.h b/packages/react-native-executorch/common/rnexecutorch/models/VisionModel.h
@@ -129,40 +129,35 @@ class VisionModel : public BaseModel {
                            const jsi::Value &frameData) const;
 
   /**
-   * @brief Extract cv::Mat from raw pixel data (ArrayBuffer) sent from
+   * @brief Extract cv::Mat from raw pixel data (TensorPtr) sent from
    * JavaScript
    *
    * This method enables users to run inference on raw pixel data without file
    * I/O. Useful for processing images already in memory (e.g., from canvas,
    * image library).
    *
-   * @param runtime JSI runtime
-   * @param pixelData JSI object containing:
-   *                  - data: ArrayBuffer with raw pixel values
-   *                  - width: number - image width
-   *                  - height: number - image height
-   *                  - channels: number - number of channels (3 for RGB, 4 for
-   * RGBA)
+   * @param tensorView JSTensorViewIn containing:
+   *                   - dataPtr: Pointer to raw pixel values (RGB format)
+   *                   - sizes: [height, width, channels] - must be 3D
+   *                   - scalarType: Must be ScalarType::Byte (Uint8Array)
    *
    * @return cv::Mat containing the pixel data
    *
-   * @throws std::runtime_error if pixelData format is invalid
+   * @throws RnExecutorchError if tensorView format is invalid
    *
-   * @note The returned cv::Mat owns a copy of the data
+   * @note The returned cv::Mat is a non-owning view; it must not outlive tensorView
-   * @note Expected pixel format: RGB or RGBA, row-major order
+   * @note Expected pixel format: RGB (3 channels), row-major order
    * @note Typical usage from JS:
    * @code
-   *   const pixels = new Uint8Array([...]);  // Raw pixel data
+   *   const pixels = new Uint8Array([...]);  // Raw RGB pixel data
    *   const result = model.generateFromPixels({
-   *     data: pixels.buffer,
-   *     width: 640,
-   *     height: 480,
-   *     channels: 3
+   *     dataPtr: pixels,
+   *     sizes: [480, 640, 3],
+   *     scalarType: ScalarType.BYTE
    *   }, 0.5);
    * @endcode
    */
-  cv::Mat extractFromPixels(jsi::Runtime &runtime,
-                            const jsi::Object &pixelData) const;
+  cv::Mat extractFromPixels(const JSTensorViewIn &tensorView) const;
 };
 
 } // namespace models
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/object_detection/ObjectDetection.cpp b/packages/react-native-executorch/common/rnexecutorch/models/object_detection/ObjectDetection.cpp
@@ -4,6 +4,7 @@
 #include <rnexecutorch/ErrorCodes.h>
 #include <rnexecutorch/Log.h>
 #include <rnexecutorch/data_processing/ImageProcessing.h>
+#include <rnexecutorch/host_objects/JsiConversions.h>
 #include <rnexecutorch/utils/FrameProcessor.h>
 
 namespace rnexecutorch::models::object_detection {
@@ -176,9 +177,12 @@ std::vector<types::Detection>
 ObjectDetection::generateFromPixels(jsi::Runtime &runtime,
                                     const jsi::Value &pixelData,
                                     double detectionThreshold) {
-  // Extract raw pixel data from JavaScript
-  auto pixelObj = pixelData.asObject(runtime);
-  cv::Mat image = extractFromPixels(runtime, pixelObj);
+  // Convert JSI value to JSTensorViewIn
+  auto tensorView =
+      jsi_conversion::getValue<JSTensorViewIn>(pixelData, runtime);
+
+  // Extract raw pixel data to cv::Mat
+  cv::Mat image = extractFromPixels(tensorView);
 
   // Use the internal helper - it handles locking, preprocessing, and inference
   return runInference(image, detectionThreshold);
diff --git a/packages/react-native-executorch/common/rnexecutorch/utils/FrameExtractor.cpp b/packages/react-native-executorch/common/rnexecutorch/utils/FrameExtractor.cpp
@@ -86,7 +86,7 @@ cv::Mat FrameExtractor::extractFromAHardwareBuffer(void *hardwareBuffer) {
       buffer, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN, -1, nullptr, &data);
 
   if (lockResult != 0) {
-    throw RnExecutorchError(RnExecutorchErrorCode::AccessFailed,
+    throw RnExecutorchError(RnExecutorchErrorCode::UnknownError,
                             "Failed to lock AHardwareBuffer");
   }
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/utils/FrameProcessor.cpp b/packages/react-native-executorch/common/rnexecutorch/utils/FrameProcessor.cpp
@@ -9,13 +9,8 @@ namespace utils {
 
 cv::Mat FrameProcessor::extractFrame(jsi::Runtime &runtime,
                                      const jsi::Object &frameData) {
-  // Get frame dimensions
-  int width =
-      static_cast<int>(frameData.getProperty(runtime, "width").asNumber());
-  int height =
-      static_cast<int>(frameData.getProperty(runtime, "height").asNumber());
-
   // Try zero-copy path first (nativeBuffer)
+  // Native buffer contains dimensions, so we don't need width/height properties
   if (hasNativeBuffer(runtime, frameData)) {
     try {
       return extractFromNativeBuffer(runtime, frameData);
@@ -25,7 +20,12 @@ cv::Mat FrameProcessor::extractFrame(jsi::Runtime &runtime,
   }
 
   // Fallback to ArrayBuffer path (with copy)
+  // Get frame dimensions for ArrayBuffer path
   if (frameData.hasProperty(runtime, "data")) {
+    int width =
+        static_cast<int>(frameData.getProperty(runtime, "width").asNumber());
+    int height =
+        static_cast<int>(frameData.getProperty(runtime, "height").asNumber());
     return extractFromArrayBuffer(runtime, frameData, width, height);
   }
 
diff --git a/packages/react-native-executorch/src/modules/computer_vision/VisionModule.ts b/packages/react-native-executorch/src/modules/computer_vision/VisionModule.ts
@@ -73,7 +73,7 @@ export abstract class VisionModule<TOutput> extends BaseModule {
    * 1. **String path/URI**: File path, URL, or Base64-encoded string
    * 2. **PixelData**: Raw pixel data from image libraries (e.g., NitroImage)
    *
-   * **Note**: For VisionCamera frame processing, use `forwardSync` instead.
+   * **Note**: For VisionCamera frame processing, use `runOnFrame` instead.
    * This method is async and cannot be called in worklet context.
    *
    * @param input - Image source (string path or PixelData object)
diff --git a/packages/react-native-executorch/src/types/common.ts b/packages/react-native-executorch/src/types/common.ts
@@ -191,5 +191,5 @@ export interface Frame {
    *
    * Obtain from Vision Camera v5: `frame.getNativeBuffer().pointer`
    */
-  getNativeBuffer(): { pointer: number; release(): void };
+  getNativeBuffer(): { pointer: bigint; release(): void };
 }
diff --git a/packages/react-native-executorch/src/types/objectDetection.ts b/packages/react-native-executorch/src/types/objectDetection.ts
@@ -1,5 +1,5 @@
 import { RnExecutorchError } from '../errors/errorUtils';
-import { ResourceSource } from './common';
+import { ResourceSource, PixelData, Frame } from './common';
 
 /**
  * Represents a bounding box for a detected object in an image.
@@ -190,22 +190,14 @@ export interface ObjectDetectionType {
    *
    * // Pixel data
    * const detections2 = await model.forward({
-   *   data: pixelBuffer,
-   *   width: 640,
-   *   height: 480,
-   *   channels: 3
+   *   dataPtr: new Uint8Array(rgbPixels),
+   *   sizes: [480, 640, 3],
+   *   scalarType: ScalarType.BYTE
    * });
    * ```
    */
   forward: (
-    input:
-      | string
-      | {
-          data: ArrayBuffer;
-          width: number;
-          height: number;
-          channels: number;
-        },
+    input: string | PixelData,
     detectionThreshold?: number
   ) => Promise<Detection[]>;
 
@@ -236,5 +228,7 @@ export interface ObjectDetectionType {
    * @param detectionThreshold - The threshold for detection sensitivity. Default is 0.7.
    * @returns Array of Detection objects representing detected items in the frame.
    */
-  runOnFrame: ((frame: any, detectionThreshold?: number) => Detection[]) | null;
+  runOnFrame:
+    | ((frame: Frame, detectionThreshold?: number) => Detection[])
+    | null;
 }

Original file line number	Diff line number	Diff line change
`@@ -86,7 +86,7 @@ cv::Mat FrameExtractor::extractFromAHardwareBuffer(void *hardwareBuffer) {`
`86`	`86`	`buffer, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN, -1, nullptr, &data);`
`87`	`87`
`88`	`88`	`if (lockResult != 0) {`
`89`		`- throw RnExecutorchError(RnExecutorchErrorCode::AccessFailed,`
	`89`	`+ throw RnExecutorchError(RnExecutorchErrorCode::UnknownError,`
`90`	`90`	`"Failed to lock AHardwareBuffer");`
`91`	`91`	`}`
`92`	`92`
Original file line number	Diff line number	Diff line change
`@@ -73,7 +73,7 @@ export abstract class VisionModule<TOutput> extends BaseModule {`
`73`	`73`	`* 1. String path/URI: File path, URL, or Base64-encoded string`
`74`	`74`	`* 2. PixelData: Raw pixel data from image libraries (e.g., NitroImage)`
`75`	`75`	`*`
`76`		- * Note: For VisionCamera frame processing, use `forwardSync` instead.
	`76`	+ * Note: For VisionCamera frame processing, use `runOnFrame` instead.
`77`	`77`	`* This method is async and cannot be called in worklet context.`
`78`	`78`	`*`
`79`	`79`	`* @param input - Image source (string path or PixelData object)`
Original file line number	Diff line number	Diff line change
`@@ -191,5 +191,5 @@ export interface Frame {`
`191`	`191`	`*`
`192`	`192`	* Obtain from Vision Camera v5: `frame.getNativeBuffer().pointer`
`193`	`193`	`*/`
`194`		`- getNativeBuffer(): { pointer: number; release(): void };`
	`194`	`+ getNativeBuffer(): { pointer: bigint; release(): void };`
`195`	`195`	`}`