Skip to content

Commit 2f29c67

Browse files
refactor: apply code review fixes for vision camera integration
- visionHostFunction: preserve RnExecutorchError code in catch block
- OCR/VerticalOCR generateFromFrame: add 90° CW rotation for landscape frames
- VisionModel: lift preprocessFrame and modelImageSize from 5 subclasses into base class

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 751f50c commit 2f29c67

File tree

15 files changed

+68
-174
lines changed

15 files changed

+68
-174
lines changed

packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,8 +298,24 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
298298

299299
return jsi_conversion::getJsiValue(std::move(result), runtime);
300300
}
301+
} catch (const RnExecutorchError &e) {
302+
jsi::Object errorData(runtime);
303+
errorData.setProperty(runtime, "code", e.getNumericCode());
304+
errorData.setProperty(runtime, "message",
305+
jsi::String::createFromUtf8(runtime, e.what()));
306+
throw jsi::JSError(runtime, jsi::Value(runtime, std::move(errorData)));
307+
} catch (const std::runtime_error &e) {
308+
// This catch should be merged with the next one
309+
// (std::runtime_error inherits from std::exception) HOWEVER react
310+
// native has broken RTTI which breaks proper exception type
311+
// checking. Remove when the following change is present in our
312+
// version:
313+
// https://github.com/facebook/react-native/commit/3132cc88dd46f95898a756456bebeeb6c248f20e
314+
throw jsi::JSError(runtime, e.what());
301315
} catch (const std::exception &e) {
302316
throw jsi::JSError(runtime, e.what());
317+
} catch (...) {
318+
throw jsi::JSError(runtime, "Unknown error in vision function");
303319
}
304320
}
305321

packages/react-native-executorch/common/rnexecutorch/models/VisionModel.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,34 @@ cv::Mat VisionModel::extractFromFrame(jsi::Runtime &runtime,
2323
return frame;
2424
}
2525

26+
cv::Mat VisionModel::preprocessFrame(const cv::Mat &frame) const {
27+
cv::Mat rgb;
28+
29+
if (frame.channels() == 4) {
30+
#ifdef __APPLE__
31+
cv::cvtColor(frame, rgb, cv::COLOR_BGRA2RGB);
32+
#else
33+
cv::cvtColor(frame, rgb, cv::COLOR_RGBA2RGB);
34+
#endif
35+
} else if (frame.channels() == 3) {
36+
rgb = frame;
37+
} else {
38+
char errorMessage[100];
39+
std::snprintf(errorMessage, sizeof(errorMessage),
40+
"Unsupported frame format: %d channels", frame.channels());
41+
throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
42+
errorMessage);
43+
}
44+
45+
if (rgb.size() != modelImageSize) {
46+
cv::Mat resized;
47+
cv::resize(rgb, resized, modelImageSize);
48+
return resized;
49+
}
50+
51+
return rgb;
52+
}
53+
2654
cv::Mat VisionModel::extractFromPixels(const JSTensorViewIn &tensorView) const {
2755
if (tensorView.sizes.size() != 3) {
2856
char errorMessage[100];

packages/react-native-executorch/common/rnexecutorch/models/VisionModel.h

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,11 +86,9 @@ class VisionModel : public BaseModel {
8686
/**
8787
* @brief Preprocess a camera frame for model input
8888
*
89-
* This method should implement model-specific preprocessing such as:
90-
* - Resizing to the model's expected input size
91-
* - Color space conversion (e.g., BGR to RGB)
92-
* - Normalization
93-
* - Any other model-specific transformations
89+
* Converts 4-channel frames (BGRA on iOS, RGBA on Android) to RGB and
90+
* resizes to modelImageSize if needed. Subclasses may override for
91+
* model-specific preprocessing (e.g., normalisation).
9492
*
9593
* @param frame Input frame from camera (already extracted and rotated by
9694
* FrameExtractor)
@@ -99,7 +97,11 @@ class VisionModel : public BaseModel {
9997
* @note The input frame is already in RGB format and rotated 90° clockwise
10098
* @note This method is called under mutex protection in generateFromFrame()
10199
*/
102-
virtual cv::Mat preprocessFrame(const cv::Mat &frame) const = 0;
100+
virtual cv::Mat preprocessFrame(const cv::Mat &frame) const;
101+
102+
/// Expected input image dimensions derived from the model's input shape.
103+
/// Set by subclass constructors after loading the model.
104+
cv::Size modelImageSize{0, 0};
103105

104106
/**
105107
* @brief Extract and preprocess frame from VisionCamera in one call

packages/react-native-executorch/common/rnexecutorch/models/classification/Classification.cpp

Lines changed: 1 addition & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ Classification::Classification(const std::string &modelSource,
2222
if (modelInputShape.size() < 2) {
2323
char errorMessage[100];
2424
std::snprintf(errorMessage, sizeof(errorMessage),
25-
"Unexpected model input size, expected at least 2 dimentions "
25+
"Unexpected model input size, expected at least 2 dimensions "
2626
"but got: %zu.",
2727
modelInputShape.size());
2828
throw RnExecutorchError(RnExecutorchErrorCode::WrongDimensions,
@@ -32,34 +32,6 @@ Classification::Classification(const std::string &modelSource,
3232
modelInputShape[modelInputShape.size() - 2]);
3333
}
3434

35-
cv::Mat Classification::preprocessFrame(const cv::Mat &frame) const {
36-
cv::Mat rgb;
37-
38-
if (frame.channels() == 4) {
39-
#ifdef __APPLE__
40-
cv::cvtColor(frame, rgb, cv::COLOR_BGRA2RGB);
41-
#else
42-
cv::cvtColor(frame, rgb, cv::COLOR_RGBA2RGB);
43-
#endif
44-
} else if (frame.channels() == 3) {
45-
rgb = frame;
46-
} else {
47-
char errorMessage[100];
48-
std::snprintf(errorMessage, sizeof(errorMessage),
49-
"Unsupported frame format: %d channels", frame.channels());
50-
throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
51-
errorMessage);
52-
}
53-
54-
if (rgb.size() != modelImageSize) {
55-
cv::Mat resized;
56-
cv::resize(rgb, resized, modelImageSize);
57-
return resized;
58-
}
59-
60-
return rgb;
61-
}
62-
6335
std::unordered_map<std::string_view, float>
6436
Classification::runInference(cv::Mat image) {
6537
std::scoped_lock lock(inference_mutex_);

packages/react-native-executorch/common/rnexecutorch/models/classification/Classification.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,10 @@ class Classification : public VisionModel {
3131
std::string_view, float>
3232
generateFromPixels(JSTensorViewIn pixelData);
3333

34-
protected:
35-
cv::Mat preprocessFrame(const cv::Mat &frame) const override;
36-
3734
private:
3835
std::unordered_map<std::string_view, float> runInference(cv::Mat image);
3936

4037
std::unordered_map<std::string_view, float> postprocess(const Tensor &tensor);
41-
42-
cv::Size modelImageSize{0, 0};
4338
};
4439
} // namespace models::classification
4540

packages/react-native-executorch/common/rnexecutorch/models/embeddings/image/ImageEmbeddings.cpp

Lines changed: 1 addition & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ ImageEmbeddings::ImageEmbeddings(
2222
if (modelInputShape.size() < 2) {
2323
char errorMessage[100];
2424
std::snprintf(errorMessage, sizeof(errorMessage),
25-
"Unexpected model input size, expected at least 2 dimentions "
25+
"Unexpected model input size, expected at least 2 dimensions "
2626
"but got: %zu.",
2727
modelInputShape.size());
2828
throw RnExecutorchError(RnExecutorchErrorCode::WrongDimensions,
@@ -32,34 +32,6 @@ ImageEmbeddings::ImageEmbeddings(
3232
modelInputShape[modelInputShape.size() - 2]);
3333
}
3434

35-
cv::Mat ImageEmbeddings::preprocessFrame(const cv::Mat &frame) const {
36-
cv::Mat rgb;
37-
38-
if (frame.channels() == 4) {
39-
#ifdef __APPLE__
40-
cv::cvtColor(frame, rgb, cv::COLOR_BGRA2RGB);
41-
#else
42-
cv::cvtColor(frame, rgb, cv::COLOR_RGBA2RGB);
43-
#endif
44-
} else if (frame.channels() == 3) {
45-
rgb = frame;
46-
} else {
47-
char errorMessage[100];
48-
std::snprintf(errorMessage, sizeof(errorMessage),
49-
"Unsupported frame format: %d channels", frame.channels());
50-
throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
51-
errorMessage);
52-
}
53-
54-
if (rgb.size() != modelImageSize) {
55-
cv::Mat resized;
56-
cv::resize(rgb, resized, modelImageSize);
57-
return resized;
58-
}
59-
60-
return rgb;
61-
}
62-
6335
std::shared_ptr<OwningArrayBuffer>
6436
ImageEmbeddings::runInference(cv::Mat image) {
6537
std::scoped_lock lock(inference_mutex_);

packages/react-native-executorch/common/rnexecutorch/models/embeddings/image/ImageEmbeddings.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,8 @@ class ImageEmbeddings final : public VisionModel {
3131
"Registered non-void function")]] std::shared_ptr<OwningArrayBuffer>
3232
generateFromPixels(JSTensorViewIn pixelData);
3333

34-
protected:
35-
cv::Mat preprocessFrame(const cv::Mat &frame) const override;
36-
3734
private:
3835
std::shared_ptr<OwningArrayBuffer> runInference(cv::Mat image);
39-
40-
cv::Size modelImageSize{0, 0};
4136
};
4237
} // namespace models::embeddings
4338

packages/react-native-executorch/common/rnexecutorch/models/object_detection/ObjectDetection.cpp

Lines changed: 1 addition & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ ObjectDetection::ObjectDetection(
2323
if (modelInputShape.size() < 2) {
2424
char errorMessage[100];
2525
std::snprintf(errorMessage, sizeof(errorMessage),
26-
"Unexpected model input size, expected at least 2 dimentions "
26+
"Unexpected model input size, expected at least 2 dimensions "
2727
"but got: %zu.",
2828
modelInputShape.size());
2929
throw RnExecutorchError(RnExecutorchErrorCode::UnexpectedNumInputs,
@@ -45,39 +45,6 @@ ObjectDetection::ObjectDetection(
4545
}
4646
}
4747

48-
cv::Mat ObjectDetection::preprocessFrame(const cv::Mat &frame) const {
49-
const std::vector<int32_t> tensorDims = getAllInputShapes()[0];
50-
cv::Size tensorSize = cv::Size(tensorDims[tensorDims.size() - 1],
51-
tensorDims[tensorDims.size() - 2]);
52-
53-
cv::Mat rgb;
54-
55-
if (frame.channels() == 4) {
56-
#ifdef __APPLE__
57-
cv::cvtColor(frame, rgb, cv::COLOR_BGRA2RGB);
58-
#else
59-
cv::cvtColor(frame, rgb, cv::COLOR_RGBA2RGB);
60-
#endif
61-
} else if (frame.channels() == 3) {
62-
rgb = frame;
63-
} else {
64-
char errorMessage[100];
65-
std::snprintf(errorMessage, sizeof(errorMessage),
66-
"Unsupported frame format: %d channels", frame.channels());
67-
throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
68-
errorMessage);
69-
}
70-
71-
// Only resize if dimensions don't match
72-
if (rgb.size() != tensorSize) {
73-
cv::Mat resized;
74-
cv::resize(rgb, resized, tensorSize);
75-
return resized;
76-
}
77-
78-
return rgb;
79-
}
80-
8148
std::vector<types::Detection>
8249
ObjectDetection::postprocess(const std::vector<EValue> &tensors,
8350
cv::Size originalSize, double detectionThreshold) {

packages/react-native-executorch/common/rnexecutorch/models/object_detection/ObjectDetection.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@ class ObjectDetection : public VisionModel {
7777
protected:
7878
std::vector<types::Detection> runInference(cv::Mat image,
7979
double detectionThreshold);
80-
cv::Mat preprocessFrame(const cv::Mat &frame) const override;
8180

8281
private:
8382
/**
@@ -100,9 +99,6 @@ class ObjectDetection : public VisionModel {
10099
cv::Size originalSize,
101100
double detectionThreshold);
102101

103-
/// Expected input image dimensions derived from the model's input shape.
104-
cv::Size modelImageSize{0, 0};
105-
106102
/// Optional per-channel mean for input normalisation (set in constructor).
107103
std::optional<cv::Scalar> normMean_;
108104

packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,12 @@ std::vector<types::OCRDetection>
5353
OCR::generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData) {
5454
auto frameObj = frameData.asObject(runtime);
5555
cv::Mat frame = ::rnexecutorch::utils::extractFrame(runtime, frameObj);
56+
// Camera sensors deliver landscape frames; rotate to portrait orientation.
57+
if (frame.cols > frame.rows) {
58+
cv::Mat upright;
59+
cv::rotate(frame, upright, cv::ROTATE_90_CLOCKWISE);
60+
frame = std::move(upright);
61+
}
5662
// extractFrame returns RGB; convert to BGR for consistency with readImage
5763
cv::cvtColor(frame, frame, cv::COLOR_RGB2BGR);
5864
return runInference(frame);

0 commit comments

Comments (0)