software-mansion
diff --git a/‎packages/react-native-executorch/common/rnexecutorch/data_processing/ImageProcessing.cpp‎
Lines changed: 56 additions & 2 deletions b/‎packages/react-native-executorch/common/rnexecutorch/data_processing/ImageProcessing.cpp‎
Lines changed: 56 additions & 2 deletions
diff --git a/‎packages/react-native-executorch/common/rnexecutorch/data_processing/ImageProcessing.h‎
Lines changed: 10 additions & 2 deletions b/‎packages/react-native-executorch/common/rnexecutorch/data_processing/ImageProcessing.h‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎packages/react-native-executorch/common/rnexecutorch/models/ocr/Constants.h‎
Lines changed: 38 additions & 0 deletions b/‎packages/react-native-executorch/common/rnexecutorch/models/ocr/Constants.h‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎packages/react-native-executorch/common/rnexecutorch/models/ocr/Detector.cpp‎
Lines changed: 88 additions & 0 deletions b/‎packages/react-native-executorch/common/rnexecutorch/models/ocr/Detector.cpp‎
Lines changed: 88 additions & 0 deletions
diff --git a/‎packages/react-native-executorch/common/rnexecutorch/models/ocr/Detector.h‎
Lines changed: 25 additions & 0 deletions b/‎packages/react-native-executorch/common/rnexecutorch/models/ocr/Detector.h‎
Lines changed: 25 additions & 0 deletions
@@ -126,9 +126,59 @@ cv::Mat getMatrixFromTensor(cv::Size size, const Tensor &tensor) {
                           size);
 }
 
+/// @brief Resize an image to fit inside `targetSize` while keeping its aspect
+/// ratio, then pad it so the result is exactly `targetSize`.
+///
+/// The scaled image is centered in the output. The padding color is the
+/// average of the mean colors of four square patches taken from the corners
+/// of the input image, with the first three channels rounded down.
+///
+/// @param inputImage Image to resize; must not be empty.
+/// @param targetSize Size of the returned image; both dimensions must be
+/// positive.
+/// @return The resized and padded image.
+/// @throws cv::Exception If `inputImage` is empty or `targetSize` is not
+/// positive.
+cv::Mat resizePadded(cv::Mat inputImage, cv::Size targetSize) {
+  // Reject degenerate sizes up front: they would otherwise cause a division
+  // by zero below and invalid corner rectangles.
+  CV_Assert(!inputImage.empty());
+  CV_Assert(targetSize.width > 0 && targetSize.height > 0);
+
+  const cv::Size inputSize = inputImage.size();
+  const float heightRatio =
+      static_cast<float>(targetSize.height) / inputSize.height;
+  const float widthRatio =
+      static_cast<float>(targetSize.width) / inputSize.width;
+  const float resizeRatio = std::min(heightRatio, widthRatio);
+
+  // Truncating the scaled size can give 0 for extreme aspect ratios, which
+  // cv::resize rejects, so keep every side within [1, target side].
+  const int newWidth = std::clamp(
+      static_cast<int>(inputSize.width * resizeRatio), 1, targetSize.width);
+  const int newHeight = std::clamp(
+      static_cast<int>(inputSize.height * resizeRatio), 1, targetSize.height);
+
+  cv::Mat resizedImg;
+  cv::resize(inputImage, resizedImg, cv::Size(newWidth, newHeight), 0, 0,
+             cv::INTER_AREA);
+
+  // Estimate the background color from small square patches in the four
+  // corners of the original image.
+  const int cornerPatchSize =
+      std::max(1, std::min(inputSize.height, inputSize.width) / 30);
+  const std::vector<cv::Mat> corners = {
+      inputImage(cv::Rect(0, 0, cornerPatchSize, cornerPatchSize)),
+      inputImage(cv::Rect(inputSize.width - cornerPatchSize, 0, cornerPatchSize,
+                          cornerPatchSize)),
+      inputImage(cv::Rect(0, inputSize.height - cornerPatchSize,
+                          cornerPatchSize, cornerPatchSize)),
+      inputImage(cv::Rect(inputSize.width - cornerPatchSize,
+                          inputSize.height - cornerPatchSize, cornerPatchSize,
+                          cornerPatchSize))};
+
+  cv::Scalar backgroundScalar(0, 0, 0, 0);
+  for (const cv::Mat &corner : corners) {
+    backgroundScalar += cv::mean(corner);
+  }
+  backgroundScalar /= static_cast<double>(corners.size());
+
+  backgroundScalar[0] = cvFloor(backgroundScalar[0]);
+  backgroundScalar[1] = cvFloor(backgroundScalar[1]);
+  backgroundScalar[2] = cvFloor(backgroundScalar[2]);
+
+  // Split the leftover space so the resized image ends up centered; when the
+  // leftover is odd, the extra pixel goes to the bottom/right border.
+  const int deltaW = targetSize.width - newWidth;
+  const int deltaH = targetSize.height - newHeight;
+  const int top = deltaH / 2;
+  const int bottom = deltaH - top;
+  const int left = deltaW / 2;
+  const int right = deltaW - left;
+
+  cv::Mat centeredImg;
+  cv::copyMakeBorder(resizedImg, centeredImg, top, bottom, left, right,
+                     cv::BORDER_CONSTANT, backgroundScalar);
+
+  return centeredImg;
+}
+
 std::pair<TensorPtr, cv::Size>
 readImageToTensor(const std::string &path,
-                  const std::vector<int32_t> &tensorDims) {
+                  const std::vector<int32_t> &tensorDims,
+                  bool maintainAspectRatio) {
   cv::Mat input = imageprocessing::readImage(path);
   cv::Size imageSize = input.size();
 
@@ -143,7 +193,11 @@ readImageToTensor(const std::string &path,
   cv::Size tensorSize = cv::Size(tensorDims[tensorDims.size() - 1],
                                  tensorDims[tensorDims.size() - 2]);
 
-  cv::resize(input, input, tensorSize);
+  if (maintainAspectRatio) {
+    input = resizePadded(input, tensorSize);
+  } else {
+    cv::resize(input, input, tensorSize);
+  }
 
   cv::cvtColor(input, input, cv::COLOR_BGR2RGB);
 
 
@@ -29,10 +29,18 @@ cv::Mat readImage(const std::string &imageURI);
 TensorPtr getTensorFromMatrix(const std::vector<int32_t> &tensorDims,
                               const cv::Mat &mat);
 cv::Mat getMatrixFromTensor(cv::Size size, const Tensor &tensor);
+/// @brief Resize an image to fit within the target size while keeping its
+/// aspect ratio, center it, and pad the remaining area with a constant color
+/// averaged from the corners of the input image.
+/// @param inputImage Image to be resized.
+/// @param targetSize Size of the returned image.
+/// @return Returns the resized and padded image.
+cv::Mat resizePadded(cv::Mat inputImage, cv::Size targetSize);
 /// @brief Read image, resize it and copy it to an ET tensor to store it.
+/// @param path Path to the image to be resized. Could be base64, local file or
+/// remote URL
+/// @param tensorDims The dimensions of the result tensor. The two last
+/// dimensions are taken as the image resolution.
+/// @param maintainAspectRatio If set to true the image will be resized to
+/// fit the tensor resolution while maintaining the original aspect ratio. The
+/// rest of the tensor will be filled with padding.
 /// @return Returns a tensor pointer and the original size of the image.
 std::pair<TensorPtr, cv::Size>
 readImageToTensor(const std::string &path,
-                  const std::vector<int32_t> &tensorDims);
-
+                  const std::vector<int32_t> &tensorDims,
+                  bool maintainAspectRatio = false);
 } // namespace rnexecutorch::imageprocessing
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <cstdint>
+
+#include <opencv2/opencv.hpp>
+
+/// Tuning constants shared by the OCR pipeline.
+namespace rnexecutorch::ocr {
+
+// Thresholds passed to ocr::getDetBoxesFromTextMap in Detector::postprocess
+// (textThreshold, linkThreshold, lowTextThreshold) and related variants.
+inline constexpr float textThreshold = 0.4f;
+inline constexpr float textThresholdVertical = 0.3f;
+inline constexpr float linkThreshold = 0.4f;
+inline constexpr float lowTextThreshold = 0.7f;
+
+// Parameters passed to ocr::groupTextBoxes in Detector::postprocess.
+inline constexpr float centerThreshold = 0.5f;
+inline constexpr float distanceThreshold = 2.0f;
+inline constexpr float heightThreshold = 2.0f;
+
+// Ratios passed to ocr::restoreBboxRatio (restoreRatio is the one used by
+// Detector::postprocess).
+inline constexpr float restoreRatio = 3.2f;
+inline constexpr float restoreRatioVertical = 2.0f;
+
+inline constexpr float singleCharacterCenterThreshold = 0.3f;
+inline constexpr float lowConfidenceThreshold = 0.3f;
+inline constexpr float adjustContrast = 0.2f;
+
+// Box side limits passed to ocr::groupTextBoxes in Detector::postprocess.
+inline constexpr int32_t minSideThreshold = 15;
+inline constexpr int32_t maxSideThreshold = 30;
+
+// Recognizer input dimensions.
+inline constexpr int32_t recognizerHeight = 64;
+inline constexpr int32_t largeRecognizerWidth = 512;
+inline constexpr int32_t mediumRecognizerWidth = 256;
+inline constexpr int32_t smallRecognizerWidth = 128;
+inline constexpr int32_t smallVerticalRecognizerWidth = 64;
+
+// largeRecognizerWidth plus 15%, truncated towards zero (588). The cast makes
+// the double -> int32_t truncation explicit.
+inline constexpr int32_t maxWidth = static_cast<int32_t>(
+    largeRecognizerWidth + (largeRecognizerWidth * 0.15));
+
+inline constexpr int32_t minSize = 20;
+inline constexpr int32_t singleCharacterMinSize = 70;
+inline constexpr int32_t recognizerImageSize = 1280;
+inline constexpr int32_t verticalLineThreshold = 20;
+
+// Per-channel normalization constants.
+// NOTE(review): the `variance` values look like the commonly used ImageNet
+// per-channel standard deviations rather than variances - confirm the naming
+// against the code that consumes them.
+inline const cv::Scalar mean(0.485, 0.456, 0.406);
+inline const cv::Scalar variance(0.229, 0.224, 0.225);
+
+} // namespace rnexecutorch::ocr
@@ -0,0 +1,88 @@
+#include "Detector.h"
+
+#include <span>
+#include <stdexcept>
+#include <string>
+
+#include <rnexecutorch/data_processing/ImageProcessing.h>
+#include <rnexecutorch/models/ocr/Constants.h>
+#include <rnexecutorch/models/ocr/DetectorUtils.h>
+
+namespace rnexecutorch {
+
+/*
+The model used as the detector is based on the CRAFT (Character Region
+Awareness for Text Detection) paper: https://arxiv.org/pdf/1904.01941
+*/
+
+/// @brief Construct the detector and cache the image size expected by the
+/// model.
+///
+/// The last two dimensions of the model's first input shape are stored in
+/// `modelImageSize` as (width, height).
+///
+/// @param modelSource Forwarded to the BaseModel constructor.
+/// @param callInvoker Forwarded to the BaseModel constructor.
+/// @throws std::runtime_error If the model reports no input tensors, or if
+/// its first input has fewer than two dimensions.
+Detector::Detector(const std::string &modelSource,
+                   std::shared_ptr<react::CallInvoker> callInvoker)
+    : BaseModel(modelSource, callInvoker) {
+  const auto inputShapes = getInputShape();
+  if (inputShapes.empty()) {
+    throw std::runtime_error(
+        "Detector model seems to not take any input tensors.");
+  }
+  const std::vector<int32_t> &modelInputShape = inputShapes[0];
+  if (modelInputShape.size() < 2) {
+    throw std::runtime_error(
+        "Unexpected detector model input size, expected at least 2 "
+        "dimensions but got: " +
+        std::to_string(modelInputShape.size()) + ".");
+  }
+  // The shape is laid out as [..., height, width]; cv::Size takes
+  // (width, height).
+  modelImageSize = cv::Size(modelInputShape[modelInputShape.size() - 1],
+                            modelInputShape[modelInputShape.size() - 2]);
+}
+
+/// @brief Run text detection on a single image.
+/// @param imageSource Image source handed to imageprocessing::readImageToTensor.
+/// @return Bounding boxes produced by postprocess() from the first model
+/// output.
+/// @throws std::runtime_error If the model's forward pass reports an error.
+std::vector<DetectorBBox> Detector::forward(const std::string &imageSource) {
+  // According to the original author's note the detector takes a
+  // [1, 3, 800, 800] tensor. Plain resizing hurts recognition quality, so the
+  // image keeps its aspect ratio and the remaining area is padded.
+  constexpr bool keepAspectRatio = true;
+  auto [imageTensor, sourceImageSize] = imageprocessing::readImageToTensor(
+      imageSource, getInputShape()[0], keepAspectRatio);
+
+  auto modelOutput = forwardET(imageTensor);
+  if (modelOutput.ok()) {
+    return postprocess(modelOutput->at(0).toTensor(), sourceImageSize);
+  }
+
+  const auto errorCode = static_cast<uint32_t>(modelOutput.error());
+  throw std::runtime_error("Failed to forward, error: " +
+                           std::to_string(errorCode));
+}
+
+/// @brief Turn the raw detector output into grouped text bounding boxes.
+///
+/// Per the original author's notes, the model output holds two heat maps,
+/// each half the width and height of the model input:
+///  1. ScoreText (score map) - the probability of a region containing a
+///     character.
+///  2. ScoreAffinity (affinity map) - affinity between characters, used to
+///     group characters into a single instance (sequence).
+///
+/// @param tensor Model output; its data is read as interleaved floats.
+/// @param originalSize Size of the source image (not used by this function).
+/// @return Bounding boxes that contain text.
+std::vector<DetectorBBox> Detector::postprocess(const Tensor &tensor,
+                                                cv::Size originalSize) {
+  const auto *rawScores = static_cast<const float *>(tensor.const_data_ptr());
+  std::span<const float> scores(rawScores, tensor.numel());
+
+  // Both heat maps are stored as two interleaved channels of a matrix that is
+  // half the size of the model input image.
+  const cv::Size heatMapSize(modelImageSize.width / 2,
+                             modelImageSize.height / 2);
+  auto [textScoreMap, affinityScoreMap] =
+      ocr::interleavedArrayToMats(scores, heatMapSize);
+
+  std::vector<DetectorBBox> detectedBoxes = ocr::getDetBoxesFromTextMap(
+      textScoreMap, affinityScoreMap, ocr::textThreshold, ocr::linkThreshold,
+      ocr::lowTextThreshold);
+
+  ocr::restoreBboxRatio(detectedBoxes, ocr::restoreRatio);
+
+  detectedBoxes = ocr::groupTextBoxes(
+      detectedBoxes, ocr::centerThreshold, ocr::distanceThreshold,
+      ocr::heightThreshold, ocr::minSideThreshold, ocr::maxSideThreshold,
+      ocr::maxWidth);
+  return detectedBoxes;
+}
+
+} // namespace rnexecutorch
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <executorch/extension/tensor/tensor_ptr.h>
+#include <opencv2/opencv.hpp>
+
+#include <rnexecutorch/models/BaseModel.h>
+#include <rnexecutorch/models/ocr/Types.h>
+
+namespace rnexecutorch {
+using executorch::aten::Tensor;
+using executorch::extension::TensorPtr;
+
+/// @brief Text detector model: reads an image, runs the model and returns
+/// bounding boxes of detected text.
+// NOTE(review): no access specifier is given, so BaseModel is inherited
+// privately (the default for `class`) - confirm this is intended.
+class Detector : BaseModel {
+public:
+  /// @brief Create the detector.
+  /// @param modelSource Forwarded to the BaseModel constructor.
+  /// @param callInvoker Forwarded to the BaseModel constructor.
+  Detector(const std::string &modelSource,
+           std::shared_ptr<react::CallInvoker> callInvoker);
+  /// @brief Run detection on the given image.
+  /// @param imageSource Source of the image to process.
+  /// @return Returns the detected text bounding boxes.
+  std::vector<DetectorBBox> forward(const std::string &imageSource);
+
+private:
+  /// @brief Convert the model output tensor into text bounding boxes.
+  /// @param tensor Output tensor of the model.
+  /// @param originalSize Size of the original input image.
+  std::vector<DetectorBBox> postprocess(const Tensor &tensor,
+                                        cv::Size originalSize);
+
+  /// Image size (width, height) taken from the last two dimensions of the
+  /// model's first input shape.
+  cv::Size modelImageSize;
+};
+} // namespace rnexecutorch