Skip to content

Commit 0a0f456

Browse files
committed
feat: port ocr detector
1 parent 1adeac9 commit 0a0f456

11 files changed

Lines changed: 881 additions & 4 deletions

File tree

packages/react-native-executorch/common/rnexecutorch/data_processing/ImageProcessing.cpp

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,59 @@ cv::Mat getMatrixFromTensor(cv::Size size, const Tensor &tensor) {
126126
size);
127127
}
128128

129+
/// @brief Resize an image so it fits inside `targetSize` while keeping its
/// aspect ratio, then pad it to exactly `targetSize`.
///
/// The image is scaled by the smaller of the height and width ratios, centered
/// in the output, and the remaining border is filled with a constant color
/// computed as the (floored) average of the four corner patches of the
/// original image.
///
/// @param inputImage Image to resize; must not be empty.
/// @param targetSize Size of the returned image; both sides must be positive.
/// @return Image of size `targetSize` containing the centered, resized input.
/// @throws cv::Exception if `inputImage` is empty or `targetSize` has a
/// non-positive side.
cv::Mat resizePadded(cv::Mat inputImage, cv::Size targetSize) {
  // Fail early with a clear OpenCV error instead of dividing by zero below.
  CV_Assert(!inputImage.empty());
  CV_Assert(targetSize.width > 0 && targetSize.height > 0);

  const cv::Size inputSize = inputImage.size();
  const float heightRatio = static_cast<float>(targetSize.height) /
                            static_cast<float>(inputSize.height);
  const float widthRatio = static_cast<float>(targetSize.width) /
                           static_cast<float>(inputSize.width);
  const float resizeRatio = std::min(heightRatio, widthRatio);

  // Truncation can produce 0 for extreme aspect ratios (e.g. a very long,
  // thin image). cv::resize rejects an empty destination size, so keep at
  // least one pixel on each side.
  const int newWidth = std::max(
      1, static_cast<int>(static_cast<float>(inputSize.width) * resizeRatio));
  const int newHeight = std::max(
      1, static_cast<int>(static_cast<float>(inputSize.height) * resizeRatio));

  cv::Mat resizedImg;
  cv::resize(inputImage, resizedImg, cv::Size(newWidth, newHeight), 0, 0,
             cv::INTER_AREA);

  // The padding color is estimated from small square patches taken at the
  // four corners of the original image.
  const int cornerPatchSize =
      std::max(1, std::min(inputSize.height, inputSize.width) / 30);
  const int rightX = inputSize.width - cornerPatchSize;
  const int bottomY = inputSize.height - cornerPatchSize;
  const std::vector<cv::Rect> cornerRects = {
      cv::Rect(0, 0, cornerPatchSize, cornerPatchSize),
      cv::Rect(rightX, 0, cornerPatchSize, cornerPatchSize),
      cv::Rect(0, bottomY, cornerPatchSize, cornerPatchSize),
      cv::Rect(rightX, bottomY, cornerPatchSize, cornerPatchSize)};

  cv::Scalar backgroundScalar; // default-constructed to all zeros
  for (const cv::Rect &cornerRect : cornerRects) {
    backgroundScalar += cv::mean(inputImage(cornerRect));
  }
  backgroundScalar /= static_cast<double>(cornerRects.size());

  // Only the first three channels are floored, as in the original code.
  for (int channel = 0; channel < 3; ++channel) {
    backgroundScalar[channel] = cvFloor(backgroundScalar[channel]);
  }

  // Split the leftover space so the resized image ends up centered; any odd
  // pixel goes to the bottom/right side.
  const int deltaW = targetSize.width - newWidth;
  const int deltaH = targetSize.height - newHeight;
  const int top = deltaH / 2;
  const int bottom = deltaH - top;
  const int left = deltaW / 2;
  const int right = deltaW - left;

  cv::Mat centeredImg;
  cv::copyMakeBorder(resizedImg, centeredImg, top, bottom, left, right,
                     cv::BORDER_CONSTANT, backgroundScalar);

  return centeredImg;
}
177+
129178
std::pair<TensorPtr, cv::Size>
130179
readImageToTensor(const std::string &path,
131-
const std::vector<int32_t> &tensorDims) {
180+
const std::vector<int32_t> &tensorDims,
181+
bool maintainAspectRatio) {
132182
cv::Mat input = imageprocessing::readImage(path);
133183
cv::Size imageSize = input.size();
134184

@@ -143,7 +193,11 @@ readImageToTensor(const std::string &path,
143193
cv::Size tensorSize = cv::Size(tensorDims[tensorDims.size() - 1],
144194
tensorDims[tensorDims.size() - 2]);
145195

146-
cv::resize(input, input, tensorSize);
196+
if (maintainAspectRatio) {
197+
input = resizePadded(input, tensorSize);
198+
} else {
199+
cv::resize(input, input, tensorSize);
200+
}
147201

148202
cv::cvtColor(input, input, cv::COLOR_BGR2RGB);
149203

packages/react-native-executorch/common/rnexecutorch/data_processing/ImageProcessing.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,18 @@ cv::Mat readImage(const std::string &imageURI);
2929
TensorPtr getTensorFromMatrix(const std::vector<int32_t> &tensorDims,
3030
const cv::Mat &mat);
3131
cv::Mat getMatrixFromTensor(cv::Size size, const Tensor &tensor);
32+
/// @brief Resize an image to fit inside `targetSize` while keeping its aspect
/// ratio, center it, and pad the remaining border with a constant color
/// averaged from the four corner patches of the input image.
/// @param inputImage Image to resize.
/// @param targetSize Size of the returned, padded image.
/// @return The resized image centered inside a `targetSize` canvas.
cv::Mat resizePadded(cv::Mat inputImage, cv::Size targetSize);
3233
/// @brief Read image, resize it and copy it to an ET tensor to store it.
34+
/// @param path Path to the image to be resized. Could be base64, local file or
35+
/// remote URL
36+
/// @param tensorDims The dimensions of the result tensor. The two last
37+
/// dimensions are taken as the image resolution.
38+
/// @param maintainAspectRatio If set to true the image will be resized to
39+
/// maintain the original aspect ratio. The rest of the tensor will be filled
40+
/// with padding.
3341
/// @return Returns a tensor pointer and the original size of the image.
3442
std::pair<TensorPtr, cv::Size>
3543
readImageToTensor(const std::string &path,
36-
const std::vector<int32_t> &tensorDims);
37-
44+
const std::vector<int32_t> &tensorDims,
45+
bool maintainAspectRatio = false);
3846
} // namespace rnexecutorch::imageprocessing
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#pragma once
2+
3+
#include <cstdint>
4+
5+
#include <opencv2/opencv.hpp>
6+
7+
namespace rnexecutorch::ocr {
8+
9+
inline constexpr float textThreshold = 0.4;
10+
inline constexpr float textThresholdVertical = 0.3;
11+
inline constexpr float linkThreshold = 0.4;
12+
inline constexpr float lowTextThreshold = 0.7;
13+
inline constexpr float centerThreshold = 0.5;
14+
inline constexpr float distanceThreshold = 2.0;
15+
inline constexpr float heightThreshold = 2.0;
16+
inline constexpr float restoreRatio = 3.2;
17+
inline constexpr float restoreRatioVertical = 2.0;
18+
inline constexpr float singleCharacterCenterThreshold = 0.3;
19+
inline constexpr float lowConfidenceThreshold = 0.3;
20+
inline constexpr float adjustContrast = 0.2;
21+
inline constexpr int32_t minSideThreshold = 15;
22+
inline constexpr int32_t maxSideThreshold = 30;
23+
inline constexpr int32_t recognizerHeight = 64;
24+
inline constexpr int32_t largeRecognizerWidth = 512;
25+
inline constexpr int32_t mediumRecognizerWidth = 256;
26+
inline constexpr int32_t smallRecognizerWidth = 128;
27+
inline constexpr int32_t smallVerticalRecognizerWidth = 64;
28+
inline constexpr int32_t maxWidth =
29+
largeRecognizerWidth + (largeRecognizerWidth * 0.15);
30+
inline constexpr int32_t minSize = 20;
31+
inline constexpr int32_t singleCharacterMinSize = 70;
32+
inline constexpr int32_t recognizerImageSize = 1280;
33+
inline constexpr int32_t verticalLineThreshold = 20;
34+
35+
inline const cv::Scalar mean(0.485, 0.456, 0.406);
36+
inline const cv::Scalar variance(0.229, 0.224, 0.225);
37+
38+
} // namespace rnexecutorch::ocr
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
#include "Detector.h"
2+
3+
#include <rnexecutorch/data_processing/ImageProcessing.h>
4+
#include <rnexecutorch/models/ocr/Constants.h>
5+
#include <rnexecutorch/models/ocr/DetectorUtils.h>
6+
7+
namespace rnexecutorch {
8+
9+
/*
10+
The model used as detector is based on CRAFT (Character Region Awareness for
11+
Text Detection) paper. https://arxiv.org/pdf/1904.01941
12+
*/
13+
14+
/// Loads the detector model and caches the image size it expects.
///
/// The last two dimensions of the model's first input shape are read as
/// (height, width) and stored in `modelImageSize`.
///
/// @param modelSource Source of the model, forwarded to BaseModel.
/// @param callInvoker Call invoker, forwarded to BaseModel.
/// @throws std::runtime_error if the model reports no input tensors or its
/// first input has fewer than two dimensions.
Detector::Detector(const std::string &modelSource,
                   std::shared_ptr<react::CallInvoker> callInvoker)
    : BaseModel(modelSource, callInvoker) {
  const auto inputShapes = getInputShape();
  if (inputShapes.size() == 0) {
    throw std::runtime_error(
        "Detector model seems to not take any input tensors.");
  }

  // Bind by const reference to avoid copying the shape vector.
  const std::vector<int32_t> &modelInputShape = inputShapes[0];
  if (modelInputShape.size() < 2) {
    // Build the message with std::string instead of a fixed-size char buffer.
    throw std::runtime_error(
        "Unexpected detector model input size, expected at least 2 "
        "dimensions but got: " +
        std::to_string(modelInputShape.size()) + ".");
  }

  // Shape layout is [..., height, width]; cv::Size takes (width, height).
  modelImageSize = cv::Size(modelInputShape[modelInputShape.size() - 1],
                            modelInputShape[modelInputShape.size() - 2]);
}
34+
35+
std::vector<DetectorBBox> Detector::forward(const std::string &imageSource) {
36+
/*
37+
Detector as an input accepts tensor with a shape of [1, 3, 800, 800].
38+
Due to big influence of resize to quality of recognition the image preserves
39+
original aspect ratio and the missing parts are filled with padding.
40+
*/
41+
auto [inputTensor, originalSize] =
42+
imageprocessing::readImageToTensor(imageSource, getInputShape()[0], true);
43+
44+
auto forwardResult = forwardET(inputTensor);
45+
if (!forwardResult.ok()) {
46+
throw std::runtime_error(
47+
"Failed to forward, error: " +
48+
std::to_string(static_cast<uint32_t>(forwardResult.error())));
49+
}
50+
51+
return postprocess(forwardResult->at(0).toTensor(), originalSize);
52+
}
53+
54+
std::vector<DetectorBBox> Detector::postprocess(const Tensor &tensor,
55+
cv::Size originalSize) {
56+
/*
57+
The output of the model consists of two matrices (heat maps):
58+
1. ScoreText(Score map) - The probability of a region containing character.
59+
2. ScoreAffinity(Affinity map) - affinity between characters, used to to
60+
group each character into a single instance (sequence) Both matrices are
61+
H/2xW/2.
62+
63+
The result of this step is a list of bounding boxes that contain text.
64+
*/
65+
std::span<const float> tensorData(
66+
static_cast<const float *>(tensor.const_data_ptr()), tensor.numel());
67+
/*
68+
The output of the model is a matrix half the size of the input image
69+
containing two channels representing the heatmaps.
70+
*/
71+
auto [scoreTextMat, scoreAffinityMat] = ocr::interleavedArrayToMats(
72+
tensorData,
73+
cv::Size(modelImageSize.width / 2, modelImageSize.height / 2));
74+
std::vector<DetectorBBox> bBoxesList = ocr::getDetBoxesFromTextMap(
75+
scoreTextMat, scoreAffinityMat, ocr::textThreshold, ocr::linkThreshold,
76+
ocr::lowTextThreshold);
77+
78+
ocr::restoreBboxRatio(bBoxesList, ocr::restoreRatio);
79+
80+
bBoxesList = ocr::groupTextBoxes(bBoxesList, ocr::centerThreshold,
81+
ocr::distanceThreshold, ocr::heightThreshold,
82+
ocr::minSideThreshold, ocr::maxSideThreshold,
83+
ocr::maxWidth);
84+
85+
return bBoxesList;
86+
}
87+
88+
} // namespace rnexecutorch
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#pragma once
2+
3+
#include <executorch/extension/tensor/tensor_ptr.h>
4+
#include <opencv2/opencv.hpp>
5+
6+
#include <rnexecutorch/models/BaseModel.h>
7+
#include <rnexecutorch/models/ocr/Types.h>
8+
9+
namespace rnexecutorch {
10+
using executorch::aten::Tensor;
11+
using executorch::extension::TensorPtr;
12+
13+
class Detector : BaseModel {
14+
public:
15+
Detector(const std::string &modelSource,
16+
std::shared_ptr<react::CallInvoker> callInvoker);
17+
std::vector<DetectorBBox> forward(const std::string &imageSource);
18+
19+
private:
20+
std::vector<DetectorBBox> postprocess(const Tensor &tensor,
21+
cv::Size originalSize);
22+
23+
cv::Size modelImageSize;
24+
};
25+
} // namespace rnexecutorch

0 commit comments

Comments
 (0)