Skip to content

Commit 21e8edf

Browse files
feat: model sees the same thing as user approach
1 parent 3da656a commit 21e8edf

11 files changed

Lines changed: 138 additions & 98 deletions

File tree

apps/computer-vision/app/vision_camera/index.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ export default function VisionCameraScreen() {
202202
outputs={frameOutput ? [frameOutput] : []}
203203
isActive={isFocused}
204204
format={format}
205-
orientationSource="interface"
205+
orientationSource="device"
206206
/>
207207

208208
{/* Layout sentinel — measures the full-screen area for bbox/canvas sizing */}

apps/computer-vision/components/vision_camera/tasks/ObjectDetectionTask.tsx

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ export default function ObjectDetectionTask({
7171
pixelFormat: 'rgb',
7272
dropFramesWhileBusy: true,
7373
enablePreviewSizedOutputBuffers: true,
74+
7475
onFrame: useCallback(
7576
(frame: Frame) => {
7677
'worklet';
@@ -80,10 +81,10 @@ export default function ObjectDetectionTask({
8081
}
8182
try {
8283
if (!detRof) return;
83-
// C++ always does CW rotation, so output space is always frameH × frameW
84+
const result = detRof(frame, cameraPositionSync.getDirty(), 0.5);
85+
// C++ maps coords to screen space (portrait: frameH × frameW)
8486
const screenW = frame.height;
8587
const screenH = frame.width;
86-
const result = detRof(frame, cameraPositionSync.getDirty(), 0.5);
8788
if (result) {
8889
scheduleOnRN(updateDetections, {
8990
results: result,

apps/computer-vision/components/vision_camera/tasks/SegmentationTask.tsx

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ export default function SegmentationTask({
140140
pixelFormat: 'rgb',
141141
dropFramesWhileBusy: true,
142142
enablePreviewSizedOutputBuffers: true,
143+
143144
onFrame: useCallback(
144145
(frame: Frame) => {
145146
'worklet';
@@ -149,7 +150,12 @@ export default function SegmentationTask({
149150
}
150151
try {
151152
if (!segRof) return;
152-
const result = segRof(frame, cameraPositionSync.getDirty(), [], false);
153+
const result = segRof(
154+
frame,
155+
cameraPositionSync.getDirty(),
156+
[],
157+
false
158+
);
153159
if (result?.ARGMAX) {
154160
const argmax: Int32Array = result.ARGMAX;
155161
const side = Math.round(Math.sqrt(argmax.length));

packages/react-native-executorch/common/rnexecutorch/models/object_detection/ObjectDetection.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,10 +141,11 @@ ObjectDetection::generateFromFrame(jsi::Runtime &runtime,
141141
double detectionThreshold) {
142142
auto orient = extractFrameOrientation(runtime, frameData);
143143
cv::Mat frame = extractFromFrame(runtime, frameData);
144-
auto detections = runInference(frame, detectionThreshold);
144+
cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(frame, orient);
145+
auto detections = runInference(rotated, detectionThreshold);
145146
for (auto &det : detections) {
146-
::rnexecutorch::utils::transformBbox(det.x1, det.y1, det.x2, det.y2,
147-
orient);
147+
::rnexecutorch::utils::inverseRotateBbox(
148+
det.x1, det.y1, det.x2, det.y2, orient, rotated.cols, rotated.rows);
148149
}
149150
return detections;
150151
}

packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ std::vector<types::OCRDetection> OCR::generateFromString(std::string input) {
5252
std::vector<types::OCRDetection>
5353
OCR::generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData) {
5454
auto orient = ::rnexecutorch::utils::readFrameOrientation(runtime, frameData);
55-
5655
cv::Mat frame = ::rnexecutorch::utils::frameToMat(runtime, frameData);
5756
cv::Mat bgr;
5857
#ifdef __APPLE__
@@ -64,13 +63,8 @@ OCR::generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData) {
6463
RnExecutorchErrorCode::PlatformNotSupported,
6564
"generateFromFrame is not supported on this platform");
6665
#endif
67-
std::vector<types::OCRDetection> detections = runInference(bgr);
68-
69-
for (auto &det : detections) {
70-
::rnexecutorch::utils::transformPoints(det.bbox, orient);
71-
}
72-
73-
return detections;
66+
cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(bgr, orient);
67+
return runInference(rotated);
7468
}
7569

7670
std::vector<types::OCRDetection>

packages/react-native-executorch/common/rnexecutorch/models/semantic_segmentation/BaseSemanticSegmentation.cpp

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -98,32 +98,27 @@ BaseSemanticSegmentation::generateFromFrame(
9898
std::set<std::string, std::less<>> classesOfInterest, bool resize) {
9999
auto orient = extractFrameOrientation(runtime, frameData);
100100
cv::Mat frame = extractFromFrame(runtime, frameData);
101-
auto result = runInference(frame, frame.size(), classesOfInterest, resize);
101+
cv::Mat rotated = utils::rotateFrameForModel(frame, orient);
102+
auto result = runInference(rotated, rotated.size(), classesOfInterest, resize);
102103

103-
// Pre-rotation dimensions from runInference — used to wrap raw buffers before transform.
104104
const int w = result.outputWidth;
105105
const int h = result.outputHeight;
106106

107-
// Transform argmax mask
108107
if (result.argmax && w > 0 && h > 0) {
109-
cv::Mat argmaxMat(h, w, CV_32SC1, result.argmax->data());
110-
cv::Mat transformed = utils::transformMat(argmaxMat, orient);
108+
cv::Mat m(h, w, CV_32SC1, result.argmax->data());
109+
cv::Mat inv = utils::inverseRotateMat(m, orient);
111110
result.argmax = std::make_shared<OwningArrayBuffer>(
112-
transformed.data,
113-
static_cast<size_t>(transformed.total() * transformed.elemSize()));
114-
// Update dimensions to reflect post-rotation layout (right/left swaps w↔h)
115-
result.outputWidth = transformed.cols;
116-
result.outputHeight = transformed.rows;
111+
inv.data, static_cast<size_t>(inv.total() * inv.elemSize()));
112+
result.outputWidth = inv.cols;
113+
result.outputHeight = inv.rows;
117114
}
118115

119-
// Transform each class probability buffer
120116
if (result.classBuffers && w > 0 && h > 0) {
121117
for (auto &[label, buf] : *result.classBuffers) {
122-
cv::Mat classMat(h, w, CV_32FC1, buf->data());
123-
cv::Mat transformed = utils::transformMat(classMat, orient);
118+
cv::Mat m(h, w, CV_32FC1, buf->data());
119+
cv::Mat inv = utils::inverseRotateMat(m, orient);
124120
buf = std::make_shared<OwningArrayBuffer>(
125-
transformed.data,
126-
static_cast<size_t>(transformed.total() * transformed.elemSize()));
121+
inv.data, static_cast<size_t>(inv.total() * inv.elemSize()));
127122
}
128123
}
129124

packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -67,13 +67,8 @@ VerticalOCR::generateFromFrame(jsi::Runtime &runtime,
6767
RnExecutorchErrorCode::PlatformNotSupported,
6868
"generateFromFrame is not supported on this platform");
6969
#endif
70-
std::vector<types::OCRDetection> detections = runInference(bgr);
71-
72-
for (auto &det : detections) {
73-
::rnexecutorch::utils::transformPoints(det.bbox, orient);
74-
}
75-
76-
return detections;
70+
cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(bgr, orient);
71+
return runInference(rotated);
7772
}
7873

7974
std::vector<types::OCRDetection>

packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.cpp

Lines changed: 69 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -16,44 +16,24 @@ void transformBbox(float &x1, float &y1, float &x2, float &y2,
1616
x2 = nx2;
1717
}
1818

19-
// Sensor native = landscape-left ("up" = no-op).
20-
// "up" = landscape-left: no-op.
21-
// "down" = landscape-right: 180°.
22-
// "left" = portrait: CCW (new_x = y, new_y = w - x).
23-
// "right" = upside-down portrait: CW (new_x = h - y, new_y = x).
24-
if (orient.orientation == "up") {
25-
// CW: new_x = h - y, new_y = x
26-
float nx1 = h - y2, ny1 = x1;
27-
float nx2 = h - y1, ny2 = x2;
28-
x1 = nx1; y1 = ny1;
29-
x2 = nx2; y2 = ny2;
30-
} else if (orient.orientation == "down") {
31-
// CW: new_x = h - y, new_y = x
32-
float nx1 = h - y2, ny1 = x1;
33-
float nx2 = h - y1, ny2 = x2;
34-
x1 = nx1; y1 = ny1;
35-
x2 = nx2; y2 = ny2;
36-
} else if (orient.orientation == "left") {
37-
// CW: new_x = h - y, new_y = x
19+
// Sensor native = landscape-left — apply CW rotation for all orientations.
20+
{
3821
float nx1 = h - y2, ny1 = x1;
3922
float nx2 = h - y1, ny2 = x2;
40-
x1 = nx1; y1 = ny1;
41-
x2 = nx2; y2 = ny2;
42-
} else {
43-
assert(orient.orientation == "right" && "Unknown orientation; expected up/right/left/down");
44-
// CW: new_x = h - y, new_y = x
45-
float nx1 = h - y2, ny1 = x1;
46-
float nx2 = h - y1, ny2 = x2;
47-
x1 = nx1; y1 = ny1;
48-
x2 = nx2; y2 = ny2;
23+
x1 = nx1;
24+
y1 = ny1;
25+
x2 = nx2;
26+
y2 = ny2;
4927
}
5028

5129
// Extra 180° in post-rotation screen space (screen dims are h x w after CW).
5230
if (orient.rotate180) {
5331
float nx1 = h - x2, ny1 = w - y2;
5432
float nx2 = h - x1, ny2 = w - y1;
55-
x1 = nx1; y1 = ny1;
56-
x2 = nx2; y2 = ny2;
33+
x1 = nx1;
34+
y1 = ny1;
35+
x2 = nx2;
36+
y2 = ny2;
5737
}
5838
}
5939

@@ -65,17 +45,31 @@ cv::Mat transformMat(const cv::Mat &mat, const FrameOrientation &orient) {
6545
cv::flip(result, result, 1);
6646
}
6747

68-
// Sensor native = landscape-left ("up" = no-op).
69-
if (orient.orientation == "up") {
48+
// Sensor native = landscape-left — apply CW rotation.
49+
cv::rotate(result, result, cv::ROTATE_90_CLOCKWISE);
50+
51+
if (orient.rotate180) {
52+
cv::rotate(result, result, cv::ROTATE_180);
53+
}
54+
55+
return result;
56+
}
57+
58+
cv::Mat rotateFrameForModel(const cv::Mat &mat, const FrameOrientation &orient) {
59+
cv::Mat result = mat.clone();
60+
61+
if (orient.isMirrored) {
62+
cv::flip(result, result, 1);
63+
}
64+
65+
if (orient.orientation == "left") {
7066
cv::rotate(result, result, cv::ROTATE_90_CLOCKWISE);
67+
} else if (orient.orientation == "right") {
68+
cv::rotate(result, result, cv::ROTATE_90_COUNTERCLOCKWISE);
7169
} else if (orient.orientation == "down") {
72-
cv::rotate(result, result, cv::ROTATE_90_CLOCKWISE);
73-
} else if (orient.orientation == "left") {
74-
cv::rotate(result, result, cv::ROTATE_90_CLOCKWISE);
75-
} else {
76-
assert(orient.orientation == "right" && "Unknown orientation; expected up/right/left/down");
77-
cv::rotate(result, result, cv::ROTATE_90_CLOCKWISE);
70+
cv::rotate(result, result, cv::ROTATE_180);
7871
}
72+
// "up" = no rotation needed.
7973

8074
if (orient.rotate180) {
8175
cv::rotate(result, result, cv::ROTATE_180);
@@ -84,4 +78,41 @@ cv::Mat transformMat(const cv::Mat &mat, const FrameOrientation &orient) {
8478
return result;
8579
}
8680

81+
void inverseRotateBbox(float &x1, float &y1, float &x2, float &y2,
82+
const FrameOrientation &orient, int rW, int rH) {
83+
const float w = static_cast<float>(rW);
84+
const float h = static_cast<float>(rH);
85+
86+
if (orient.orientation == "up") {
87+
// CW: nx = h - y, ny = x
88+
float nx1 = h - y2, ny1 = x1;
89+
float nx2 = h - y1, ny2 = x2;
90+
x1 = nx1; y1 = ny1; x2 = nx2; y2 = ny2;
91+
} else if (orient.orientation == "right") {
92+
// 180°: nx = w - x, ny = h - y
93+
float nx1 = w - x2, ny1 = h - y2;
94+
float nx2 = w - x1, ny2 = h - y1;
95+
x1 = nx1; y1 = ny1; x2 = nx2; y2 = ny2;
96+
} else if (orient.orientation == "down") {
97+
// CCW: nx = y, ny = w - x
98+
float nx1 = y1, ny1 = w - x2;
99+
float nx2 = y2, ny2 = w - x1;
100+
x1 = nx1; y1 = ny1; x2 = nx2; y2 = ny2;
101+
}
102+
// "left": no-op
103+
}
104+
105+
cv::Mat inverseRotateMat(const cv::Mat &mat, const FrameOrientation &orient) {
  // Rotate a mat from the model-input (rotated) frame back into screen space;
  // counterpart of inverseRotateBbox for whole matrices.
  // Returns a new mat; the input is left untouched.
  cv::Mat out = mat.clone();
  const auto &o = orient.orientation;
  if (o == "up") {
    cv::rotate(out, out, cv::ROTATE_90_CLOCKWISE);
  } else if (o == "right") {
    cv::rotate(out, out, cv::ROTATE_180);
  } else if (o == "down") {
    cv::rotate(out, out, cv::ROTATE_90_COUNTERCLOCKWISE);
  }
  // "left" (portrait upright): no rotation needed.
  return out;
}
117+
87118
} // namespace rnexecutorch::utils

packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.h

Lines changed: 34 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,37 @@ void transformBbox(float &x1, float &y1, float &x2, float &y2,
3838
*/
3939
cv::Mat transformMat(const cv::Mat &mat, const FrameOrientation &orient);
4040

41+
/**
42+
* @brief Rotate/flip a cv::Mat so the model sees an upright image.
43+
*
44+
* Applies the correct rotation per orientation so the output matches how a
45+
* human would see the scene, regardless of device orientation:
46+
* "up" (landscape-left) → no rotation
47+
* "down" (landscape-right) → 180°
48+
* "left" (portrait upright) → CW
49+
* "right" (portrait upside-down) → CCW
50+
* Also applies isMirrored flip and rotate180 (iOS front camera correction).
51+
* Returns a new mat (does not modify input).
52+
*/
53+
cv::Mat rotateFrameForModel(const cv::Mat &mat, const FrameOrientation &orient);
54+
55+
/**
56+
* @brief Map bbox coords from rotated-frame space back to screen space.
57+
*
58+
 * Not a strict inverse of rotateFrameForModel: it composes that inverse
 * with the fixed CW sensor→screen rotation ("left" → no-op, "up" → CW).
59+
* rW/rH are the rotated frame dimensions (rotated.cols / rotated.rows).
60+
*/
61+
void inverseRotateBbox(float &x1, float &y1, float &x2, float &y2,
62+
const FrameOrientation &orient, int rW, int rH);
63+
64+
/**
65+
* @brief Rotate a cv::Mat from rotated-frame space back to screen space.
66+
*
67+
 * Counterpart of inverseRotateBbox for whole matrices: composes the
 * inverse of rotateFrameForModel with the fixed CW sensor→screen
 * rotation ("left" → no-op, "up" → CW).
68+
* Returns a new mat (does not modify input).
69+
*/
70+
cv::Mat inverseRotateMat(const cv::Mat &mat, const FrameOrientation &orient);
71+
4172
/**
4273
* @brief Transform 4-point bbox from raw frame pixel space to screen space.
4374
*
@@ -60,26 +91,9 @@ void transformPoints(std::array<P, 4> &points,
6091
x = w - x;
6192
}
6293

63-
// Sensor native = landscape-left.
64-
float nx = x, ny = y;
65-
if (orient.orientation == "up") {
66-
// CW: new_x = h - y, new_y = x
67-
nx = h - y;
68-
ny = x;
69-
} else if (orient.orientation == "down") {
70-
// CW: new_x = h - y, new_y = x
71-
nx = h - y;
72-
ny = x;
73-
} else if (orient.orientation == "left") {
74-
// CW: new_x = h - y, new_y = x
75-
nx = h - y;
76-
ny = x;
77-
} else if (orient.orientation == "right") {
78-
// CW: new_x = h - y, new_y = x
79-
nx = h - y;
80-
ny = x;
81-
}
82-
// "up" = landscape-left: no-op
94+
// Sensor native = landscape-left — apply CW rotation for all orientations.
95+
float nx = h - y;
96+
float ny = x;
8397

8498
if (orient.rotate180) {
8599
nx = h - nx;

packages/react-native-executorch/src/modules/computer_vision/VisionModule.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ import { BaseModule } from '../BaseModule';
22
import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
33
import { RnExecutorchError } from '../../errors/errorUtils';
44
import { Frame, PixelData, ScalarType } from '../../types/common';
5-
import { Platform } from 'react-native';
65

76
export function isPixelData(input: unknown): input is PixelData {
87
return (
@@ -77,7 +76,7 @@ export abstract class VisionModule<TOutput> extends BaseModule {
7776
let nativeBuffer: any = null;
7877
try {
7978
nativeBuffer = frame.getNativeBuffer();
80-
console.log(frame.orientation);
79+
console.log(frame.orientation, frame.width, frame.height);
8180
const frameData = {
8281
nativeBuffer: nativeBuffer.pointer,
8382
orientation: frame.orientation,

0 commit comments

Comments
 (0)