Skip to content

Commit 2d535a4

Browse files
feat: handle all orientations when using vision camera (#980)
## Description - Add `FrameTransform` utilities (`rotateFrameForModel`, `inverseRotateBbox`, `inverseRotateMat`, `inverseRotatePoints`) that pre-rotate camera frames so vision models always see upright images, then map results back to screen space - Support all device orientations (portrait, landscape-left/right, upside-down), front/back cameras, and iOS vs Android platform differences - Pass `isFrontCamera` via `Synchronizable` (worklet-compatible) instead of React props, with iOS front-camera 180° fix (`#if __APPLE__` guarded) - Add 342 lines of unit tests covering all orientations for rotate/inverse functions - Add example app tasks for OCR and style transfer, update existing tasks ### Introduces a breaking change? - [ ] Yes - [x] No ### Type of change - [x] Bug fix (change which fixes an issue) - [ ] New feature (change which adds functionality) - [ ] Documentation update (improves or adds clarity to existing documentation) - [ ] Other (chores, tests, code style improvements etc.) ### Tested on - [x] iOS - [x] Android ### Testing instructions Run computer vision example app, test all models (back/front camera), rotate the phone ### Screenshots <!-- Add screenshots here, if applicable --> ### Related issues <!-- Link related issues here using #issue-number --> ### Checklist - [x] I have performed a self-review of my code - [x] I have commented my code, particularly in hard-to-understand areas - [x] I have updated the documentation accordingly - [x] My changes generate no new warnings ### Additional notes <!-- Include any additional information, assumptions, or context that reviewers might need to understand this PR. --> --------- Co-authored-by: Mateusz Sluszniak <56299341+msluszniak@users.noreply.github.com>
1 parent e86ec7b commit 2d535a4

File tree

37 files changed

+1308
-176
lines changed

37 files changed

+1308
-176
lines changed

apps/computer-vision/app/vision_camera/index.tsx

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,16 @@ import ClassificationTask from '../../components/vision_camera/tasks/Classificat
3232
import ObjectDetectionTask from '../../components/vision_camera/tasks/ObjectDetectionTask';
3333
import SegmentationTask from '../../components/vision_camera/tasks/SegmentationTask';
3434
import InstanceSegmentationTask from '../../components/vision_camera/tasks/InstanceSegmentationTask';
35+
import OCRTask from '../../components/vision_camera/tasks/OCRTask';
36+
import StyleTransferTask from '../../components/vision_camera/tasks/StyleTransferTask';
3537

3638
type TaskId =
3739
| 'classification'
3840
| 'objectDetection'
3941
| 'segmentation'
40-
| 'instanceSegmentation';
42+
| 'instanceSegmentation'
43+
| 'ocr'
44+
| 'styleTransfer';
4145
type ModelId =
4246
| 'classification'
4347
| 'objectDetectionSsdlite'
@@ -50,7 +54,10 @@ type ModelId =
5054
| 'segmentationFcnResnet101'
5155
| 'segmentationSelfie'
5256
| 'instanceSegmentationYolo26n'
53-
| 'instanceSegmentationRfdetr';
57+
| 'instanceSegmentationRfdetr'
58+
| 'ocr'
59+
| 'styleTransferCandy'
60+
| 'styleTransferMosaic';
5461

5562
type TaskVariant = { id: ModelId; label: string };
5663
type Task = { id: TaskId; label: string; variants: TaskVariant[] };
@@ -90,11 +97,25 @@ const TASKS: Task[] = [
9097
{ id: 'objectDetectionRfdetr', label: 'RF-DETR Nano' },
9198
],
9299
},
100+
{
101+
id: 'ocr',
102+
label: 'OCR',
103+
variants: [{ id: 'ocr', label: 'English' }],
104+
},
105+
{
106+
id: 'styleTransfer',
107+
label: 'Style',
108+
variants: [
109+
{ id: 'styleTransferCandy', label: 'Candy' },
110+
{ id: 'styleTransferMosaic', label: 'Mosaic' },
111+
],
112+
},
93113
];
94114

95-
// Module-level const so worklets in task components can always reference the same stable object.
115+
// Module-level consts so worklets in task components can always reference the same stable objects.
96116
// Never replaced — only mutated via setBlocking to avoid closure staleness.
97117
const frameKillSwitch = createSynchronizable(false);
118+
const cameraPositionSync = createSynchronizable<'front' | 'back'>('back');
98119

99120
export default function VisionCameraScreen() {
100121
const insets = useSafeAreaInsets();
@@ -121,7 +142,7 @@ export default function VisionCameraScreen() {
121142
const format = useMemo(() => {
122143
if (device == null) return undefined;
123144
try {
124-
return getCameraFormat(device, Templates.FrameProcessing);
145+
return getCameraFormat(device, { ...Templates.FrameProcessing });
125146
} catch {
126147
return undefined;
127148
}
@@ -135,6 +156,10 @@ export default function VisionCameraScreen() {
135156
return () => clearTimeout(id);
136157
}, [activeModel]);
137158

159+
useEffect(() => {
160+
cameraPositionSync.setBlocking(cameraPosition);
161+
}, [cameraPosition]);
162+
138163
const handleFpsChange = useCallback((newFps: number, newMs: number) => {
139164
setFps(newFps);
140165
setFrameMs(newMs);
@@ -177,7 +202,7 @@ export default function VisionCameraScreen() {
177202
const taskProps = {
178203
activeModel,
179204
canvasSize,
180-
cameraPosition,
205+
cameraPositionSync,
181206
frameKillSwitch,
182207
onFrameOutputChange: setFrameOutput,
183208
onReadyChange: setIsReady,
@@ -196,7 +221,7 @@ export default function VisionCameraScreen() {
196221
outputs={frameOutput ? [frameOutput] : []}
197222
isActive={isFocused}
198223
format={format}
199-
orientationSource="interface"
224+
orientationSource="device"
200225
/>
201226

202227
{/* Layout sentinel — measures the full-screen area for bbox/canvas sizing */}
@@ -245,6 +270,15 @@ export default function VisionCameraScreen() {
245270
}
246271
/>
247272
)}
273+
{activeTask === 'ocr' && <OCRTask {...taskProps} />}
274+
{activeTask === 'styleTransfer' && (
275+
<StyleTransferTask
276+
{...taskProps}
277+
activeModel={
278+
activeModel as 'styleTransferCandy' | 'styleTransferMosaic'
279+
}
280+
/>
281+
)}
248282

249283
{!isReady && (
250284
<View style={styles.loadingOverlay}>

apps/computer-vision/components/vision_camera/tasks/ClassificationTask.tsx

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@ import { scheduleOnRN } from 'react-native-worklets';
55
import { EFFICIENTNET_V2_S, useClassification } from 'react-native-executorch';
66
import { TaskProps } from './types';
77

8-
type Props = Omit<TaskProps, 'activeModel' | 'canvasSize' | 'cameraPosition'>;
8+
type Props = Omit<
9+
TaskProps,
10+
'activeModel' | 'canvasSize' | 'cameraPositionSync'
11+
>;
912

1013
export default function ClassificationTask({
1114
frameKillSwitch,
@@ -47,6 +50,7 @@ export default function ClassificationTask({
4750
const frameOutput = useFrameOutput({
4851
pixelFormat: 'rgb',
4952
dropFramesWhileBusy: true,
53+
enablePreviewSizedOutputBuffers: true,
5054
onFrame: useCallback(
5155
(frame: Frame) => {
5256
'worklet';
@@ -71,7 +75,7 @@ export default function ClassificationTask({
7175
scheduleOnRN(updateClass, { label: bestLabel, score: bestScore });
7276
}
7377
} catch {
74-
// ignore
78+
// Frame may be disposed before processing completes — transient, safe to ignore.
7579
} finally {
7680
frame.dispose();
7781
}

apps/computer-vision/components/vision_camera/tasks/InstanceSegmentationTask.tsx

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ type Props = TaskProps & { activeModel: InstSegModelId };
2727
export default function InstanceSegmentationTask({
2828
activeModel,
2929
canvasSize,
30-
cameraPosition,
30+
cameraPositionSync,
3131
frameKillSwitch,
3232
onFrameOutputChange,
3333
onReadyChange,
@@ -96,6 +96,7 @@ export default function InstanceSegmentationTask({
9696
const frameOutput = useFrameOutput({
9797
pixelFormat: 'rgb',
9898
dropFramesWhileBusy: true,
99+
enablePreviewSizedOutputBuffers: true,
99100
onFrame: useCallback(
100101
(frame: Frame) => {
101102
'worklet';
@@ -105,9 +106,10 @@ export default function InstanceSegmentationTask({
105106
}
106107
try {
107108
if (!instSegRof) return;
109+
const isFrontCamera = cameraPositionSync.getDirty() === 'front';
108110
const iw = frame.width > frame.height ? frame.height : frame.width;
109111
const ih = frame.width > frame.height ? frame.width : frame.height;
110-
const result = instSegRof(frame, {
112+
const result = instSegRof(frame, isFrontCamera, {
111113
confidenceThreshold: 0.5,
112114
iouThreshold: 0.5,
113115
maxInstances: 5,
@@ -129,7 +131,13 @@ export default function InstanceSegmentationTask({
129131
frame.dispose();
130132
}
131133
},
132-
[instSegRof, frameKillSwitch, updateInstances, activeModel]
134+
[
135+
instSegRof,
136+
frameKillSwitch,
137+
updateInstances,
138+
activeModel,
139+
cameraPositionSync,
140+
]
133141
),
134142
});
135143

@@ -145,13 +153,7 @@ export default function InstanceSegmentationTask({
145153
const offsetY = (canvasSize.height - imageSize.height * scale) / 2;
146154

147155
return (
148-
<View
149-
style={[
150-
StyleSheet.absoluteFill,
151-
cameraPosition === 'front' && { transform: [{ scaleX: -1 }] },
152-
]}
153-
pointerEvents="none"
154-
>
156+
<View style={StyleSheet.absoluteFill} pointerEvents="none">
155157
{/* Render masks */}
156158
<Canvas style={StyleSheet.absoluteFill} pointerEvents="none">
157159
{instances.map((inst, i) => {
@@ -197,7 +199,6 @@ export default function InstanceSegmentationTask({
197199
style={[
198200
styles.bboxLabel,
199201
{ backgroundColor: labelColorBg(label) },
200-
cameraPosition === 'front' && { transform: [{ scaleX: -1 }] },
201202
]}
202203
>
203204
<Text style={styles.bboxLabelText}>
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
import React, { useCallback, useEffect, useRef, useState } from 'react';
2+
import { StyleSheet, View } from 'react-native';
3+
import { Frame, useFrameOutput } from 'react-native-vision-camera';
4+
import { scheduleOnRN } from 'react-native-worklets';
5+
import { OCR_ENGLISH, OCRDetection, useOCR } from 'react-native-executorch';
6+
import Svg, { Polygon, Text as SvgText } from 'react-native-svg';
7+
import { TaskProps } from './types';
8+
9+
type Props = Omit<TaskProps, 'activeModel'>;
10+
11+
export default function OCRTask({
12+
canvasSize,
13+
cameraPositionSync,
14+
frameKillSwitch,
15+
onFrameOutputChange,
16+
onReadyChange,
17+
onProgressChange,
18+
onGeneratingChange,
19+
onFpsChange,
20+
}: Props) {
21+
const model = useOCR({ model: OCR_ENGLISH });
22+
const [detections, setDetections] = useState<OCRDetection[]>([]);
23+
const [imageSize, setImageSize] = useState({ width: 1, height: 1 });
24+
const lastFrameTimeRef = useRef(Date.now());
25+
26+
useEffect(() => {
27+
onReadyChange(model.isReady);
28+
}, [model.isReady, onReadyChange]);
29+
30+
useEffect(() => {
31+
onProgressChange(model.downloadProgress);
32+
}, [model.downloadProgress, onProgressChange]);
33+
34+
useEffect(() => {
35+
onGeneratingChange(model.isGenerating);
36+
}, [model.isGenerating, onGeneratingChange]);
37+
38+
const ocrRof = model.runOnFrame;
39+
40+
const updateDetections = useCallback(
41+
(p: { results: OCRDetection[]; frameW: number; frameH: number }) => {
42+
setDetections(p.results);
43+
setImageSize({ width: p.frameW, height: p.frameH });
44+
const now = Date.now();
45+
const diff = now - lastFrameTimeRef.current;
46+
if (diff > 0) onFpsChange(Math.round(1000 / diff), diff);
47+
lastFrameTimeRef.current = now;
48+
},
49+
[onFpsChange]
50+
);
51+
52+
const frameOutput = useFrameOutput({
53+
pixelFormat: 'rgb',
54+
dropFramesWhileBusy: true,
55+
enablePreviewSizedOutputBuffers: true,
56+
onFrame: useCallback(
57+
(frame: Frame) => {
58+
'worklet';
59+
if (frameKillSwitch.getDirty()) {
60+
frame.dispose();
61+
return;
62+
}
63+
try {
64+
if (!ocrRof) return;
65+
const isFrontCamera = cameraPositionSync.getDirty() === 'front';
66+
const result = ocrRof(frame, isFrontCamera);
67+
if (result) {
68+
// Sensor frames are landscape-native, so width/height are swapped
69+
// relative to portrait screen orientation.
70+
scheduleOnRN(updateDetections, {
71+
results: result,
72+
frameW: frame.height,
73+
frameH: frame.width,
74+
});
75+
}
76+
} catch {
77+
// Frame may be disposed before processing completes — transient, safe to ignore.
78+
} finally {
79+
frame.dispose();
80+
}
81+
},
82+
[cameraPositionSync, frameKillSwitch, ocrRof, updateDetections]
83+
),
84+
});
85+
86+
useEffect(() => {
87+
onFrameOutputChange(frameOutput);
88+
}, [frameOutput, onFrameOutputChange]);
89+
90+
const scale = Math.max(
91+
canvasSize.width / imageSize.width,
92+
canvasSize.height / imageSize.height
93+
);
94+
const offsetX = (canvasSize.width - imageSize.width * scale) / 2;
95+
const offsetY = (canvasSize.height - imageSize.height * scale) / 2;
96+
97+
if (!detections.length) return null;
98+
99+
return (
100+
<View style={StyleSheet.absoluteFill} pointerEvents="none">
101+
<Svg
102+
width={canvasSize.width}
103+
height={canvasSize.height}
104+
style={StyleSheet.absoluteFill}
105+
>
106+
{detections.map((det, i) => {
107+
const pts = det.bbox
108+
.map((p) => `${p.x * scale + offsetX},${p.y * scale + offsetY}`)
109+
.join(' ');
110+
const labelX = det.bbox[0]!.x * scale + offsetX;
111+
const labelY = det.bbox[0]!.y * scale + offsetY - 4;
112+
return (
113+
<React.Fragment key={i}>
114+
<Polygon points={pts} fill="none" stroke="cyan" strokeWidth={2} />
115+
<SvgText
116+
x={labelX}
117+
y={labelY}
118+
fill="white"
119+
fontSize={12}
120+
fontWeight="bold"
121+
>
122+
{det.text}
123+
</SvgText>
124+
</React.Fragment>
125+
);
126+
})}
127+
</Svg>
128+
</View>
129+
);
130+
}

0 commit comments

Comments (0)