Skip to content

Commit 21e8edf

Browse files
feat: model sees the same thing as user approach
1 parent 3da656a commit 21e8edf

11 files changed

Lines changed: 138 additions & 98 deletions

File tree

apps/computer-vision/app/vision_camera/index.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ export default function VisionCameraScreen() {
202202
outputs={frameOutput ? [frameOutput] : []}
203203
isActive={isFocused}
204204
format={format}
205-
orientationSource="interface"
205+
orientationSource="device"
206206
/>
207207

208208
{/* Layout sentinel — measures the full-screen area for bbox/canvas sizing */}

apps/computer-vision/components/vision_camera/tasks/ObjectDetectionTask.tsx

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ export default function ObjectDetectionTask({
7171
pixelFormat: 'rgb',
7272
dropFramesWhileBusy: true,
7373
enablePreviewSizedOutputBuffers: true,
74+
7475
onFrame: useCallback(
7576
(frame: Frame) => {
7677
'worklet';
@@ -80,10 +81,10 @@ export default function ObjectDetectionTask({
8081
}
8182
try {
8283
if (!detRof) return;
83-
// C++ always does CW rotation, so output space is always frameH × frameW
84+
const result = detRof(frame, cameraPositionSync.getDirty(), 0.5);
85+
// C++ maps coords to screen space (portrait: frameH × frameW)
8486
const screenW = frame.height;
8587
const screenH = frame.width;
86-
const result = detRof(frame, cameraPositionSync.getDirty(), 0.5);
8788
if (result) {
8889
scheduleOnRN(updateDetections, {
8990
results: result,

apps/computer-vision/components/vision_camera/tasks/SegmentationTask.tsx

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ export default function SegmentationTask({
140140
pixelFormat: 'rgb',
141141
dropFramesWhileBusy: true,
142142
enablePreviewSizedOutputBuffers: true,
143+
143144
onFrame: useCallback(
144145
(frame: Frame) => {
145146
'worklet';
@@ -149,7 +150,12 @@ export default function SegmentationTask({
149150
}
150151
try {
151152
if (!segRof) return;
152-
const result = segRof(frame, cameraPositionSync.getDirty(), [], false);
153+
const result = segRof(
154+
frame,
155+
cameraPositionSync.getDirty(),
156+
[],
157+
false
158+
);
153159
if (result?.ARGMAX) {
154160
const argmax: Int32Array = result.ARGMAX;
155161
const side = Math.round(Math.sqrt(argmax.length));

packages/react-native-executorch/common/rnexecutorch/models/object_detection/ObjectDetection.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,10 +141,11 @@ ObjectDetection::generateFromFrame(jsi::Runtime &runtime,
141141
double detectionThreshold) {
142142
auto orient = extractFrameOrientation(runtime, frameData);
143143
cv::Mat frame = extractFromFrame(runtime, frameData);
144-
auto detections = runInference(frame, detectionThreshold);
144+
cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(frame, orient);
145+
auto detections = runInference(rotated, detectionThreshold);
145146
for (auto &det : detections) {
146-
::rnexecutorch::utils::transformBbox(det.x1, det.y1, det.x2, det.y2,
147-
orient);
147+
::rnexecutorch::utils::inverseRotateBbox(
148+
det.x1, det.y1, det.x2, det.y2, orient, rotated.cols, rotated.rows);
148149
}
149150
return detections;
150151
}

packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ std::vector<types::OCRDetection> OCR::generateFromString(std::string input) {
5252
std::vector<types::OCRDetection>
5353
OCR::generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData) {
5454
auto orient = ::rnexecutorch::utils::readFrameOrientation(runtime, frameData);
55-
5655
cv::Mat frame = ::rnexecutorch::utils::frameToMat(runtime, frameData);
5756
cv::Mat bgr;
5857
#ifdef __APPLE__
@@ -64,13 +63,8 @@ OCR::generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData) {
6463
RnExecutorchErrorCode::PlatformNotSupported,
6564
"generateFromFrame is not supported on this platform");
6665
#endif
67-
std::vector<types::OCRDetection> detections = runInference(bgr);
68-
69-
for (auto &det : detections) {
70-
::rnexecutorch::utils::transformPoints(det.bbox, orient);
71-
}
72-
73-
return detections;
66+
cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(bgr, orient);
67+
return runInference(rotated);
7468
}
7569

7670
std::vector<types::OCRDetection>

packages/react-native-executorch/common/rnexecutorch/models/semantic_segmentation/BaseSemanticSegmentation.cpp

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -98,32 +98,27 @@ BaseSemanticSegmentation::generateFromFrame(
9898
std::set<std::string, std::less<>> classesOfInterest, bool resize) {
9999
auto orient = extractFrameOrientation(runtime, frameData);
100100
cv::Mat frame = extractFromFrame(runtime, frameData);
101-
auto result = runInference(frame, frame.size(), classesOfInterest, resize);
101+
cv::Mat rotated = utils::rotateFrameForModel(frame, orient);
102+
auto result = runInference(rotated, rotated.size(), classesOfInterest, resize);
102103

103-
// Pre-rotation dimensions from runInference — used to wrap raw buffers before transform.
104104
const int w = result.outputWidth;
105105
const int h = result.outputHeight;
106106

107-
// Transform argmax mask
108107
if (result.argmax && w > 0 && h > 0) {
109-
cv::Mat argmaxMat(h, w, CV_32SC1, result.argmax->data());
110-
cv::Mat transformed = utils::transformMat(argmaxMat, orient);
108+
cv::Mat m(h, w, CV_32SC1, result.argmax->data());
109+
cv::Mat inv = utils::inverseRotateMat(m, orient);
111110
result.argmax = std::make_shared<OwningArrayBuffer>(
112-
transformed.data,
113-
static_cast<size_t>(transformed.total() * transformed.elemSize()));
114-
// Update dimensions to reflect post-rotation layout (right/left swaps w↔h)
115-
result.outputWidth = transformed.cols;
116-
result.outputHeight = transformed.rows;
111+
inv.data, static_cast<size_t>(inv.total() * inv.elemSize()));
112+
result.outputWidth = inv.cols;
113+
result.outputHeight = inv.rows;
117114
}
118115

119-
// Transform each class probability buffer
120116
if (result.classBuffers && w > 0 && h > 0) {
121117
for (auto &[label, buf] : *result.classBuffers) {
122-
cv::Mat classMat(h, w, CV_32FC1, buf->data());
123-
cv::Mat transformed = utils::transformMat(classMat, orient);
118+
cv::Mat m(h, w, CV_32FC1, buf->data());
119+
cv::Mat inv = utils::inverseRotateMat(m, orient);
124120
buf = std::make_shared<OwningArrayBuffer>(
125-
transformed.data,
126-
static_cast<size_t>(transformed.total() * transformed.elemSize()));
121+
inv.data, static_cast<size_t>(inv.total() * inv.elemSize()));
127122
}
128123
}
129124

packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -67,13 +67,8 @@ VerticalOCR::generateFromFrame(jsi::Runtime &runtime,
6767
RnExecutorchErrorCode::PlatformNotSupported,
6868
"generateFromFrame is not supported on this platform");
6969
#endif
70-
std::vector<types::OCRDetection> detections = runInference(bgr);
71-
72-
for (auto &det : detections) {
73-
::rnexecutorch::utils::transformPoints(det.bbox, orient);
74-
}
75-
76-
return detections;
70+
cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(bgr, orient);
71+
return runInference(rotated);
7772
}
7873

7974
std::vector<types::OCRDetection>

packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.cpp

Lines changed: 69 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -16,44 +16,24 @@ void transformBbox(float &x1, float &y1, float &x2, float &y2,
1616
x2 = nx2;
1717
}
1818

19-
// Sensor native = landscape-left ("up" = no-op).
20-
// "up" = landscape-left: no-op.
21-
// "down" = landscape-right: 180°.
22-
// "left" = portrait: CCW (new_x = y, new_y = w - x).
23-
// "right" = upside-down portrait: CW (new_x = h - y, new_y = x).
24-
if (orient.orientation == "up") {
25-
// CW: new_x = h - y, new_y = x
26-
float nx1 = h - y2, ny1 = x1;
27-
float nx2 = h - y1, ny2 = x2;
28-
x1 = nx1; y1 = ny1;
29-
x2 = nx2; y2 = ny2;
30-
} else if (orient.orientation == "down") {
31-
// CW: new_x = h - y, new_y = x
32-
float nx1 = h - y2, ny1 = x1;
33-
float nx2 = h - y1, ny2 = x2;
34-
x1 = nx1; y1 = ny1;
35-
x2 = nx2; y2 = ny2;
36-
} else if (orient.orientation == "left") {
37-
// CW: new_x = h - y, new_y = x
19+
// Sensor native = landscape-left — apply CW rotation for all orientations.
20+
{
3821
float nx1 = h - y2, ny1 = x1;
3922
float nx2 = h - y1, ny2 = x2;
40-
x1 = nx1; y1 = ny1;
41-
x2 = nx2; y2 = ny2;
42-
} else {
43-
assert(orient.orientation == "right" && "Unknown orientation; expected up/right/left/down");
44-
// CW: new_x = h - y, new_y = x
45-
float nx1 = h - y2, ny1 = x1;
46-
float nx2 = h - y1, ny2 = x2;
47-
x1 = nx1; y1 = ny1;
48-
x2 = nx2; y2 = ny2;
23+
x1 = nx1;
24+
y1 = ny1;
25+
x2 = nx2;
26+
y2 = ny2;
4927
}
5028

5129
// Extra 180° in post-rotation screen space (screen dims are h x w after CW).
5230
if (orient.rotate180) {
5331
float nx1 = h - x2, ny1 = w - y2;
5432
float nx2 = h - x1, ny2 = w - y1;
55-
x1 = nx1; y1 = ny1;
56-
x2 = nx2; y2 = ny2;
33+
x1 = nx1;
34+
y1 = ny1;
35+
x2 = nx2;
36+
y2 = ny2;
5737
}
5838
}
5939

@@ -65,17 +45,31 @@ cv::Mat transformMat(const cv::Mat &mat, const FrameOrientation &orient) {
6545
cv::flip(result, result, 1);
6646
}
6747

68-
// Sensor native = landscape-left ("up" = no-op).
69-
if (orient.orientation == "up") {
48+
// Sensor native = landscape-left — apply CW rotation.
49+
cv::rotate(result, result, cv::ROTATE_90_CLOCKWISE);
50+
51+
if (orient.rotate180) {
52+
cv::rotate(result, result, cv::ROTATE_180);
53+
}
54+
55+
return result;
56+
}
57+
58+
cv::Mat rotateFrameForModel(const cv::Mat &mat, const FrameOrientation &orient) {
59+
cv::Mat result = mat.clone();
60+
61+
if (orient.isMirrored) {
62+
cv::flip(result, result, 1);
63+
}
64+
65+
if (orient.orientation == "left") {
7066
cv::rotate(result, result, cv::ROTATE_90_CLOCKWISE);
67+
} else if (orient.orientation == "right") {
68+
cv::rotate(result, result, cv::ROTATE_90_COUNTERCLOCKWISE);
7169
} else if (orient.orientation == "down") {
72-
cv::rotate(result, result, cv::ROTATE_90_CLOCKWISE);
73-
} else if (orient.orientation == "left") {
74-
cv::rotate(result, result, cv::ROTATE_90_CLOCKWISE);
75-
} else {
76-
assert(orient.orientation == "right" && "Unknown orientation; expected up/right/left/down");
77-
cv::rotate(result, result, cv::ROTATE_90_CLOCKWISE);
70+
cv::rotate(result, result, cv::ROTATE_180);
7871
}
72+
// "up" = no rotation needed.
7973

8074
if (orient.rotate180) {
8175
cv::rotate(result, result, cv::ROTATE_180);
@@ -84,4 +78,41 @@ cv::Mat transformMat(const cv::Mat &mat, const FrameOrientation &orient) {
8478
return result;
8579
}
8680

81+
void inverseRotateBbox(float &x1, float &y1, float &x2, float &y2,
82+
const FrameOrientation &orient, int rW, int rH) {
83+
const float w = static_cast<float>(rW);
84+
const float h = static_cast<float>(rH);
85+
86+
if (orient.orientation == "up") {
87+
// CW: nx = h - y, ny = x
88+
float nx1 = h - y2, ny1 = x1;
89+
float nx2 = h - y1, ny2 = x2;
90+
x1 = nx1; y1 = ny1; x2 = nx2; y2 = ny2;
91+
} else if (orient.orientation == "right") {
92+
// 180°: nx = w - x, ny = h - y
93+
float nx1 = w - x2, ny1 = h - y2;
94+
float nx2 = w - x1, ny2 = h - y1;
95+
x1 = nx1; y1 = ny1; x2 = nx2; y2 = ny2;
96+
} else if (orient.orientation == "down") {
97+
// CCW: nx = y, ny = w - x
98+
float nx1 = y1, ny1 = w - x2;
99+
float nx2 = y2, ny2 = w - x1;
100+
x1 = nx1; y1 = ny1; x2 = nx2; y2 = ny2;
101+
}
102+
// "left": no-op
103+
}
104+
105+
cv::Mat inverseRotateMat(const cv::Mat &mat, const FrameOrientation &orient) {
  // Rotate a mat from the model-input (rotated) frame back into screen space;
  // counterpart of inverseRotateBbox for whole matrices.
  // Returns a new mat; the input is left untouched.
  cv::Mat out = mat.clone();
  const auto &o = orient.orientation;
  if (o == "up") {
    cv::rotate(out, out, cv::ROTATE_90_CLOCKWISE);
  } else if (o == "right") {
    cv::rotate(out, out, cv::ROTATE_180);
  } else if (o == "down") {
    cv::rotate(out, out, cv::ROTATE_90_COUNTERCLOCKWISE);
  }
  // "left" (portrait upright): no rotation needed.
  return out;
}
117+
87118
} // namespace rnexecutorch::utils

packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.h

Lines changed: 34 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,37 @@ void transformBbox(float &x1, float &y1, float &x2, float &y2,
3838
*/
3939
cv::Mat transformMat(const cv::Mat &mat, const FrameOrientation &orient);
4040

41+
/**
42+
* @brief Rotate/flip a cv::Mat so the model sees an upright image.
43+
*
44+
* Applies the correct rotation per orientation so the output matches how a
45+
* human would see the scene, regardless of device orientation:
46+
* "up" (landscape-left) → no rotation
47+
* "down" (landscape-right) → 180°
48+
* "left" (portrait upright) → CW
49+
* "right" (portrait upside-down) → CCW
50+
* Also applies isMirrored flip and rotate180 (iOS front camera correction).
51+
* Returns a new mat (does not modify input).
52+
*/
53+
cv::Mat rotateFrameForModel(const cv::Mat &mat, const FrameOrientation &orient);
54+
55+
/**
56+
* @brief Map bbox coords from rotated-frame space back to screen space.
57+
*
58+
 * Not a strict inverse of rotateFrameForModel: it composes that inverse
 * with the fixed CW sensor→screen rotation ("left" → no-op, "up" → CW).
59+
* rW/rH are the rotated frame dimensions (rotated.cols / rotated.rows).
60+
*/
61+
void inverseRotateBbox(float &x1, float &y1, float &x2, float &y2,
62+
const FrameOrientation &orient, int rW, int rH);
63+
64+
/**
65+
* @brief Rotate a cv::Mat from rotated-frame space back to screen space.
66+
*
67+
 * Counterpart of inverseRotateBbox for whole matrices: composes the
 * inverse of rotateFrameForModel with the fixed CW sensor→screen
 * rotation ("left" → no-op, "up" → CW).
68+
* Returns a new mat (does not modify input).
69+
*/
70+
cv::Mat inverseRotateMat(const cv::Mat &mat, const FrameOrientation &orient);
71+
4172
/**
4273
* @brief Transform 4-point bbox from raw frame pixel space to screen space.
4374
*
@@ -60,26 +91,9 @@ void transformPoints(std::array<P, 4> &points,
6091
x = w - x;
6192
}
6293

63-
// Sensor native = landscape-left.
64-
float nx = x, ny = y;
65-
if (orient.orientation == "up") {
66-
// CW: new_x = h - y, new_y = x
67-
nx = h - y;
68-
ny = x;
69-
} else if (orient.orientation == "down") {
70-
// CW: new_x = h - y, new_y = x
71-
nx = h - y;
72-
ny = x;
73-
} else if (orient.orientation == "left") {
74-
// CW: new_x = h - y, new_y = x
75-
nx = h - y;
76-
ny = x;
77-
} else if (orient.orientation == "right") {
78-
// CW: new_x = h - y, new_y = x
79-
nx = h - y;
80-
ny = x;
81-
}
82-
// "up" = landscape-left: no-op
94+
// Sensor native = landscape-left — apply CW rotation for all orientations.
95+
float nx = h - y;
96+
float ny = x;
8397

8498
if (orient.rotate180) {
8599
nx = h - nx;

packages/react-native-executorch/src/modules/computer_vision/VisionModule.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ import { BaseModule } from '../BaseModule';
22
import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
33
import { RnExecutorchError } from '../../errors/errorUtils';
44
import { Frame, PixelData, ScalarType } from '../../types/common';
5-
import { Platform } from 'react-native';
65

76
export function isPixelData(input: unknown): input is PixelData {
87
return (
@@ -77,7 +76,7 @@ export abstract class VisionModule<TOutput> extends BaseModule {
7776
let nativeBuffer: any = null;
7877
try {
7978
nativeBuffer = frame.getNativeBuffer();
80-
console.log(frame.orientation);
79+
console.log(frame.orientation, frame.width, frame.height);
8180
const frameData = {
8281
nativeBuffer: nativeBuffer.pointer,
8382
orientation: frame.orientation,

0 commit comments

Comments
 (0)