Switch to neon for interleave (pytorch#20137)

metascroy · web-flow · commit 92e6a4ced262 · 2026-06-10T16:43:08.000Z
Differential Revision: D107958353 Pull Request resolved: pytorch#20137
diff --git a/extension/image/CMakeLists.txt b/extension/image/CMakeLists.txt
@@ -9,8 +9,8 @@ cmake_minimum_required(VERSION 3.19)
 if(APPLE)
   enable_language(OBJCXX)
   add_library(
-    extension_image image_processor_common.cpp image_processor_apple.cpp
-                    image_processor_apple_gpu.mm
+    extension_image image_processor_common.cpp image_processor_simd.cpp
+                    image_processor_apple.cpp image_processor_apple_gpu.mm
   )
   set_source_files_properties(
     image_processor_apple_gpu.mm PROPERTIES COMPILE_FLAGS "-fobjc-arc"
@@ -39,7 +39,10 @@ else()
   )
   FetchContent_MakeAvailable(stb)
 
-  add_library(extension_image image_processor_common.cpp image_processor.cpp)
+  add_library(
+    extension_image image_processor_common.cpp image_processor_simd.cpp
+                    image_processor.cpp
+  )
 
   # stb_image_resize.h lives under deprecated/ in current stb. Private: only the
   # .cpp uses it, not the installed public headers.
diff --git a/extension/image/image_processor.cpp b/extension/image/image_processor.cpp
@@ -7,6 +7,7 @@
  */
 
 #include <executorch/extension/image/image_processor.h>
+#include <executorch/extension/image/image_processor_simd.h>
 
 #include <algorithm>
 #include <cstring>
@@ -420,25 +421,23 @@ Error ImageProcessor::process_into(
         InvalidArgument,
         "normalization std_dev must be nonzero");
   }
-  // Source (resized RGB) carries input_channels; the output tensor carries
-  // output_channels. They are equal today, so channels map 1:1; a future
-  // divergence (e.g. grayscale) would need an explicit channel map here.
-  for (int32_t y = 0; y < resize_h; ++y) {
-    for (int32_t x = 0; x < resize_w; ++x) {
-      const int32_t src_idx = (y * resize_w + x) * input_channels;
-      const int32_t dst_y = y + offset_y;
-      const int32_t dst_x = x + offset_x;
-      for (int32_t c = 0; c < output_channels; ++c) {
-        const float val =
-            (resized_buf[src_idx + c] * norm.scale_factor - norm.mean[c]) /
-            norm.std_dev[c];
-        const size_t out_idx = static_cast<size_t>(c) * final_w * final_h +
-            static_cast<size_t>(dst_y) * final_w + dst_x;
-        output[out_idx] = val;
-      }
-    }
-  }
-  return Error::Ok;
+  // Deinterleave + normalize the resized interleaved RGB (R/G/B at byte
+  // offsets 0/1/2) into the CHW output.
+  return deinterleave_to_chw(
+      resized_buf.data(),
+      resize_w,
+      resize_h,
+      resize_w * input_channels,
+      input_channels,
+      /*r_off=*/0,
+      /*g_off=*/1,
+      /*b_off=*/2,
+      output,
+      final_w,
+      final_h,
+      offset_x,
+      offset_y,
+      norm);
 }
 
 Error ImageProcessor::process_yuv_into(
diff --git a/extension/image/image_processor_apple.cpp b/extension/image/image_processor_apple.cpp
@@ -20,6 +20,7 @@
 
 #include <executorch/extension/image/image_processor.h>
 #include <executorch/extension/image/image_processor_apple.h>
+#include <executorch/extension/image/image_processor_simd.h>
 
 #include <algorithm>
 #include <cstring>
@@ -391,85 +392,6 @@ size_t compute_scale_temp_size(
   return temp_size > 0 ? static_cast<size_t>(temp_size) : 0;
 }
 
-// Deinterleave BGRA uint8 → planar RGB float with fused normalization.
-// Handles offset for letterbox padding.
-//
-// Per channel (R, G, B): vDSP_vfltu8 reads the matching byte from BGRA via
-// stride=4 and converts uint8→float, then vDSP_vsmsa applies the fused
-// affine `out = in * (scale_factor / std_dev) + (-mean / std_dev)` in-place.
-Error deinterleave_bgra_to_chw(
-    const uint8_t* bgra_data,
-    int32_t src_w,
-    int32_t src_h,
-    int32_t src_stride,
-    float* output,
-    int32_t final_w,
-    int32_t final_h,
-    int32_t offset_x,
-    int32_t offset_y,
-    const Normalization& norm) {
-  const size_t spatial = static_cast<size_t>(final_w) * final_h;
-
-  // Per-channel affine coefficients for `out = in * a + b`.
-  // BGRA byte layout: byte 0 = B, byte 1 = G, byte 2 = R; norm.{mean,std_dev}
-  // are indexed in RGB order (channel 0 = R, 1 = G, 2 = B).
-  const float a_r = norm.scale_factor / norm.std_dev[0];
-  const float a_g = norm.scale_factor / norm.std_dev[1];
-  const float a_b = norm.scale_factor / norm.std_dev[2];
-  const float b_r = -norm.mean[0] / norm.std_dev[0];
-  const float b_g = -norm.mean[1] / norm.std_dev[1];
-  const float b_b = -norm.mean[2] / norm.std_dev[2];
-
-  // When the bias is zero (e.g. zeroToOne / mean=0), a plain scale (vsmul) is
-  // cheaper than the fused scale+add (vsmsa).
-  const bool no_offset = (b_r == 0.0f && b_g == 0.0f && b_b == 0.0f);
-  auto scale_bias =
-      [no_offset](float* p, const float* a, const float* b, vDSP_Length n) {
-        if (no_offset) {
-          vDSP_vsmul(p, 1, a, p, 1, n);
-        } else {
-          vDSP_vsmsa(p, 1, a, b, p, 1, n);
-        }
-      };
-
-  // Output planes in CHW order: R, G, B. Each plane is final_w × final_h
-  // floats; we write a src_h × src_w region starting at (offset_y, offset_x).
-  float* r_plane = output + 0 * spatial;
-  float* g_plane = output + 1 * spatial;
-  float* b_plane = output + 2 * spatial;
-
-  // Fast path: source is contiguous and destination region is the entire
-  // plane (offsets 0, src dims == final dims).
-  if (src_stride == src_w * 4 && offset_x == 0 && offset_y == 0 &&
-      src_w == final_w && src_h == final_h) {
-    const vDSP_Length n = static_cast<vDSP_Length>(src_w) * src_h;
-    vDSP_vfltu8(bgra_data + 2, 4, r_plane, 1, n);
-    scale_bias(r_plane, &a_r, &b_r, n);
-    vDSP_vfltu8(bgra_data + 1, 4, g_plane, 1, n);
-    scale_bias(g_plane, &a_g, &b_g, n);
-    vDSP_vfltu8(bgra_data + 0, 4, b_plane, 1, n);
-    scale_bias(b_plane, &a_b, &b_b, n);
-    return Error::Ok;
-  }
-
-  // Slow path: row-by-row to handle stride padding and/or letterbox offsets.
-  for (int32_t y = 0; y < src_h; ++y) {
-    const uint8_t* src_row = bgra_data + y * src_stride;
-    const ptrdiff_t dst_off = (y + offset_y) * final_w + offset_x;
-    float* r_dst = r_plane + dst_off;
-    float* g_dst = g_plane + dst_off;
-    float* b_dst = b_plane + dst_off;
-    const vDSP_Length n = static_cast<vDSP_Length>(src_w);
-    vDSP_vfltu8(src_row + 2, 4, r_dst, 1, n);
-    scale_bias(r_dst, &a_r, &b_r, n);
-    vDSP_vfltu8(src_row + 1, 4, g_dst, 1, n);
-    scale_bias(g_dst, &a_g, &b_g, n);
-    vDSP_vfltu8(src_row + 0, 4, b_dst, 1, n);
-    scale_bias(b_dst, &a_b, &b_b, n);
-  }
-  return Error::Ok;
-}
-
 // Rotate an interleaved BGRA (ARGB8888 layout) buffer by `orientation` using
 // vImage's SIMD/cache-aware 90-degree rotation, writing a tightly-packed result
 // into `scratch`. UP is handled by the caller (no rotation). out_data/out_w/
@@ -590,11 +512,16 @@ Error normalize_bgra_into(
     offset_y = offset.second;
   }
 
-  return deinterleave_bgra_to_chw(
+  // BGRA byte layout: B=0, G=1, R=2 (alpha dropped); norm is RGB-indexed.
+  return deinterleave_to_chw(
       bgra_data,
       width,
       height,
       stride,
+      /*in_channels=*/4,
+      /*r_off=*/2,
+      /*g_off=*/1,
+      /*b_off=*/0,
       out,
       final_w,
       final_h,
@@ -1380,6 +1307,7 @@ Error process_pixelbuffer_into(
 
 // Allocate a CHW float tensor sized to the configured target and fill it via
 // process_pixelbuffer_into.
+// cppcheck-suppress unusedFunction
 Result<TensorPtr> process_pixelbuffer(
     const ImageProcessor& processor,
     CVPixelBufferRef pixelBuffer,
diff --git a/extension/image/image_processor_simd.cpp b/extension/image/image_processor_simd.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/extension/image/image_processor_simd.h>
+
+#include <cstddef>
+
+#include <executorch/runtime/platform/assert.h>
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#define ET_IMAGE_USE_NEON 1
+#else
+#define ET_IMAGE_USE_NEON 0
+#endif
+
+namespace executorch {
+namespace extension {
+namespace image {
+
+using runtime::Error;
+
+namespace {
+
+#if ET_IMAGE_USE_NEON
+// Widen 16 uint8 -> 4x float32x4, apply out = in * a + b (single-rounding FMA),
+// and store the 16 resulting floats.
+__attribute__((always_inline)) inline void
+widen_fma_store(uint8x16_t ch, float* dst, float32x4_t a, float32x4_t b) {
+  uint16x8_t lo = vmovl_u8(vget_low_u8(ch));
+  uint16x8_t hi = vmovl_u8(vget_high_u8(ch));
+  vst1q_f32(
+      dst + 0, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo))), a));
+  vst1q_f32(
+      dst + 4, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_high_u16(lo))), a));
+  vst1q_f32(
+      dst + 8, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_low_u16(hi))), a));
+  vst1q_f32(
+      dst + 12, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_high_u16(hi))), a));
+}
+#endif // ET_IMAGE_USE_NEON
+
+// Deinterleave + normalize one contiguous run of `n` pixels (stride
+// in_channels bytes/pixel) into the r/g/b float planes. NEON when available,
+// scalar otherwise; the scalar tail also finishes the final (<16) pixels.
+void deinterleave_run(
+    const uint8_t* __restrict src,
+    size_t n,
+    int32_t in_channels,
+    int32_t r_off,
+    int32_t g_off,
+    int32_t b_off,
+    float* __restrict r_out,
+    float* __restrict g_out,
+    float* __restrict b_out,
+    float a_r,
+    float b_r,
+    float a_g,
+    float b_g,
+    float a_b,
+    float b_b) {
+  size_t i = 0;
+#if ET_IMAGE_USE_NEON
+  const float32x4_t va_r = vdupq_n_f32(a_r);
+  const float32x4_t vb_r = vdupq_n_f32(b_r);
+  const float32x4_t va_g = vdupq_n_f32(a_g);
+  const float32x4_t vb_g = vdupq_n_f32(b_g);
+  const float32x4_t va_b = vdupq_n_f32(a_b);
+  const float32x4_t vb_b = vdupq_n_f32(b_b);
+  if (in_channels == 4) {
+    for (; i + 16 <= n; i += 16) {
+      uint8x16x4_t px = vld4q_u8(src + i * 4);
+      widen_fma_store(px.val[r_off], r_out + i, va_r, vb_r);
+      widen_fma_store(px.val[g_off], g_out + i, va_g, vb_g);
+      widen_fma_store(px.val[b_off], b_out + i, va_b, vb_b);
+    }
+  } else { // in_channels == 3
+    for (; i + 16 <= n; i += 16) {
+      uint8x16x3_t px = vld3q_u8(src + i * 3);
+      widen_fma_store(px.val[r_off], r_out + i, va_r, vb_r);
+      widen_fma_store(px.val[g_off], g_out + i, va_g, vb_g);
+      widen_fma_store(px.val[b_off], b_out + i, va_b, vb_b);
+    }
+  }
+#endif // ET_IMAGE_USE_NEON
+  for (; i < n; ++i) {
+    const uint8_t* p = src + i * in_channels;
+    r_out[i] = static_cast<float>(p[r_off]) * a_r + b_r;
+    g_out[i] = static_cast<float>(p[g_off]) * a_g + b_g;
+    b_out[i] = static_cast<float>(p[b_off]) * a_b + b_b;
+  }
+}
+
+} // namespace
+
+Error deinterleave_to_chw(
+    const uint8_t* src,
+    int32_t src_w,
+    int32_t src_h,
+    int32_t src_stride,
+    int32_t in_channels,
+    int32_t r_off,
+    int32_t g_off,
+    int32_t b_off,
+    float* output,
+    int32_t final_w,
+    int32_t final_h,
+    int32_t offset_x,
+    int32_t offset_y,
+    const Normalization& norm) {
+  ET_DCHECK_MSG(
+      in_channels == 3 || in_channels == 4, "in_channels must be 3 or 4");
+  ET_DCHECK_MSG(
+      r_off < in_channels && g_off < in_channels && b_off < in_channels,
+      "channel offsets must be < in_channels");
+  const size_t spatial = static_cast<size_t>(final_w) * final_h;
+
+  // Per-channel affine coefficients for `out = in * a + b`, in RGB order.
+  const float a_r = norm.scale_factor / norm.std_dev[0];
+  const float a_g = norm.scale_factor / norm.std_dev[1];
+  const float a_b = norm.scale_factor / norm.std_dev[2];
+  const float b_r = -norm.mean[0] / norm.std_dev[0];
+  const float b_g = -norm.mean[1] / norm.std_dev[1];
+  const float b_b = -norm.mean[2] / norm.std_dev[2];
+
+  // Output planes in CHW order: R, G, B.
+  float* r_plane = output + 0 * spatial;
+  float* g_plane = output + 1 * spatial;
+  float* b_plane = output + 2 * spatial;
+
+  // Fast path: contiguous source covering the entire plane (no stride padding,
+  // no letterbox offset, src dims == final dims) -> one run over all pixels.
+  if (src_stride == src_w * in_channels && offset_x == 0 && offset_y == 0 &&
+      src_w == final_w && src_h == final_h) {
+    deinterleave_run(
+        src,
+        static_cast<size_t>(src_w) * src_h,
+        in_channels,
+        r_off,
+        g_off,
+        b_off,
+        r_plane,
+        g_plane,
+        b_plane,
+        a_r,
+        b_r,
+        a_g,
+        b_g,
+        a_b,
+        b_b);
+    return Error::Ok;
+  }
+
+  // Slow path: row by row to honor stride padding and/or a letterbox offset.
+  for (int32_t y = 0; y < src_h; ++y) {
+    const uint8_t* src_row = src + static_cast<size_t>(y) * src_stride;
+    const size_t dst_off =
+        static_cast<size_t>(y + offset_y) * final_w + offset_x;
+    deinterleave_run(
+        src_row,
+        src_w,
+        in_channels,
+        r_off,
+        g_off,
+        b_off,
+        r_plane + dst_off,
+        g_plane + dst_off,
+        b_plane + dst_off,
+        a_r,
+        b_r,
+        a_g,
+        b_g,
+        a_b,
+        b_b);
+  }
+  return Error::Ok;
+}
+
+} // namespace image
+} // namespace extension
+} // namespace executorch
diff --git a/extension/image/image_processor_simd.h b/extension/image/image_processor_simd.h
diff --git a/extension/image/targets.bzl b/extension/image/targets.bzl