Skip to content

Commit 92e6a4c

Browse files
authored
Switch to neon for interleave (pytorch#20137)
Differential Revision: D107958353 Pull Request resolved: pytorch#20137
1 parent eb851a5 commit 92e6a4c

6 files changed

Lines changed: 278 additions & 103 deletions

File tree

extension/image/CMakeLists.txt

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ cmake_minimum_required(VERSION 3.19)
99
if(APPLE)
1010
enable_language(OBJCXX)
1111
add_library(
12-
extension_image image_processor_common.cpp image_processor_apple.cpp
13-
image_processor_apple_gpu.mm
12+
extension_image image_processor_common.cpp image_processor_simd.cpp
13+
image_processor_apple.cpp image_processor_apple_gpu.mm
1414
)
1515
set_source_files_properties(
1616
image_processor_apple_gpu.mm PROPERTIES COMPILE_FLAGS "-fobjc-arc"
@@ -39,7 +39,10 @@ else()
3939
)
4040
FetchContent_MakeAvailable(stb)
4141

42-
add_library(extension_image image_processor_common.cpp image_processor.cpp)
42+
add_library(
43+
extension_image image_processor_common.cpp image_processor_simd.cpp
44+
image_processor.cpp
45+
)
4346

4447
# stb_image_resize.h lives under deprecated/ in current stb. Private: only the
4548
# .cpp uses it, not the installed public headers.

extension/image/image_processor.cpp

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
*/
88

99
#include <executorch/extension/image/image_processor.h>
10+
#include <executorch/extension/image/image_processor_simd.h>
1011

1112
#include <algorithm>
1213
#include <cstring>
@@ -420,25 +421,23 @@ Error ImageProcessor::process_into(
420421
InvalidArgument,
421422
"normalization std_dev must be nonzero");
422423
}
423-
// Source (resized RGB) carries input_channels; the output tensor carries
424-
// output_channels. They are equal today, so channels map 1:1; a future
425-
// divergence (e.g. grayscale) would need an explicit channel map here.
426-
for (int32_t y = 0; y < resize_h; ++y) {
427-
for (int32_t x = 0; x < resize_w; ++x) {
428-
const int32_t src_idx = (y * resize_w + x) * input_channels;
429-
const int32_t dst_y = y + offset_y;
430-
const int32_t dst_x = x + offset_x;
431-
for (int32_t c = 0; c < output_channels; ++c) {
432-
const float val =
433-
(resized_buf[src_idx + c] * norm.scale_factor - norm.mean[c]) /
434-
norm.std_dev[c];
435-
const size_t out_idx = static_cast<size_t>(c) * final_w * final_h +
436-
static_cast<size_t>(dst_y) * final_w + dst_x;
437-
output[out_idx] = val;
438-
}
439-
}
440-
}
441-
return Error::Ok;
424+
// Deinterleave + normalize the resized interleaved RGB (R/G/B at byte
425+
// offsets 0/1/2) into the CHW output.
426+
return deinterleave_to_chw(
427+
resized_buf.data(),
428+
resize_w,
429+
resize_h,
430+
resize_w * input_channels,
431+
input_channels,
432+
/*r_off=*/0,
433+
/*g_off=*/1,
434+
/*b_off=*/2,
435+
output,
436+
final_w,
437+
final_h,
438+
offset_x,
439+
offset_y,
440+
norm);
442441
}
443442

444443
Error ImageProcessor::process_yuv_into(

extension/image/image_processor_apple.cpp

Lines changed: 8 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
#include <executorch/extension/image/image_processor.h>
2222
#include <executorch/extension/image/image_processor_apple.h>
23+
#include <executorch/extension/image/image_processor_simd.h>
2324

2425
#include <algorithm>
2526
#include <cstring>
@@ -391,85 +392,6 @@ size_t compute_scale_temp_size(
391392
return temp_size > 0 ? static_cast<size_t>(temp_size) : 0;
392393
}
393394

394-
// Deinterleave BGRA uint8 → planar RGB float with fused normalization.
395-
// Handles offset for letterbox padding.
396-
//
397-
// Per channel (R, G, B): vDSP_vfltu8 reads the matching byte from BGRA via
398-
// stride=4 and converts uint8→float, then vDSP_vsmsa applies the fused
399-
// affine `out = in * (scale_factor / std_dev) + (-mean / std_dev)` in-place.
400-
Error deinterleave_bgra_to_chw(
401-
const uint8_t* bgra_data,
402-
int32_t src_w,
403-
int32_t src_h,
404-
int32_t src_stride,
405-
float* output,
406-
int32_t final_w,
407-
int32_t final_h,
408-
int32_t offset_x,
409-
int32_t offset_y,
410-
const Normalization& norm) {
411-
const size_t spatial = static_cast<size_t>(final_w) * final_h;
412-
413-
// Per-channel affine coefficients for `out = in * a + b`.
414-
// BGRA byte layout: byte 0 = B, byte 1 = G, byte 2 = R; norm.{mean,std_dev}
415-
// are indexed in RGB order (channel 0 = R, 1 = G, 2 = B).
416-
const float a_r = norm.scale_factor / norm.std_dev[0];
417-
const float a_g = norm.scale_factor / norm.std_dev[1];
418-
const float a_b = norm.scale_factor / norm.std_dev[2];
419-
const float b_r = -norm.mean[0] / norm.std_dev[0];
420-
const float b_g = -norm.mean[1] / norm.std_dev[1];
421-
const float b_b = -norm.mean[2] / norm.std_dev[2];
422-
423-
// When the bias is zero (e.g. zeroToOne / mean=0), a plain scale (vsmul) is
424-
// cheaper than the fused scale+add (vsmsa).
425-
const bool no_offset = (b_r == 0.0f && b_g == 0.0f && b_b == 0.0f);
426-
auto scale_bias =
427-
[no_offset](float* p, const float* a, const float* b, vDSP_Length n) {
428-
if (no_offset) {
429-
vDSP_vsmul(p, 1, a, p, 1, n);
430-
} else {
431-
vDSP_vsmsa(p, 1, a, b, p, 1, n);
432-
}
433-
};
434-
435-
// Output planes in CHW order: R, G, B. Each plane is final_w × final_h
436-
// floats; we write a src_h × src_w region starting at (offset_y, offset_x).
437-
float* r_plane = output + 0 * spatial;
438-
float* g_plane = output + 1 * spatial;
439-
float* b_plane = output + 2 * spatial;
440-
441-
// Fast path: source is contiguous and destination region is the entire
442-
// plane (offsets 0, src dims == final dims).
443-
if (src_stride == src_w * 4 && offset_x == 0 && offset_y == 0 &&
444-
src_w == final_w && src_h == final_h) {
445-
const vDSP_Length n = static_cast<vDSP_Length>(src_w) * src_h;
446-
vDSP_vfltu8(bgra_data + 2, 4, r_plane, 1, n);
447-
scale_bias(r_plane, &a_r, &b_r, n);
448-
vDSP_vfltu8(bgra_data + 1, 4, g_plane, 1, n);
449-
scale_bias(g_plane, &a_g, &b_g, n);
450-
vDSP_vfltu8(bgra_data + 0, 4, b_plane, 1, n);
451-
scale_bias(b_plane, &a_b, &b_b, n);
452-
return Error::Ok;
453-
}
454-
455-
// Slow path: row-by-row to handle stride padding and/or letterbox offsets.
456-
for (int32_t y = 0; y < src_h; ++y) {
457-
const uint8_t* src_row = bgra_data + y * src_stride;
458-
const ptrdiff_t dst_off = (y + offset_y) * final_w + offset_x;
459-
float* r_dst = r_plane + dst_off;
460-
float* g_dst = g_plane + dst_off;
461-
float* b_dst = b_plane + dst_off;
462-
const vDSP_Length n = static_cast<vDSP_Length>(src_w);
463-
vDSP_vfltu8(src_row + 2, 4, r_dst, 1, n);
464-
scale_bias(r_dst, &a_r, &b_r, n);
465-
vDSP_vfltu8(src_row + 1, 4, g_dst, 1, n);
466-
scale_bias(g_dst, &a_g, &b_g, n);
467-
vDSP_vfltu8(src_row + 0, 4, b_dst, 1, n);
468-
scale_bias(b_dst, &a_b, &b_b, n);
469-
}
470-
return Error::Ok;
471-
}
472-
473395
// Rotate an interleaved BGRA (ARGB8888 layout) buffer by `orientation` using
474396
// vImage's SIMD/cache-aware 90-degree rotation, writing a tightly-packed result
475397
// into `scratch`. UP is handled by the caller (no rotation). out_data/out_w/
@@ -590,11 +512,16 @@ Error normalize_bgra_into(
590512
offset_y = offset.second;
591513
}
592514

593-
return deinterleave_bgra_to_chw(
515+
// BGRA byte layout: B=0, G=1, R=2 (alpha dropped); norm is RGB-indexed.
516+
return deinterleave_to_chw(
594517
bgra_data,
595518
width,
596519
height,
597520
stride,
521+
/*in_channels=*/4,
522+
/*r_off=*/2,
523+
/*g_off=*/1,
524+
/*b_off=*/0,
598525
out,
599526
final_w,
600527
final_h,
@@ -1380,6 +1307,7 @@ Error process_pixelbuffer_into(
13801307

13811308
// Allocate a CHW float tensor sized to the configured target and fill it via
13821309
// process_pixelbuffer_into.
1310+
// cppcheck-suppress unusedFunction
13831311
Result<TensorPtr> process_pixelbuffer(
13841312
const ImageProcessor& processor,
13851313
CVPixelBufferRef pixelBuffer,
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/extension/image/image_processor_simd.h>
10+
11+
#include <cstddef>
12+
13+
#include <executorch/runtime/platform/assert.h>
14+
15+
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
16+
#include <arm_neon.h>
17+
#define ET_IMAGE_USE_NEON 1
18+
#else
19+
#define ET_IMAGE_USE_NEON 0
20+
#endif
21+
22+
namespace executorch {
23+
namespace extension {
24+
namespace image {
25+
26+
using runtime::Error;
27+
28+
namespace {
29+
30+
#if ET_IMAGE_USE_NEON
31+
// Widen 16 uint8 -> 4x float32x4, apply out = in * a + b (single-rounding FMA),
32+
// and store the 16 resulting floats.
33+
__attribute__((always_inline)) inline void
34+
widen_fma_store(uint8x16_t ch, float* dst, float32x4_t a, float32x4_t b) {
35+
uint16x8_t lo = vmovl_u8(vget_low_u8(ch));
36+
uint16x8_t hi = vmovl_u8(vget_high_u8(ch));
37+
vst1q_f32(
38+
dst + 0, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo))), a));
39+
vst1q_f32(
40+
dst + 4, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_high_u16(lo))), a));
41+
vst1q_f32(
42+
dst + 8, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_low_u16(hi))), a));
43+
vst1q_f32(
44+
dst + 12, vfmaq_f32(b, vcvtq_f32_u32(vmovl_u16(vget_high_u16(hi))), a));
45+
}
46+
#endif // ET_IMAGE_USE_NEON
47+
48+
// Deinterleave + normalize one contiguous run of `n` pixels (stride
49+
// in_channels bytes/pixel) into the r/g/b float planes. NEON when available,
50+
// scalar otherwise; the scalar tail also finishes the final (<16) pixels.
51+
void deinterleave_run(
52+
const uint8_t* __restrict src,
53+
size_t n,
54+
int32_t in_channels,
55+
int32_t r_off,
56+
int32_t g_off,
57+
int32_t b_off,
58+
float* __restrict r_out,
59+
float* __restrict g_out,
60+
float* __restrict b_out,
61+
float a_r,
62+
float b_r,
63+
float a_g,
64+
float b_g,
65+
float a_b,
66+
float b_b) {
67+
size_t i = 0;
68+
#if ET_IMAGE_USE_NEON
69+
const float32x4_t va_r = vdupq_n_f32(a_r);
70+
const float32x4_t vb_r = vdupq_n_f32(b_r);
71+
const float32x4_t va_g = vdupq_n_f32(a_g);
72+
const float32x4_t vb_g = vdupq_n_f32(b_g);
73+
const float32x4_t va_b = vdupq_n_f32(a_b);
74+
const float32x4_t vb_b = vdupq_n_f32(b_b);
75+
if (in_channels == 4) {
76+
for (; i + 16 <= n; i += 16) {
77+
uint8x16x4_t px = vld4q_u8(src + i * 4);
78+
widen_fma_store(px.val[r_off], r_out + i, va_r, vb_r);
79+
widen_fma_store(px.val[g_off], g_out + i, va_g, vb_g);
80+
widen_fma_store(px.val[b_off], b_out + i, va_b, vb_b);
81+
}
82+
} else { // in_channels == 3
83+
for (; i + 16 <= n; i += 16) {
84+
uint8x16x3_t px = vld3q_u8(src + i * 3);
85+
widen_fma_store(px.val[r_off], r_out + i, va_r, vb_r);
86+
widen_fma_store(px.val[g_off], g_out + i, va_g, vb_g);
87+
widen_fma_store(px.val[b_off], b_out + i, va_b, vb_b);
88+
}
89+
}
90+
#endif // ET_IMAGE_USE_NEON
91+
for (; i < n; ++i) {
92+
const uint8_t* p = src + i * in_channels;
93+
r_out[i] = static_cast<float>(p[r_off]) * a_r + b_r;
94+
g_out[i] = static_cast<float>(p[g_off]) * a_g + b_g;
95+
b_out[i] = static_cast<float>(p[b_off]) * a_b + b_b;
96+
}
97+
}
98+
99+
} // namespace
100+
101+
Error deinterleave_to_chw(
102+
const uint8_t* src,
103+
int32_t src_w,
104+
int32_t src_h,
105+
int32_t src_stride,
106+
int32_t in_channels,
107+
int32_t r_off,
108+
int32_t g_off,
109+
int32_t b_off,
110+
float* output,
111+
int32_t final_w,
112+
int32_t final_h,
113+
int32_t offset_x,
114+
int32_t offset_y,
115+
const Normalization& norm) {
116+
ET_DCHECK_MSG(
117+
in_channels == 3 || in_channels == 4, "in_channels must be 3 or 4");
118+
ET_DCHECK_MSG(
119+
r_off < in_channels && g_off < in_channels && b_off < in_channels,
120+
"channel offsets must be < in_channels");
121+
const size_t spatial = static_cast<size_t>(final_w) * final_h;
122+
123+
// Per-channel affine coefficients for `out = in * a + b`, in RGB order.
124+
const float a_r = norm.scale_factor / norm.std_dev[0];
125+
const float a_g = norm.scale_factor / norm.std_dev[1];
126+
const float a_b = norm.scale_factor / norm.std_dev[2];
127+
const float b_r = -norm.mean[0] / norm.std_dev[0];
128+
const float b_g = -norm.mean[1] / norm.std_dev[1];
129+
const float b_b = -norm.mean[2] / norm.std_dev[2];
130+
131+
// Output planes in CHW order: R, G, B.
132+
float* r_plane = output + 0 * spatial;
133+
float* g_plane = output + 1 * spatial;
134+
float* b_plane = output + 2 * spatial;
135+
136+
// Fast path: contiguous source covering the entire plane (no stride padding,
137+
// no letterbox offset, src dims == final dims) -> one run over all pixels.
138+
if (src_stride == src_w * in_channels && offset_x == 0 && offset_y == 0 &&
139+
src_w == final_w && src_h == final_h) {
140+
deinterleave_run(
141+
src,
142+
static_cast<size_t>(src_w) * src_h,
143+
in_channels,
144+
r_off,
145+
g_off,
146+
b_off,
147+
r_plane,
148+
g_plane,
149+
b_plane,
150+
a_r,
151+
b_r,
152+
a_g,
153+
b_g,
154+
a_b,
155+
b_b);
156+
return Error::Ok;
157+
}
158+
159+
// Slow path: row by row to honor stride padding and/or a letterbox offset.
160+
for (int32_t y = 0; y < src_h; ++y) {
161+
const uint8_t* src_row = src + static_cast<size_t>(y) * src_stride;
162+
const size_t dst_off =
163+
static_cast<size_t>(y + offset_y) * final_w + offset_x;
164+
deinterleave_run(
165+
src_row,
166+
src_w,
167+
in_channels,
168+
r_off,
169+
g_off,
170+
b_off,
171+
r_plane + dst_off,
172+
g_plane + dst_off,
173+
b_plane + dst_off,
174+
a_r,
175+
b_r,
176+
a_g,
177+
b_g,
178+
a_b,
179+
b_b);
180+
}
181+
return Error::Ok;
182+
}
183+
184+
} // namespace image
185+
} // namespace extension
186+
} // namespace executorch

0 commit comments

Comments
 (0)