Skip to content

Commit 0dec8a0

Browse files
committed
Refactor HWY per-pixel ops and add strided ROI fallback tests
Adds generic per-pixel HWY operation helpers for binary and ternary ops, refactors the add/sub/mul/div/mad HWY implementations to use these helpers, and ensures HWY SIMD is only used for contiguous channel ranges. Also adds a new test to verify correct fallback to scalar code for strided (non-contiguous) ROI channel ranges. Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
1 parent 6e16329 commit 0dec8a0

File tree

6 files changed

+259
-325
lines changed

6 files changed

+259
-325
lines changed

src/include/imageio_pvt.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ extern int oiio_log_times;
4343
extern int openexr_core;
4444
extern int jpeg_com_attributes;
4545
extern int png_linear_premult;
46+
extern int enable_hwy;
4647
extern int limit_channels;
4748
extern int limit_imagesize_MB;
4849
extern int imagebuf_print_uncaught_errors;

src/libOpenImageIO/imagebufalgo_addsub.cpp

Lines changed: 51 additions & 191 deletions
Original file line numberDiff line numberDiff line change
@@ -64,107 +64,28 @@ add_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi,
6464

6565

6666
#if OIIO_USE_HWY
67+
6768
// Native integer add using SaturatedAdd (scale-invariant, no float conversion)
6869
template<class T>
6970
static bool
7071
add_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
7172
ROI roi, int nthreads)
7273
{
73-
auto Rv = HwyPixels(R);
74-
auto Av = HwyPixels(A);
75-
auto Bv = HwyPixels(B);
76-
ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
77-
const int nchannels = RoiNChannels(roi);
78-
const bool contig = ChannelsContiguous<T>(Rv, nchannels)
79-
&& ChannelsContiguous<T>(Av, nchannels)
80-
&& ChannelsContiguous<T>(Bv, nchannels);
81-
82-
for (int y = roi.ybegin; y < roi.yend; ++y) {
83-
T* r_row = RoiRowPtr<T>(Rv, y, roi);
84-
const T* a_row = RoiRowPtr<T>(Av, y, roi);
85-
const T* b_row = RoiRowPtr<T>(Bv, y, roi);
86-
87-
if (contig) {
88-
// Native integer saturated add - much faster than float conversion!
89-
size_t n = static_cast<size_t>(roi.width())
90-
* static_cast<size_t>(nchannels);
91-
RunHwyBinaryNativeInt<T>(r_row, a_row, b_row, n,
92-
[](auto d, auto a, auto b) {
93-
return hn::SaturatedAdd(a, b);
94-
});
95-
} else {
96-
// Scalar fallback
97-
for (int x = roi.xbegin; x < roi.xend; ++x) {
98-
T* r_ptr = ChannelPtr<T>(Rv, x, y, roi.chbegin);
99-
const T* a_ptr = ChannelPtr<T>(Av, x, y, roi.chbegin);
100-
const T* b_ptr = ChannelPtr<T>(Bv, x, y, roi.chbegin);
101-
for (int c = 0; c < nchannels; ++c) {
102-
// Saturating add in scalar
103-
int64_t sum = (int64_t)a_ptr[c] + (int64_t)b_ptr[c];
104-
if constexpr (std::is_unsigned_v<T>) {
105-
r_ptr[c] = (sum > std::numeric_limits<T>::max())
106-
? std::numeric_limits<T>::max()
107-
: (T)sum;
108-
} else {
109-
r_ptr[c] = (sum > std::numeric_limits<T>::max())
110-
? std::numeric_limits<T>::max()
111-
: (sum < std::numeric_limits<T>::min())
112-
? std::numeric_limits<T>::min()
113-
: (T)sum;
114-
}
115-
}
116-
}
117-
}
118-
}
119-
});
120-
return true;
74+
return hwy_binary_native_int_perpixel_op<T>(R, A, B, roi, nthreads,
75+
[](auto /*d*/, auto a, auto b) {
76+
return hn::SaturatedAdd(a, b);
77+
});
12178
}
12279

12380
template<class Rtype, class Atype, class Btype>
12481
static bool
12582
add_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
12683
int nthreads)
12784
{
128-
auto Rv = HwyPixels(R);
129-
auto Av = HwyPixels(A);
130-
auto Bv = HwyPixels(B);
131-
ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
132-
const int nchannels = RoiNChannels(roi);
133-
const bool contig = ChannelsContiguous<Rtype>(Rv, nchannels)
134-
&& ChannelsContiguous<Atype>(Av, nchannels)
135-
&& ChannelsContiguous<Btype>(Bv, nchannels);
136-
137-
for (int y = roi.ybegin; y < roi.yend; ++y) {
138-
Rtype* r_row = RoiRowPtr<Rtype>(Rv, y, roi);
139-
const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);
140-
const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi);
141-
142-
if (contig) {
143-
// Process whole line as one vector stream
144-
size_t n = static_cast<size_t>(roi.width())
145-
* static_cast<size_t>(nchannels);
146-
RunHwyCmd<Rtype, Atype, Btype>(r_row, a_row, b_row, n,
147-
[](auto d, auto a, auto b) {
148-
return hn::Add(a, b);
149-
});
150-
} else {
151-
// Process pixel by pixel (scalar fallback for strided channels)
152-
for (int x = roi.xbegin; x < roi.xend; ++x) {
153-
Rtype* r_ptr = ChannelPtr<Rtype>(Rv, x, y, roi.chbegin);
154-
const Atype* a_ptr = ChannelPtr<Atype>(Av, x, y,
155-
roi.chbegin);
156-
const Btype* b_ptr = ChannelPtr<Btype>(Bv, x, y,
157-
roi.chbegin);
158-
for (int c = 0; c < nchannels; ++c) {
159-
r_ptr[c] = static_cast<Rtype>(
160-
static_cast<float>(a_ptr[c])
161-
+ static_cast<float>(b_ptr[c]));
162-
}
163-
}
164-
}
165-
}
166-
});
167-
return true;
85+
return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
86+
[](auto /*d*/, auto a, auto b) {
87+
return hn::Add(a, b);
88+
});
16889
}
16990

17091
template<class Rtype, class Atype>
@@ -204,15 +125,24 @@ add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
204125
#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
205126
if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
206127
&& B.localpixels()) {
207-
// Use native integer path for scale-invariant add when all types match
208-
// and are integer types (much faster: 6-12x vs 3-5x with float conversion)
209-
constexpr bool all_same = std::is_same_v<Rtype, Atype>
210-
&& std::is_same_v<Atype, Btype>;
211-
constexpr bool is_integer = std::is_integral_v<Rtype>;
212-
if constexpr (all_same && is_integer) {
213-
return add_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
128+
auto Rv = HwyPixels(R);
129+
auto Av = HwyPixels(A);
130+
auto Bv = HwyPixels(B);
131+
const int nchannels = RoiNChannels(roi);
132+
const bool contig = ChannelsContiguous<Rtype>(Rv, nchannels)
133+
&& ChannelsContiguous<Atype>(Av, nchannels)
134+
&& ChannelsContiguous<Btype>(Bv, nchannels);
135+
if (contig) {
136+
// Use native integer path for scale-invariant add when all types
137+
// match and are integer types (much faster: 6-12x vs 3-5x with
138+
// float conversion).
139+
constexpr bool all_same = std::is_same_v<Rtype, Atype>
140+
&& std::is_same_v<Atype, Btype>;
141+
constexpr bool is_integer = std::is_integral_v<Rtype>;
142+
if constexpr (all_same && is_integer)
143+
return add_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
144+
return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
214145
}
215-
return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
216146
}
217147
#endif
218148
return add_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
@@ -236,100 +166,21 @@ static bool
236166
sub_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
237167
ROI roi, int nthreads)
238168
{
239-
auto Rv = HwyPixels(R);
240-
auto Av = HwyPixels(A);
241-
auto Bv = HwyPixels(B);
242-
ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
243-
const int nchannels = RoiNChannels(roi);
244-
const bool contig = ChannelsContiguous<T>(Rv, nchannels)
245-
&& ChannelsContiguous<T>(Av, nchannels)
246-
&& ChannelsContiguous<T>(Bv, nchannels);
247-
248-
for (int y = roi.ybegin; y < roi.yend; ++y) {
249-
T* r_row = RoiRowPtr<T>(Rv, y, roi);
250-
const T* a_row = RoiRowPtr<T>(Av, y, roi);
251-
const T* b_row = RoiRowPtr<T>(Bv, y, roi);
252-
253-
if (contig) {
254-
// Native integer saturated sub - much faster than float conversion!
255-
size_t n = static_cast<size_t>(roi.width())
256-
* static_cast<size_t>(nchannels);
257-
RunHwyBinaryNativeInt<T>(r_row, a_row, b_row, n,
258-
[](auto d, auto a, auto b) {
259-
return hn::SaturatedSub(a, b);
260-
});
261-
} else {
262-
// Scalar fallback
263-
for (int x = roi.xbegin; x < roi.xend; ++x) {
264-
T* r_ptr = ChannelPtr<T>(Rv, x, y, roi.chbegin);
265-
const T* a_ptr = ChannelPtr<T>(Av, x, y, roi.chbegin);
266-
const T* b_ptr = ChannelPtr<T>(Bv, x, y, roi.chbegin);
267-
for (int c = 0; c < nchannels; ++c) {
268-
// Saturating sub in scalar
269-
if constexpr (std::is_unsigned_v<T>) {
270-
r_ptr[c] = (a_ptr[c] > b_ptr[c])
271-
? (a_ptr[c] - b_ptr[c])
272-
: T(0);
273-
} else {
274-
int64_t diff = (int64_t)a_ptr[c]
275-
- (int64_t)b_ptr[c];
276-
r_ptr[c] = (diff > std::numeric_limits<T>::max())
277-
? std::numeric_limits<T>::max()
278-
: (diff < std::numeric_limits<T>::min())
279-
? std::numeric_limits<T>::min()
280-
: (T)diff;
281-
}
282-
}
283-
}
284-
}
285-
}
286-
});
287-
return true;
169+
return hwy_binary_native_int_perpixel_op<T>(R, A, B, roi, nthreads,
170+
[](auto /*d*/, auto a, auto b) {
171+
return hn::SaturatedSub(a, b);
172+
});
288173
}
289174

290175
template<class Rtype, class Atype, class Btype>
291176
static bool
292177
sub_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
293178
int nthreads)
294179
{
295-
auto Rv = HwyPixels(R);
296-
auto Av = HwyPixels(A);
297-
auto Bv = HwyPixels(B);
298-
ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
299-
const int nchannels = RoiNChannels(roi);
300-
const bool contig = ChannelsContiguous<Rtype>(Rv, nchannels)
301-
&& ChannelsContiguous<Atype>(Av, nchannels)
302-
&& ChannelsContiguous<Btype>(Bv, nchannels);
303-
304-
for (int y = roi.ybegin; y < roi.yend; ++y) {
305-
Rtype* r_row = RoiRowPtr<Rtype>(Rv, y, roi);
306-
const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);
307-
const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi);
308-
309-
if (contig) {
310-
size_t n = static_cast<size_t>(roi.width())
311-
* static_cast<size_t>(nchannels);
312-
RunHwyCmd<Rtype, Atype, Btype>(r_row, a_row, b_row, n,
313-
[](auto d, auto a, auto b) {
314-
return hn::Sub(a, b);
315-
});
316-
} else {
317-
for (int x = roi.xbegin; x < roi.xend; ++x) {
318-
Rtype* r_ptr = ChannelPtr<Rtype>(Rv, x, y, roi.chbegin);
319-
const Atype* a_ptr = ChannelPtr<Atype>(Av, x, y,
320-
roi.chbegin);
321-
const Btype* b_ptr = ChannelPtr<Btype>(Bv, x, y,
322-
roi.chbegin);
323-
for (int c = 0; c < nchannels; ++c) {
324-
r_ptr[c] = static_cast<Rtype>(
325-
static_cast<float>(a_ptr[c])
326-
- static_cast<float>(b_ptr[c]));
327-
}
328-
}
329-
}
330-
}
331-
});
332-
return true;
180+
return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
181+
[](auto /*d*/, auto a, auto b) {
182+
return hn::Sub(a, b);
183+
});
333184
}
334185
#endif // defined(OIIO_USE_HWY) && OIIO_USE_HWY
335186

@@ -341,15 +192,24 @@ sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
341192
#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
342193
if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
343194
&& B.localpixels()) {
344-
// Use native integer path for scale-invariant sub when all types match
345-
// and are integer types (much faster: 6-12x vs 3-5x with float conversion)
346-
constexpr bool all_same = std::is_same_v<Rtype, Atype>
347-
&& std::is_same_v<Atype, Btype>;
348-
constexpr bool is_integer = std::is_integral_v<Rtype>;
349-
if constexpr (all_same && is_integer) {
350-
return sub_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
195+
auto Rv = HwyPixels(R);
196+
auto Av = HwyPixels(A);
197+
auto Bv = HwyPixels(B);
198+
const int nchannels = RoiNChannels(roi);
199+
const bool contig = ChannelsContiguous<Rtype>(Rv, nchannels)
200+
&& ChannelsContiguous<Atype>(Av, nchannels)
201+
&& ChannelsContiguous<Btype>(Bv, nchannels);
202+
if (contig) {
203+
// Use native integer path for scale-invariant sub when all types
204+
// match and are integer types (much faster: 6-12x vs 3-5x with
205+
// float conversion).
206+
constexpr bool all_same = std::is_same_v<Rtype, Atype>
207+
&& std::is_same_v<Atype, Btype>;
208+
constexpr bool is_integer = std::is_integral_v<Rtype>;
209+
if constexpr (all_same && is_integer)
210+
return sub_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
211+
return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
351212
}
352-
return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
353213
}
354214
#endif
355215
return sub_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);

src/libOpenImageIO/imagebufalgo_hwy_pvt.h

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include <OpenImageIO/half.h>
88
#include <OpenImageIO/imagebuf.h>
9+
#include <OpenImageIO/imagebufalgo_util.h>
910
#include <OpenImageIO/imageio.h>
1011
#include <algorithm>
1112
#include <cstddef>
@@ -756,6 +757,90 @@ RunHwyTernaryCmd(Rtype* r, const ABCtype* a, const ABCtype* b, const ABCtype* c,
756757
}
757758
}
758759

760+
// -----------------------------------------------------------------------
761+
// Per-pixel Ops (ImageBufAlgo, contiguous interleaved channels)
762+
// -----------------------------------------------------------------------
763+
764+
/// Execute a binary per-pixel HWY operation for interleaved, contiguous
765+
/// channels. The caller is responsible for ensuring that the channel range is
766+
/// contiguous for R/A/B (i.e. no per-pixel padding, and the ROI channel range
767+
/// covers the full pixel).
768+
template<typename Rtype, typename Atype, typename Btype, typename OpFunc>
769+
inline bool
770+
hwy_binary_perpixel_op(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
771+
int nthreads, OpFunc op)
772+
{
773+
auto Rv = HwyPixels(R);
774+
auto Av = HwyPixels(A);
775+
auto Bv = HwyPixels(B);
776+
ImageBufAlgo::parallel_image(roi, nthreads, [&, op](ROI roi) {
777+
const int nchannels = RoiNChannels(roi);
778+
const size_t n = static_cast<size_t>(roi.width())
779+
* static_cast<size_t>(nchannels);
780+
for (int y = roi.ybegin; y < roi.yend; ++y) {
781+
Rtype* r_row = RoiRowPtr<Rtype>(Rv, y, roi);
782+
const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);
783+
const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi);
784+
RunHwyCmd<Rtype, Atype, Btype>(r_row, a_row, b_row, n, op);
785+
}
786+
});
787+
return true;
788+
}
789+
790+
/// Execute a ternary per-pixel HWY operation for interleaved, contiguous
791+
/// channels. The caller is responsible for ensuring that the channel range is
792+
/// contiguous for R/A/B/C (i.e. no per-pixel padding, and the ROI channel range
793+
/// covers the full pixel).
794+
template<typename Rtype, typename ABCtype, typename OpFunc>
795+
inline bool
796+
hwy_ternary_perpixel_op(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
797+
const ImageBuf& C, ROI roi, int nthreads, OpFunc op)
798+
{
799+
auto Rv = HwyPixels(R);
800+
auto Av = HwyPixels(A);
801+
auto Bv = HwyPixels(B);
802+
auto Cv = HwyPixels(C);
803+
ImageBufAlgo::parallel_image(roi, nthreads, [&, op](ROI roi) {
804+
const int nchannels = RoiNChannels(roi);
805+
const size_t n = static_cast<size_t>(roi.width())
806+
* static_cast<size_t>(nchannels);
807+
for (int y = roi.ybegin; y < roi.yend; ++y) {
808+
Rtype* r_row = RoiRowPtr<Rtype>(Rv, y, roi);
809+
const ABCtype* a_row = RoiRowPtr<ABCtype>(Av, y, roi);
810+
const ABCtype* b_row = RoiRowPtr<ABCtype>(Bv, y, roi);
811+
const ABCtype* c_row = RoiRowPtr<ABCtype>(Cv, y, roi);
812+
RunHwyTernaryCmd<Rtype, ABCtype>(r_row, a_row, b_row, c_row, n, op);
813+
}
814+
});
815+
return true;
816+
}
817+
818+
/// Execute a binary per-pixel HWY operation on native integer arrays (no type
819+
/// promotion/normalization). The caller is responsible for ensuring that the
820+
/// channel range is contiguous for R/A/B.
821+
template<typename T, typename OpFunc>
822+
inline bool
823+
hwy_binary_native_int_perpixel_op(ImageBuf& R, const ImageBuf& A,
824+
const ImageBuf& B, ROI roi, int nthreads,
825+
OpFunc op)
826+
{
827+
auto Rv = HwyPixels(R);
828+
auto Av = HwyPixels(A);
829+
auto Bv = HwyPixels(B);
830+
ImageBufAlgo::parallel_image(roi, nthreads, [&, op](ROI roi) {
831+
const int nchannels = RoiNChannels(roi);
832+
const size_t n = static_cast<size_t>(roi.width())
833+
* static_cast<size_t>(nchannels);
834+
for (int y = roi.ybegin; y < roi.yend; ++y) {
835+
T* r_row = RoiRowPtr<T>(Rv, y, roi);
836+
const T* a_row = RoiRowPtr<T>(Av, y, roi);
837+
const T* b_row = RoiRowPtr<T>(Bv, y, roi);
838+
RunHwyBinaryNativeInt<T>(r_row, a_row, b_row, n, op);
839+
}
840+
});
841+
return true;
842+
}
843+
759844
// -----------------------------------------------------------------------
760845
// Interleaved Channel Load/Store Helpers
761846
// -----------------------------------------------------------------------

0 commit comments

Comments
 (0)