@@ -64,107 +64,28 @@ add_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi,
6464
6565
6666#if OIIO_USE_HWY
67+
6768// Native integer add using SaturatedAdd (scale-invariant, no float conversion)
6869template <class T >
6970static bool
7071add_impl_hwy_native_int (ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
7172 ROI roi, int nthreads)
7273{
73- auto Rv = HwyPixels (R);
74- auto Av = HwyPixels (A);
75- auto Bv = HwyPixels (B);
76- ImageBufAlgo::parallel_image (roi, nthreads, [&](ROI roi) {
77- const int nchannels = RoiNChannels (roi);
78- const bool contig = ChannelsContiguous<T>(Rv, nchannels)
79- && ChannelsContiguous<T>(Av, nchannels)
80- && ChannelsContiguous<T>(Bv, nchannels);
81-
82- for (int y = roi.ybegin ; y < roi.yend ; ++y) {
83- T* r_row = RoiRowPtr<T>(Rv, y, roi);
84- const T* a_row = RoiRowPtr<T>(Av, y, roi);
85- const T* b_row = RoiRowPtr<T>(Bv, y, roi);
86-
87- if (contig) {
88- // Native integer saturated add - much faster than float conversion!
89- size_t n = static_cast <size_t >(roi.width ())
90- * static_cast <size_t >(nchannels);
91- RunHwyBinaryNativeInt<T>(r_row, a_row, b_row, n,
92- [](auto d, auto a, auto b) {
93- return hn::SaturatedAdd (a, b);
94- });
95- } else {
96- // Scalar fallback
97- for (int x = roi.xbegin ; x < roi.xend ; ++x) {
98- T* r_ptr = ChannelPtr<T>(Rv, x, y, roi.chbegin );
99- const T* a_ptr = ChannelPtr<T>(Av, x, y, roi.chbegin );
100- const T* b_ptr = ChannelPtr<T>(Bv, x, y, roi.chbegin );
101- for (int c = 0 ; c < nchannels; ++c) {
102- // Saturating add in scalar
103- int64_t sum = (int64_t )a_ptr[c] + (int64_t )b_ptr[c];
104- if constexpr (std::is_unsigned_v<T>) {
105- r_ptr[c] = (sum > std::numeric_limits<T>::max ())
106- ? std::numeric_limits<T>::max ()
107- : (T)sum;
108- } else {
109- r_ptr[c] = (sum > std::numeric_limits<T>::max ())
110- ? std::numeric_limits<T>::max ()
111- : (sum < std::numeric_limits<T>::min ())
112- ? std::numeric_limits<T>::min ()
113- : (T)sum;
114- }
115- }
116- }
117- }
118- }
119- });
120- return true ;
74+ return hwy_binary_native_int_perpixel_op<T>(R, A, B, roi, nthreads,
75+ [](auto /* d*/ , auto a, auto b) {
76+ return hn::SaturatedAdd (a, b);
77+ });
12178}
12279
12380template <class Rtype , class Atype , class Btype >
12481static bool
12582add_impl_hwy (ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
12683 int nthreads)
12784{
128- auto Rv = HwyPixels (R);
129- auto Av = HwyPixels (A);
130- auto Bv = HwyPixels (B);
131- ImageBufAlgo::parallel_image (roi, nthreads, [&](ROI roi) {
132- const int nchannels = RoiNChannels (roi);
133- const bool contig = ChannelsContiguous<Rtype>(Rv, nchannels)
134- && ChannelsContiguous<Atype>(Av, nchannels)
135- && ChannelsContiguous<Btype>(Bv, nchannels);
136-
137- for (int y = roi.ybegin ; y < roi.yend ; ++y) {
138- Rtype* r_row = RoiRowPtr<Rtype>(Rv, y, roi);
139- const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);
140- const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi);
141-
142- if (contig) {
143- // Process whole line as one vector stream
144- size_t n = static_cast <size_t >(roi.width ())
145- * static_cast <size_t >(nchannels);
146- RunHwyCmd<Rtype, Atype, Btype>(r_row, a_row, b_row, n,
147- [](auto d, auto a, auto b) {
148- return hn::Add (a, b);
149- });
150- } else {
151- // Process pixel by pixel (scalar fallback for strided channels)
152- for (int x = roi.xbegin ; x < roi.xend ; ++x) {
153- Rtype* r_ptr = ChannelPtr<Rtype>(Rv, x, y, roi.chbegin );
154- const Atype* a_ptr = ChannelPtr<Atype>(Av, x, y,
155- roi.chbegin );
156- const Btype* b_ptr = ChannelPtr<Btype>(Bv, x, y,
157- roi.chbegin );
158- for (int c = 0 ; c < nchannels; ++c) {
159- r_ptr[c] = static_cast <Rtype>(
160- static_cast <float >(a_ptr[c])
161- + static_cast <float >(b_ptr[c]));
162- }
163- }
164- }
165- }
166- });
167- return true ;
85+ return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
86+ [](auto /* d*/ , auto a, auto b) {
87+ return hn::Add (a, b);
88+ });
16889}
16990
17091template <class Rtype , class Atype >
@@ -204,15 +125,24 @@ add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
204125#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
205126 if (OIIO::pvt::enable_hwy && R.localpixels () && A.localpixels ()
206127 && B.localpixels ()) {
207- // Use native integer path for scale-invariant add when all types match
208- // and are integer types (much faster: 6-12x vs 3-5x with float conversion)
209- constexpr bool all_same = std::is_same_v<Rtype, Atype>
210- && std::is_same_v<Atype, Btype>;
211- constexpr bool is_integer = std::is_integral_v<Rtype>;
212- if constexpr (all_same && is_integer) {
213- return add_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
128+ auto Rv = HwyPixels (R);
129+ auto Av = HwyPixels (A);
130+ auto Bv = HwyPixels (B);
131+ const int nchannels = RoiNChannels (roi);
132+ const bool contig = ChannelsContiguous<Rtype>(Rv, nchannels)
133+ && ChannelsContiguous<Atype>(Av, nchannels)
134+ && ChannelsContiguous<Btype>(Bv, nchannels);
135+ if (contig) {
136+ // Use native integer path for scale-invariant add when all types
137+ // match and are integer types (much faster: 6-12x vs 3-5x with
138+ // float conversion).
139+ constexpr bool all_same = std::is_same_v<Rtype, Atype>
140+ && std::is_same_v<Atype, Btype>;
141+ constexpr bool is_integer = std::is_integral_v<Rtype>;
142+ if constexpr (all_same && is_integer)
143+ return add_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
144+ return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
214145 }
215- return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
216146 }
217147#endif
218148 return add_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
@@ -236,100 +166,21 @@ static bool
236166sub_impl_hwy_native_int (ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
237167 ROI roi, int nthreads)
238168{
239- auto Rv = HwyPixels (R);
240- auto Av = HwyPixels (A);
241- auto Bv = HwyPixels (B);
242- ImageBufAlgo::parallel_image (roi, nthreads, [&](ROI roi) {
243- const int nchannels = RoiNChannels (roi);
244- const bool contig = ChannelsContiguous<T>(Rv, nchannels)
245- && ChannelsContiguous<T>(Av, nchannels)
246- && ChannelsContiguous<T>(Bv, nchannels);
247-
248- for (int y = roi.ybegin ; y < roi.yend ; ++y) {
249- T* r_row = RoiRowPtr<T>(Rv, y, roi);
250- const T* a_row = RoiRowPtr<T>(Av, y, roi);
251- const T* b_row = RoiRowPtr<T>(Bv, y, roi);
252-
253- if (contig) {
254- // Native integer saturated sub - much faster than float conversion!
255- size_t n = static_cast <size_t >(roi.width ())
256- * static_cast <size_t >(nchannels);
257- RunHwyBinaryNativeInt<T>(r_row, a_row, b_row, n,
258- [](auto d, auto a, auto b) {
259- return hn::SaturatedSub (a, b);
260- });
261- } else {
262- // Scalar fallback
263- for (int x = roi.xbegin ; x < roi.xend ; ++x) {
264- T* r_ptr = ChannelPtr<T>(Rv, x, y, roi.chbegin );
265- const T* a_ptr = ChannelPtr<T>(Av, x, y, roi.chbegin );
266- const T* b_ptr = ChannelPtr<T>(Bv, x, y, roi.chbegin );
267- for (int c = 0 ; c < nchannels; ++c) {
268- // Saturating sub in scalar
269- if constexpr (std::is_unsigned_v<T>) {
270- r_ptr[c] = (a_ptr[c] > b_ptr[c])
271- ? (a_ptr[c] - b_ptr[c])
272- : T (0 );
273- } else {
274- int64_t diff = (int64_t )a_ptr[c]
275- - (int64_t )b_ptr[c];
276- r_ptr[c] = (diff > std::numeric_limits<T>::max ())
277- ? std::numeric_limits<T>::max ()
278- : (diff < std::numeric_limits<T>::min ())
279- ? std::numeric_limits<T>::min ()
280- : (T)diff;
281- }
282- }
283- }
284- }
285- }
286- });
287- return true ;
169+ return hwy_binary_native_int_perpixel_op<T>(R, A, B, roi, nthreads,
170+ [](auto /* d*/ , auto a, auto b) {
171+ return hn::SaturatedSub (a, b);
172+ });
288173}
289174
290175template <class Rtype , class Atype , class Btype >
291176static bool
292177sub_impl_hwy (ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
293178 int nthreads)
294179{
295- auto Rv = HwyPixels (R);
296- auto Av = HwyPixels (A);
297- auto Bv = HwyPixels (B);
298- ImageBufAlgo::parallel_image (roi, nthreads, [&](ROI roi) {
299- const int nchannels = RoiNChannels (roi);
300- const bool contig = ChannelsContiguous<Rtype>(Rv, nchannels)
301- && ChannelsContiguous<Atype>(Av, nchannels)
302- && ChannelsContiguous<Btype>(Bv, nchannels);
303-
304- for (int y = roi.ybegin ; y < roi.yend ; ++y) {
305- Rtype* r_row = RoiRowPtr<Rtype>(Rv, y, roi);
306- const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);
307- const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi);
308-
309- if (contig) {
310- size_t n = static_cast <size_t >(roi.width ())
311- * static_cast <size_t >(nchannels);
312- RunHwyCmd<Rtype, Atype, Btype>(r_row, a_row, b_row, n,
313- [](auto d, auto a, auto b) {
314- return hn::Sub (a, b);
315- });
316- } else {
317- for (int x = roi.xbegin ; x < roi.xend ; ++x) {
318- Rtype* r_ptr = ChannelPtr<Rtype>(Rv, x, y, roi.chbegin );
319- const Atype* a_ptr = ChannelPtr<Atype>(Av, x, y,
320- roi.chbegin );
321- const Btype* b_ptr = ChannelPtr<Btype>(Bv, x, y,
322- roi.chbegin );
323- for (int c = 0 ; c < nchannels; ++c) {
324- r_ptr[c] = static_cast <Rtype>(
325- static_cast <float >(a_ptr[c])
326- - static_cast <float >(b_ptr[c]));
327- }
328- }
329- }
330- }
331- });
332- return true ;
180+ return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
181+ [](auto /* d*/ , auto a, auto b) {
182+ return hn::Sub (a, b);
183+ });
333184}
334185#endif // defined(OIIO_USE_HWY) && OIIO_USE_HWY
335186
@@ -341,15 +192,24 @@ sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
341192#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
342193 if (OIIO::pvt::enable_hwy && R.localpixels () && A.localpixels ()
343194 && B.localpixels ()) {
344- // Use native integer path for scale-invariant sub when all types match
345- // and are integer types (much faster: 6-12x vs 3-5x with float conversion)
346- constexpr bool all_same = std::is_same_v<Rtype, Atype>
347- && std::is_same_v<Atype, Btype>;
348- constexpr bool is_integer = std::is_integral_v<Rtype>;
349- if constexpr (all_same && is_integer) {
350- return sub_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
195+ auto Rv = HwyPixels (R);
196+ auto Av = HwyPixels (A);
197+ auto Bv = HwyPixels (B);
198+ const int nchannels = RoiNChannels (roi);
199+ const bool contig = ChannelsContiguous<Rtype>(Rv, nchannels)
200+ && ChannelsContiguous<Atype>(Av, nchannels)
201+ && ChannelsContiguous<Btype>(Bv, nchannels);
202+ if (contig) {
203+ // Use native integer path for scale-invariant sub when all types
204+ // match and are integer types (much faster: 6-12x vs 3-5x with
205+ // float conversion).
206+ constexpr bool all_same = std::is_same_v<Rtype, Atype>
207+ && std::is_same_v<Atype, Btype>;
208+ constexpr bool is_integer = std::is_integral_v<Rtype>;
209+ if constexpr (all_same && is_integer)
210+ return sub_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
211+ return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
351212 }
352- return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
353213 }
354214#endif
355215 return sub_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
0 commit comments