@@ -64,8 +64,8 @@ add_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi,
6464// Native integer add using SaturatedAdd (scale-invariant, no float conversion)
6565template <class T >
6666static bool
67- add_impl_hwy_native_int (ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
68- int nthreads)
67+ add_impl_hwy_native_int (ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
68+ ROI roi, int nthreads)
6969{
7070 ImageBufAlgo::parallel_image (roi, nthreads, [&](ROI roi) {
7171 const ImageSpec& Rspec = R.spec ();
@@ -103,31 +103,34 @@ add_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI r
103103 if (contig) {
104104 // Native integer saturated add - much faster than float conversion!
105105 size_t n = static_cast <size_t >(roi.width ()) * nchannels;
106- RunHwyBinaryNativeInt<T>(
107- reinterpret_cast <T*>(r_row),
108- reinterpret_cast <const T*>(a_row),
109- reinterpret_cast <const T*>(b_row), n,
110- [](auto d, auto a, auto b) { return hn::SaturatedAdd (a, b); });
106+ RunHwyBinaryNativeInt<T>(reinterpret_cast <T*>(r_row),
107+ reinterpret_cast <const T*>(a_row),
108+ reinterpret_cast <const T*>(b_row), n,
109+ [](auto d, auto a, auto b) {
110+ return hn::SaturatedAdd (a, b);
111+ });
111112 } else {
112113 // Scalar fallback
113114 for (int x = 0 ; x < roi.width (); ++x) {
114115 T* r_ptr = reinterpret_cast <T*>(r_row)
115- + x * r_pixel_bytes / sizeof (T);
116+ + x * r_pixel_bytes / sizeof (T);
116117 const T* a_ptr = reinterpret_cast <const T*>(a_row)
117- + x * a_pixel_bytes / sizeof (T);
118+ + x * a_pixel_bytes / sizeof (T);
118119 const T* b_ptr = reinterpret_cast <const T*>(b_row)
119- + x * b_pixel_bytes / sizeof (T);
120+ + x * b_pixel_bytes / sizeof (T);
120121 for (int c = 0 ; c < nchannels; ++c) {
121122 // Saturating add in scalar
122123 int64_t sum = (int64_t )a_ptr[c] + (int64_t )b_ptr[c];
123124 if constexpr (std::is_unsigned_v<T>) {
124125 r_ptr[c] = (sum > std::numeric_limits<T>::max ())
125- ? std::numeric_limits<T>::max () : (T)sum;
126+ ? std::numeric_limits<T>::max ()
127+ : (T)sum;
126128 } else {
127129 r_ptr[c] = (sum > std::numeric_limits<T>::max ())
128- ? std::numeric_limits<T>::max ()
129- : (sum < std::numeric_limits<T>::min ())
130- ? std::numeric_limits<T>::min () : (T)sum;
130+ ? std::numeric_limits<T>::max ()
131+ : (sum < std::numeric_limits<T>::min ())
132+ ? std::numeric_limits<T>::min ()
133+ : (T)sum;
131134 }
132135 }
133136 }
@@ -193,8 +196,9 @@ add_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
193196 const Btype* b_ptr = reinterpret_cast <const Btype*>(b_row)
194197 + x * b_pixel_bytes / sizeof (Btype);
195198 for (int c = 0 ; c < nchannels; ++c) {
196- r_ptr[c] = static_cast <Rtype>(static_cast <float >(a_ptr[c]) +
197- static_cast <float >(b_ptr[c]));
199+ r_ptr[c] = static_cast <Rtype>(
200+ static_cast <float >(a_ptr[c])
201+ + static_cast <float >(b_ptr[c]));
198202 }
199203 }
200204 }
@@ -248,7 +252,8 @@ add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
248252 && B.localpixels ()) {
249253 // Use native integer path for scale-invariant add when all types match
250254 // and are integer types (much faster: 6-12x vs 3-5x with float conversion)
251- constexpr bool all_same = std::is_same_v<Rtype, Atype> && std::is_same_v<Atype, Btype>;
255+ constexpr bool all_same = std::is_same_v<Rtype, Atype>
256+ && std::is_same_v<Atype, Btype>;
252257 constexpr bool is_integer = std::is_integral_v<Rtype>;
253258 if constexpr (all_same && is_integer) {
254259 return add_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
@@ -270,8 +275,8 @@ add_impl(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi, int nthreads)
270275// Native integer sub using SaturatedSub (scale-invariant, no float conversion)
271276template <class T >
272277static bool
273- sub_impl_hwy_native_int (ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
274- int nthreads)
278+ sub_impl_hwy_native_int (ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
279+ ROI roi, int nthreads)
275280{
276281 ImageBufAlgo::parallel_image (roi, nthreads, [&](ROI roi) {
277282 const ImageSpec& Rspec = R.spec ();
@@ -309,31 +314,35 @@ sub_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI r
309314 if (contig) {
310315 // Native integer saturated sub - much faster than float conversion!
311316 size_t n = static_cast <size_t >(roi.width ()) * nchannels;
312- RunHwyBinaryNativeInt<T>(
313- reinterpret_cast <T*>(r_row),
314- reinterpret_cast <const T*>(a_row),
315- reinterpret_cast <const T*>(b_row), n,
316- [](auto d, auto a, auto b) { return hn::SaturatedSub (a, b); });
317+ RunHwyBinaryNativeInt<T>(reinterpret_cast <T*>(r_row),
318+ reinterpret_cast <const T*>(a_row),
319+ reinterpret_cast <const T*>(b_row), n,
320+ [](auto d, auto a, auto b) {
321+ return hn::SaturatedSub (a, b);
322+ });
317323 } else {
318324 // Scalar fallback
319325 for (int x = 0 ; x < roi.width (); ++x) {
320326 T* r_ptr = reinterpret_cast <T*>(r_row)
321- + x * r_pixel_bytes / sizeof (T);
327+ + x * r_pixel_bytes / sizeof (T);
322328 const T* a_ptr = reinterpret_cast <const T*>(a_row)
323- + x * a_pixel_bytes / sizeof (T);
329+ + x * a_pixel_bytes / sizeof (T);
324330 const T* b_ptr = reinterpret_cast <const T*>(b_row)
325- + x * b_pixel_bytes / sizeof (T);
331+ + x * b_pixel_bytes / sizeof (T);
326332 for (int c = 0 ; c < nchannels; ++c) {
327333 // Saturating sub in scalar
328334 if constexpr (std::is_unsigned_v<T>) {
329335 r_ptr[c] = (a_ptr[c] > b_ptr[c])
330- ? (a_ptr[c] - b_ptr[c]) : T (0 );
336+ ? (a_ptr[c] - b_ptr[c])
337+ : T (0 );
331338 } else {
332- int64_t diff = (int64_t )a_ptr[c] - (int64_t )b_ptr[c];
339+ int64_t diff = (int64_t )a_ptr[c]
340+ - (int64_t )b_ptr[c];
333341 r_ptr[c] = (diff > std::numeric_limits<T>::max ())
334- ? std::numeric_limits<T>::max ()
335- : (diff < std::numeric_limits<T>::min ())
336- ? std::numeric_limits<T>::min () : (T)diff;
342+ ? std::numeric_limits<T>::max ()
343+ : (diff < std::numeric_limits<T>::min ())
344+ ? std::numeric_limits<T>::min ()
345+ : (T)diff;
337346 }
338347 }
339348 }
@@ -396,8 +405,9 @@ sub_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
396405 const Btype* b_ptr = reinterpret_cast <const Btype*>(b_row)
397406 + x * b_pixel_bytes / sizeof (Btype);
398407 for (int c = 0 ; c < nchannels; ++c) {
399- r_ptr[c] = static_cast <Rtype>(static_cast <float >(a_ptr[c]) -
400- static_cast <float >(b_ptr[c]));
408+ r_ptr[c] = static_cast <Rtype>(
409+ static_cast <float >(a_ptr[c])
410+ - static_cast <float >(b_ptr[c]));
401411 }
402412 }
403413 }
@@ -415,7 +425,8 @@ sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
415425 && B.localpixels ()) {
416426 // Use native integer path for scale-invariant sub when all types match
417427 // and are integer types (much faster: 6-12x vs 3-5x with float conversion)
418- constexpr bool all_same = std::is_same_v<Rtype, Atype> && std::is_same_v<Atype, Btype>;
428+ constexpr bool all_same = std::is_same_v<Rtype, Atype>
429+ && std::is_same_v<Atype, Btype>;
419430 constexpr bool is_integer = std::is_integral_v<Rtype>;
420431 if constexpr (all_same && is_integer) {
421432 return sub_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
0 commit comments