Skip to content

Commit 0dec8a0

Browse files
committed
Refactor HWY per-pixel ops and add strided ROI fallback tests
Adds generic per-pixel HWY operation helpers for binary and ternary ops, refactors the add/sub/mul/div/mad HWY implementations to use these helpers, and ensures HWY SIMD is only used for contiguous channel ranges. Also adds a new test to verify correct fallback to scalar code for strided (non-contiguous) ROI channel ranges. Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
1 parent 6e16329 commit 0dec8a0

File tree

6 files changed

+259
-325
lines changed

6 files changed

+259
-325
lines changed

src/include/imageio_pvt.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ extern int oiio_log_times;
4343
extern int openexr_core;
4444
extern int jpeg_com_attributes;
4545
extern int png_linear_premult;
46+
extern int enable_hwy;
4647
extern int limit_channels;
4748
extern int limit_imagesize_MB;
4849
extern int imagebuf_print_uncaught_errors;

src/libOpenImageIO/imagebufalgo_addsub.cpp

Lines changed: 51 additions & 191 deletions
Original file line numberDiff line numberDiff line change
@@ -64,107 +64,28 @@ add_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi,
6464

6565

6666
#if OIIO_USE_HWY
67+
6768
// Native integer add using SaturatedAdd (scale-invariant, no float conversion)
6869
template<class T>
6970
static bool
7071
add_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
7172
ROI roi, int nthreads)
7273
{
73-
auto Rv = HwyPixels(R);
74-
auto Av = HwyPixels(A);
75-
auto Bv = HwyPixels(B);
76-
ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
77-
const int nchannels = RoiNChannels(roi);
78-
const bool contig = ChannelsContiguous<T>(Rv, nchannels)
79-
&& ChannelsContiguous<T>(Av, nchannels)
80-
&& ChannelsContiguous<T>(Bv, nchannels);
81-
82-
for (int y = roi.ybegin; y < roi.yend; ++y) {
83-
T* r_row = RoiRowPtr<T>(Rv, y, roi);
84-
const T* a_row = RoiRowPtr<T>(Av, y, roi);
85-
const T* b_row = RoiRowPtr<T>(Bv, y, roi);
86-
87-
if (contig) {
88-
// Native integer saturated add - much faster than float conversion!
89-
size_t n = static_cast<size_t>(roi.width())
90-
* static_cast<size_t>(nchannels);
91-
RunHwyBinaryNativeInt<T>(r_row, a_row, b_row, n,
92-
[](auto d, auto a, auto b) {
93-
return hn::SaturatedAdd(a, b);
94-
});
95-
} else {
96-
// Scalar fallback
97-
for (int x = roi.xbegin; x < roi.xend; ++x) {
98-
T* r_ptr = ChannelPtr<T>(Rv, x, y, roi.chbegin);
99-
const T* a_ptr = ChannelPtr<T>(Av, x, y, roi.chbegin);
100-
const T* b_ptr = ChannelPtr<T>(Bv, x, y, roi.chbegin);
101-
for (int c = 0; c < nchannels; ++c) {
102-
// Saturating add in scalar
103-
int64_t sum = (int64_t)a_ptr[c] + (int64_t)b_ptr[c];
104-
if constexpr (std::is_unsigned_v<T>) {
105-
r_ptr[c] = (sum > std::numeric_limits<T>::max())
106-
? std::numeric_limits<T>::max()
107-
: (T)sum;
108-
} else {
109-
r_ptr[c] = (sum > std::numeric_limits<T>::max())
110-
? std::numeric_limits<T>::max()
111-
: (sum < std::numeric_limits<T>::min())
112-
? std::numeric_limits<T>::min()
113-
: (T)sum;
114-
}
115-
}
116-
}
117-
}
118-
}
119-
});
120-
return true;
74+
return hwy_binary_native_int_perpixel_op<T>(R, A, B, roi, nthreads,
75+
[](auto /*d*/, auto a, auto b) {
76+
return hn::SaturatedAdd(a, b);
77+
});
12178
}
12279

12380
template<class Rtype, class Atype, class Btype>
12481
static bool
12582
add_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
12683
int nthreads)
12784
{
128-
auto Rv = HwyPixels(R);
129-
auto Av = HwyPixels(A);
130-
auto Bv = HwyPixels(B);
131-
ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
132-
const int nchannels = RoiNChannels(roi);
133-
const bool contig = ChannelsContiguous<Rtype>(Rv, nchannels)
134-
&& ChannelsContiguous<Atype>(Av, nchannels)
135-
&& ChannelsContiguous<Btype>(Bv, nchannels);
136-
137-
for (int y = roi.ybegin; y < roi.yend; ++y) {
138-
Rtype* r_row = RoiRowPtr<Rtype>(Rv, y, roi);
139-
const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);
140-
const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi);
141-
142-
if (contig) {
143-
// Process whole line as one vector stream
144-
size_t n = static_cast<size_t>(roi.width())
145-
* static_cast<size_t>(nchannels);
146-
RunHwyCmd<Rtype, Atype, Btype>(r_row, a_row, b_row, n,
147-
[](auto d, auto a, auto b) {
148-
return hn::Add(a, b);
149-
});
150-
} else {
151-
// Process pixel by pixel (scalar fallback for strided channels)
152-
for (int x = roi.xbegin; x < roi.xend; ++x) {
153-
Rtype* r_ptr = ChannelPtr<Rtype>(Rv, x, y, roi.chbegin);
154-
const Atype* a_ptr = ChannelPtr<Atype>(Av, x, y,
155-
roi.chbegin);
156-
const Btype* b_ptr = ChannelPtr<Btype>(Bv, x, y,
157-
roi.chbegin);
158-
for (int c = 0; c < nchannels; ++c) {
159-
r_ptr[c] = static_cast<Rtype>(
160-
static_cast<float>(a_ptr[c])
161-
+ static_cast<float>(b_ptr[c]));
162-
}
163-
}
164-
}
165-
}
166-
});
167-
return true;
85+
return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
86+
[](auto /*d*/, auto a, auto b) {
87+
return hn::Add(a, b);
88+
});
16889
}
16990

17091
template<class Rtype, class Atype>
@@ -204,15 +125,24 @@ add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
204125
#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
205126
if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
206127
&& B.localpixels()) {
207-
// Use native integer path for scale-invariant add when all types match
208-
// and are integer types (much faster: 6-12x vs 3-5x with float conversion)
209-
constexpr bool all_same = std::is_same_v<Rtype, Atype>
210-
&& std::is_same_v<Atype, Btype>;
211-
constexpr bool is_integer = std::is_integral_v<Rtype>;
212-
if constexpr (all_same && is_integer) {
213-
return add_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
128+
auto Rv = HwyPixels(R);
129+
auto Av = HwyPixels(A);
130+
auto Bv = HwyPixels(B);
131+
const int nchannels = RoiNChannels(roi);
132+
const bool contig = ChannelsContiguous<Rtype>(Rv, nchannels)
133+
&& ChannelsContiguous<Atype>(Av, nchannels)
134+
&& ChannelsContiguous<Btype>(Bv, nchannels);
135+
if (contig) {
136+
// Use native integer path for scale-invariant add when all types
137+
// match and are integer types (much faster: 6-12x vs 3-5x with
138+
// float conversion).
139+
constexpr bool all_same = std::is_same_v<Rtype, Atype>
140+
&& std::is_same_v<Atype, Btype>;
141+
constexpr bool is_integer = std::is_integral_v<Rtype>;
142+
if constexpr (all_same && is_integer)
143+
return add_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
144+
return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
214145
}
215-
return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
216146
}
217147
#endif
218148
return add_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
@@ -236,100 +166,21 @@ static bool
236166
sub_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
237167
ROI roi, int nthreads)
238168
{
239-
auto Rv = HwyPixels(R);
240-
auto Av = HwyPixels(A);
241-
auto Bv = HwyPixels(B);
242-
ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
243-
const int nchannels = RoiNChannels(roi);
244-
const bool contig = ChannelsContiguous<T>(Rv, nchannels)
245-
&& ChannelsContiguous<T>(Av, nchannels)
246-
&& ChannelsContiguous<T>(Bv, nchannels);
247-
248-
for (int y = roi.ybegin; y < roi.yend; ++y) {
249-
T* r_row = RoiRowPtr<T>(Rv, y, roi);
250-
const T* a_row = RoiRowPtr<T>(Av, y, roi);
251-
const T* b_row = RoiRowPtr<T>(Bv, y, roi);
252-
253-
if (contig) {
254-
// Native integer saturated sub - much faster than float conversion!
255-
size_t n = static_cast<size_t>(roi.width())
256-
* static_cast<size_t>(nchannels);
257-
RunHwyBinaryNativeInt<T>(r_row, a_row, b_row, n,
258-
[](auto d, auto a, auto b) {
259-
return hn::SaturatedSub(a, b);
260-
});
261-
} else {
262-
// Scalar fallback
263-
for (int x = roi.xbegin; x < roi.xend; ++x) {
264-
T* r_ptr = ChannelPtr<T>(Rv, x, y, roi.chbegin);
265-
const T* a_ptr = ChannelPtr<T>(Av, x, y, roi.chbegin);
266-
const T* b_ptr = ChannelPtr<T>(Bv, x, y, roi.chbegin);
267-
for (int c = 0; c < nchannels; ++c) {
268-
// Saturating sub in scalar
269-
if constexpr (std::is_unsigned_v<T>) {
270-
r_ptr[c] = (a_ptr[c] > b_ptr[c])
271-
? (a_ptr[c] - b_ptr[c])
272-
: T(0);
273-
} else {
274-
int64_t diff = (int64_t)a_ptr[c]
275-
- (int64_t)b_ptr[c];
276-
r_ptr[c] = (diff > std::numeric_limits<T>::max())
277-
? std::numeric_limits<T>::max()
278-
: (diff < std::numeric_limits<T>::min())
279-
? std::numeric_limits<T>::min()
280-
: (T)diff;
281-
}
282-
}
283-
}
284-
}
285-
}
286-
});
287-
return true;
169+
return hwy_binary_native_int_perpixel_op<T>(R, A, B, roi, nthreads,
170+
[](auto /*d*/, auto a, auto b) {
171+
return hn::SaturatedSub(a, b);
172+
});
288173
}
289174

290175
template<class Rtype, class Atype, class Btype>
291176
static bool
292177
sub_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
293178
int nthreads)
294179
{
295-
auto Rv = HwyPixels(R);
296-
auto Av = HwyPixels(A);
297-
auto Bv = HwyPixels(B);
298-
ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
299-
const int nchannels = RoiNChannels(roi);
300-
const bool contig = ChannelsContiguous<Rtype>(Rv, nchannels)
301-
&& ChannelsContiguous<Atype>(Av, nchannels)
302-
&& ChannelsContiguous<Btype>(Bv, nchannels);
303-
304-
for (int y = roi.ybegin; y < roi.yend; ++y) {
305-
Rtype* r_row = RoiRowPtr<Rtype>(Rv, y, roi);
306-
const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);
307-
const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi);
308-
309-
if (contig) {
310-
size_t n = static_cast<size_t>(roi.width())
311-
* static_cast<size_t>(nchannels);
312-
RunHwyCmd<Rtype, Atype, Btype>(r_row, a_row, b_row, n,
313-
[](auto d, auto a, auto b) {
314-
return hn::Sub(a, b);
315-
});
316-
} else {
317-
for (int x = roi.xbegin; x < roi.xend; ++x) {
318-
Rtype* r_ptr = ChannelPtr<Rtype>(Rv, x, y, roi.chbegin);
319-
const Atype* a_ptr = ChannelPtr<Atype>(Av, x, y,
320-
roi.chbegin);
321-
const Btype* b_ptr = ChannelPtr<Btype>(Bv, x, y,
322-
roi.chbegin);
323-
for (int c = 0; c < nchannels; ++c) {
324-
r_ptr[c] = static_cast<Rtype>(
325-
static_cast<float>(a_ptr[c])
326-
- static_cast<float>(b_ptr[c]));
327-
}
328-
}
329-
}
330-
}
331-
});
332-
return true;
180+
return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
181+
[](auto /*d*/, auto a, auto b) {
182+
return hn::Sub(a, b);
183+
});
333184
}
334185
#endif // defined(OIIO_USE_HWY) && OIIO_USE_HWY
335186

@@ -341,15 +192,24 @@ sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
341192
#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
342193
if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
343194
&& B.localpixels()) {
344-
// Use native integer path for scale-invariant sub when all types match
345-
// and are integer types (much faster: 6-12x vs 3-5x with float conversion)
346-
constexpr bool all_same = std::is_same_v<Rtype, Atype>
347-
&& std::is_same_v<Atype, Btype>;
348-
constexpr bool is_integer = std::is_integral_v<Rtype>;
349-
if constexpr (all_same && is_integer) {
350-
return sub_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
195+
auto Rv = HwyPixels(R);
196+
auto Av = HwyPixels(A);
197+
auto Bv = HwyPixels(B);
198+
const int nchannels = RoiNChannels(roi);
199+
const bool contig = ChannelsContiguous<Rtype>(Rv, nchannels)
200+
&& ChannelsContiguous<Atype>(Av, nchannels)
201+
&& ChannelsContiguous<Btype>(Bv, nchannels);
202+
if (contig) {
203+
// Use native integer path for scale-invariant sub when all types
204+
// match and are integer types (much faster: 6-12x vs 3-5x with
205+
// float conversion).
206+
constexpr bool all_same = std::is_same_v<Rtype, Atype>
207+
&& std::is_same_v<Atype, Btype>;
208+
constexpr bool is_integer = std::is_integral_v<Rtype>;
209+
if constexpr (all_same && is_integer)
210+
return sub_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
211+
return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
351212
}
352-
return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
353213
}
354214
#endif
355215
return sub_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);

src/libOpenImageIO/imagebufalgo_hwy_pvt.h

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include <OpenImageIO/half.h>
88
#include <OpenImageIO/imagebuf.h>
9+
#include <OpenImageIO/imagebufalgo_util.h>
910
#include <OpenImageIO/imageio.h>
1011
#include <algorithm>
1112
#include <cstddef>
@@ -756,6 +757,90 @@ RunHwyTernaryCmd(Rtype* r, const ABCtype* a, const ABCtype* b, const ABCtype* c,
756757
}
757758
}
758759

760+
// -----------------------------------------------------------------------
761+
// Per-pixel Ops (ImageBufAlgo, contiguous interleaved channels)
762+
// -----------------------------------------------------------------------
763+
764+
/// Execute a binary per-pixel HWY operation for interleaved, contiguous
765+
/// channels. The caller is responsible for ensuring that the channel range is
766+
/// contiguous for R/A/B (i.e. no per-pixel padding, and the ROI channel range
767+
/// covers the full pixel).
768+
template<typename Rtype, typename Atype, typename Btype, typename OpFunc>
769+
inline bool
770+
hwy_binary_perpixel_op(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
771+
int nthreads, OpFunc op)
772+
{
773+
auto Rv = HwyPixels(R);
774+
auto Av = HwyPixels(A);
775+
auto Bv = HwyPixels(B);
776+
ImageBufAlgo::parallel_image(roi, nthreads, [&, op](ROI roi) {
777+
const int nchannels = RoiNChannels(roi);
778+
const size_t n = static_cast<size_t>(roi.width())
779+
* static_cast<size_t>(nchannels);
780+
for (int y = roi.ybegin; y < roi.yend; ++y) {
781+
Rtype* r_row = RoiRowPtr<Rtype>(Rv, y, roi);
782+
const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);
783+
const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi);
784+
RunHwyCmd<Rtype, Atype, Btype>(r_row, a_row, b_row, n, op);
785+
}
786+
});
787+
return true;
788+
}
789+
790+
/// Execute a ternary per-pixel HWY operation for interleaved, contiguous
791+
/// channels. The caller is responsible for ensuring that the channel range is
792+
/// contiguous for R/A/B/C (i.e. no per-pixel padding, and the ROI channel range
793+
/// covers the full pixel).
794+
template<typename Rtype, typename ABCtype, typename OpFunc>
795+
inline bool
796+
hwy_ternary_perpixel_op(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
797+
const ImageBuf& C, ROI roi, int nthreads, OpFunc op)
798+
{
799+
auto Rv = HwyPixels(R);
800+
auto Av = HwyPixels(A);
801+
auto Bv = HwyPixels(B);
802+
auto Cv = HwyPixels(C);
803+
ImageBufAlgo::parallel_image(roi, nthreads, [&, op](ROI roi) {
804+
const int nchannels = RoiNChannels(roi);
805+
const size_t n = static_cast<size_t>(roi.width())
806+
* static_cast<size_t>(nchannels);
807+
for (int y = roi.ybegin; y < roi.yend; ++y) {
808+
Rtype* r_row = RoiRowPtr<Rtype>(Rv, y, roi);
809+
const ABCtype* a_row = RoiRowPtr<ABCtype>(Av, y, roi);
810+
const ABCtype* b_row = RoiRowPtr<ABCtype>(Bv, y, roi);
811+
const ABCtype* c_row = RoiRowPtr<ABCtype>(Cv, y, roi);
812+
RunHwyTernaryCmd<Rtype, ABCtype>(r_row, a_row, b_row, c_row, n, op);
813+
}
814+
});
815+
return true;
816+
}
817+
818+
/// Execute a binary per-pixel HWY operation on native integer arrays (no type
819+
/// promotion/normalization). The caller is responsible for ensuring that the
820+
/// channel range is contiguous for R/A/B.
821+
template<typename T, typename OpFunc>
822+
inline bool
823+
hwy_binary_native_int_perpixel_op(ImageBuf& R, const ImageBuf& A,
824+
const ImageBuf& B, ROI roi, int nthreads,
825+
OpFunc op)
826+
{
827+
auto Rv = HwyPixels(R);
828+
auto Av = HwyPixels(A);
829+
auto Bv = HwyPixels(B);
830+
ImageBufAlgo::parallel_image(roi, nthreads, [&, op](ROI roi) {
831+
const int nchannels = RoiNChannels(roi);
832+
const size_t n = static_cast<size_t>(roi.width())
833+
* static_cast<size_t>(nchannels);
834+
for (int y = roi.ybegin; y < roi.yend; ++y) {
835+
T* r_row = RoiRowPtr<T>(Rv, y, roi);
836+
const T* a_row = RoiRowPtr<T>(Av, y, roi);
837+
const T* b_row = RoiRowPtr<T>(Bv, y, roi);
838+
RunHwyBinaryNativeInt<T>(r_row, a_row, b_row, n, op);
839+
}
840+
});
841+
return true;
842+
}
843+
759844
// -----------------------------------------------------------------------
760845
// Interleaved Channel Load/Store Helpers
761846
// -----------------------------------------------------------------------

0 commit comments

Comments
 (0)