Skip to content

Commit 9f63efb

Browse files
Implement reduce_min and reduce_max
Using a generic reducer, aka 'butterfly reduction'. As a side effect, fix a bug in (untested until then) SSSE3 swizzle implementation for int8 and int16. Fix #219 (from 2018 ^^!)
1 parent 9dce801 commit 9f63efb

File tree

12 files changed

+192
-25
lines changed

12 files changed

+192
-25
lines changed

include/xsimd/arch/generic/xsimd_generic_math.hpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1976,6 +1976,49 @@ namespace xsimd
19761976
return { reduce_add(self.real()), reduce_add(self.imag()) };
19771977
}
19781978

1979+
namespace detail
1980+
{
1981+
template <class T, T N>
1982+
struct SplitHigh
1983+
{
1984+
static constexpr T get(T i, T)
1985+
{
1986+
return i >= N ? 0 : i + N;
1987+
}
1988+
};
1989+
1990+
template <class Op, class A, class T>
1991+
inline T reduce(Op, batch<T, A> const& self, std::integral_constant<unsigned, 1>) noexcept
1992+
{
1993+
return self.get(0);
1994+
}
1995+
1996+
template <class Op, class A, class T, unsigned Lvl>
1997+
inline T reduce(Op op, batch<T, A> const& self, std::integral_constant<unsigned, Lvl>) noexcept
1998+
{
1999+
using index_type = as_unsigned_integer_t<T>;
2000+
batch<T, A> split = swizzle(self, make_batch_constant<batch<index_type, A>, SplitHigh<index_type, Lvl / 2>>());
2001+
return reduce(op, op(split, self), std::integral_constant<unsigned, Lvl / 2>());
2002+
}
2003+
}
2004+
2005+
// reduce_max
2006+
template <class A, class T>
2007+
inline T reduce_max(batch<T, A> const& self, requires_arch<generic>) noexcept
2008+
{
2009+
return detail::reduce([](batch<T, A> const& x, batch<T, A> const& y)
2010+
{ return max(x, y); },
2011+
self, std::integral_constant<unsigned, batch<T, A>::size>());
2012+
}
2013+
2014+
// reduce_min
2015+
template <class A, class T>
2016+
inline T reduce_min(batch<T, A> const& self, requires_arch<generic>) noexcept
2017+
{
2018+
return detail::reduce([](batch<T, A> const& x, batch<T, A> const& y)
2019+
{ return min(x, y); },
2020+
self, std::integral_constant<unsigned, batch<T, A>::size>());
2021+
}
19792022

19802023
// remainder
19812024
template <class A>

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1098,6 +1098,28 @@ namespace xsimd
10981098
return reduce_add(blow) + reduce_add(bhigh);
10991099
}
11001100

1101+
// reduce_max
1102+
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
1103+
inline T reduce_max(batch<T, A> const& self, requires_arch<avx>) noexcept
1104+
{
1105+
constexpr auto mask = detail::shuffle(1, 0);
1106+
batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
1107+
batch<T, A> acc = max(self, step);
1108+
__m128i low = _mm256_castsi256_si128(acc);
1109+
return reduce_max(batch<T, sse4_2>(low));
1110+
}
1111+
1112+
// reduce_min
1113+
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
1114+
inline T reduce_min(batch<T, A> const& self, requires_arch<avx>) noexcept
1115+
{
1116+
constexpr auto mask = detail::shuffle(1, 0);
1117+
batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
1118+
batch<T, A> acc = min(self, step);
1119+
__m128i low = _mm256_castsi256_si128(acc);
1120+
return reduce_min(batch<T, sse4_2>(low));
1121+
}
1122+
11011123
// rsqrt
11021124
template <class A>
11031125
inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
@@ -1499,12 +1521,13 @@ namespace xsimd
14991521
return bitwise_cast<batch<T, A>>(
15001522
swizzle(bitwise_cast<batch<float, A>>(self), mask));
15011523
}
1524+
15021525
template <class A,
15031526
typename T,
1504-
uint32_t V0,
1505-
uint32_t V1,
1506-
uint32_t V2,
1507-
uint32_t V3,
1527+
uint64_t V0,
1528+
uint64_t V1,
1529+
uint64_t V2,
1530+
uint64_t V3,
15081531
detail::enable_sized_integral_t<T, 8> = 0>
15091532
inline batch<T, A>
15101533
swizzle(batch<T, A> const& self,

include/xsimd/arch/xsimd_avx512bw.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -557,7 +557,7 @@ namespace xsimd
557557
template <class A, uint8_t... Vs>
558558
inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
559559
{
560-
return _mm512_permutexvar_epi8((batch<uint8_t, A>)mask, self);
560+
return _mm512_shuffle_epi8(self, (batch<uint8_t, A>)mask);
561561
}
562562

563563
template <class A, uint8_t... Vs>

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1299,6 +1299,27 @@ namespace xsimd
12991299
return reduce_add(blow, avx2 {}) + reduce_add(bhigh, avx2 {});
13001300
}
13011301

1302+
// reduce_max
1303+
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
1304+
inline T reduce_max(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1305+
{
1306+
constexpr batch_constant<batch<uint64_t, A>, 5, 6, 7, 8, 0, 0, 0, 0> mask;
1307+
batch<T, A> step = _mm512_permutexvar_epi64((batch<uint64_t, A>)mask, self);
1308+
batch<T, A> acc = max(self, step);
1309+
__m256i low = _mm512_castsi512_si256(acc);
1310+
return reduce_max(batch<T, avx2>(low));
1311+
}
1312+
1313+
// reduce_min
1314+
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
1315+
inline T reduce_min(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1316+
{
1317+
constexpr batch_constant<batch<uint64_t, A>, 5, 6, 7, 8, 0, 0, 0, 0> mask;
1318+
batch<T, A> step = _mm512_permutexvar_epi64((batch<uint64_t, A>)mask, self);
1319+
batch<T, A> acc = min(self, step);
1320+
__m256i low = _mm512_castsi512_si256(acc);
1321+
return reduce_min(batch<T, avx2>(low));
1322+
}
13021323

13031324
// rsqrt
13041325
template <class A>

include/xsimd/arch/xsimd_generic_fwd.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,12 @@
1212
#ifndef XSIMD_GENERIC_FWD_HPP
1313
#define XSIMD_GENERIC_FWD_HPP
1414

15+
#include "../types/xsimd_batch_constant.hpp"
16+
1517
#include <type_traits>
1618

1719
namespace xsimd
1820
{
19-
2021
namespace kernel
2122
{
2223
// forward declaration

include/xsimd/arch/xsimd_neon.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1577,7 +1577,6 @@ namespace xsimd
15771577
return vget_lane_f32(tmp, 0);
15781578
}
15791579

1580-
15811580
/**********
15821581
* select *
15831582
**********/

include/xsimd/arch/xsimd_neon64.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -753,7 +753,6 @@ namespace xsimd
753753
return vaddvq_f64(arg);
754754
}
755755

756-
757756
/**********
758757
* select *
759758
**********/

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 56 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,18 @@ namespace xsimd
3333
{
3434
using namespace types;
3535

36+
namespace detail
37+
{
38+
constexpr uint32_t shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z)
39+
{
40+
return (z << 6) | (y << 4) | (x << 2) | w;
41+
}
42+
constexpr uint32_t shuffle(uint32_t x, uint32_t y)
43+
{
44+
return (y << 1) | x;
45+
}
46+
}
47+
3648
// fwd
3749
template <class A, class T, size_t I>
3850
inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
@@ -1155,6 +1167,50 @@ namespace xsimd
11551167
__m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
11561168
return _mm_cvtss_f32(tmp1);
11571169
}
1170+
1171+
// reduce_max
1172+
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
1173+
inline T reduce_max(batch<T, A> const& self, requires_arch<sse2>) noexcept
1174+
{
1175+
constexpr auto mask0 = detail::shuffle(2, 3, 0, 0);
1176+
batch<T, A> step0 = _mm_shuffle_epi32(self, mask0);
1177+
batch<T, A> acc0 = max(self, step0);
1178+
1179+
constexpr auto mask1 = detail::shuffle(1, 0, 0, 0);
1180+
batch<T, A> step1 = _mm_shuffle_epi32(acc0, mask1);
1181+
batch<T, A> acc1 = max(acc0, step1);
1182+
1183+
constexpr auto mask2 = detail::shuffle(1, 0, 0, 0);
1184+
batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
1185+
batch<T, A> acc2 = max(acc1, step2);
1186+
if (sizeof(T) == 2)
1187+
return acc2.get(0);
1188+
batch<T, A> step3 = bitwise_cast<batch<T, A>>(bitwise_cast<batch<uint16_t, A>>(acc2) >> 8);
1189+
batch<T, A> acc3 = max(acc2, step3);
1190+
return acc3.get(0);
1191+
}
1192+
1193+
// reduce_min
1194+
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
1195+
inline T reduce_min(batch<T, A> const& self, requires_arch<sse2>) noexcept
1196+
{
1197+
constexpr auto mask0 = detail::shuffle(2, 3, 0, 0);
1198+
batch<T, A> step0 = _mm_shuffle_epi32(self, mask0);
1199+
batch<T, A> acc0 = min(self, step0);
1200+
1201+
constexpr auto mask1 = detail::shuffle(1, 0, 0, 0);
1202+
batch<T, A> step1 = _mm_shuffle_epi32(acc0, mask1);
1203+
batch<T, A> acc1 = min(acc0, step1);
1204+
1205+
constexpr auto mask2 = detail::shuffle(1, 0, 0, 0);
1206+
batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
1207+
batch<T, A> acc2 = min(acc1, step2);
1208+
if (sizeof(T) == 2)
1209+
return acc2.get(0);
1210+
batch<T, A> step3 = bitwise_cast<batch<T, A>>(bitwise_cast<batch<uint16_t, A>>(acc2) >> 8);
1211+
batch<T, A> acc3 = min(acc2, step3);
1212+
return acc3.get(0);
1213+
}
11581214
// TODO: move this in xsimd_generic
11591215
namespace detail
11601216
{
@@ -1207,7 +1263,6 @@ namespace xsimd
12071263
return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self)));
12081264
}
12091265

1210-
12111266
// rsqrt
12121267
template <class A>
12131268
inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
@@ -1541,18 +1596,6 @@ namespace xsimd
15411596

15421597
// swizzle
15431598

1544-
namespace detail
1545-
{
1546-
constexpr uint32_t shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z)
1547-
{
1548-
return (z << 6) | (y << 4) | (x << 2) | w;
1549-
}
1550-
constexpr uint32_t shuffle(uint32_t x, uint32_t y)
1551-
{
1552-
return (y << 1) | x;
1553-
}
1554-
}
1555-
15561599
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
15571600
inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
15581601
{

include/xsimd/arch/xsimd_sse3.hpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ namespace xsimd
4242
return _mm_lddqu_si128((__m128i const*)mem);
4343
}
4444

45-
4645
// reduce_add
4746
template <class A>
4847
inline float reduce_add(batch<float, A> const& self, requires_arch<sse3>) noexcept
@@ -58,7 +57,6 @@ namespace xsimd
5857
return _mm_cvtsd_f64(tmp0);
5958
}
6059

61-
6260
}
6361

6462
}

include/xsimd/arch/xsimd_ssse3.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ namespace xsimd
118118
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
119119
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
120120
{
121-
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, ssse3 {}));
121+
return bitwise_cast<batch<int16_t, A>>(swizzle(bitwise_cast<batch<uint16_t, A>>(self), mask, ssse3 {}));
122122
}
123123

124124
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
@@ -132,7 +132,7 @@ namespace xsimd
132132
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
133133
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
134134
{
135-
return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, ssse3 {}));
135+
return bitwise_cast<batch<int8_t, A>>(swizzle(bitwise_cast<batch<uint8_t, A>>(self), mask, ssse3 {}));
136136
}
137137

138138
}

0 commit comments

Comments
 (0)