Skip to content

Commit 9dce801

Browse files
Rename hadd to reduce_add
1 parent 3dede97 commit 9dce801

File tree

15 files changed

+322
-314
lines changed

15 files changed

+322
-314
lines changed

docs/source/api/dispatching.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ architecture-agnostic description:
7878
const unsigned n = size / batch::size * batch::size;
7979
for(unsigned i = 0; i != n; i += batch::size)
8080
acc += batch::load_unaligned(data + i);
81-
T star_acc = xsimd::hadd(acc);
81+
T star_acc = xsimd::reduce_add(acc);
8282
for(unsigned i = n; i < size; ++i)
8383
star_acc += data[i];
8484
return star_acc;

include/xsimd/arch/generic/xsimd_generic_details.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,6 @@ namespace xsimd
4848
inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
4949
template <class T, class A>
5050
inline batch<T, A> frexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
51-
template <class T, class A>
52-
inline T hadd(batch<T, A> const&) noexcept;
5351
template <class T, class A, uint64_t... Coefs>
5452
inline batch<T, A> horner(const batch<T, A>& self) noexcept;
5553
template <class T, class A>
@@ -73,6 +71,8 @@ namespace xsimd
7371
template <class T, class A>
7472
inline batch<as_integer_t<T>, A> nearbyint_as_int(const batch<T, A>& x) noexcept;
7573
template <class T, class A>
74+
inline T reduce_add(batch<T, A> const&) noexcept;
75+
template <class T, class A>
7676
inline batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
7777
template <class T, class A>
7878
inline batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;

include/xsimd/arch/generic/xsimd_generic_math.hpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1020,13 +1020,6 @@ namespace xsimd
10201020
return batch<T, A>(self.data) & batch<T, A>(1);
10211021
}
10221022

1023-
// hadd
1024-
template <class A, class T>
1025-
inline std::complex<T> hadd(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
1026-
{
1027-
return { hadd(self.real()), hadd(self.imag()) };
1028-
}
1029-
10301023
// horner
10311024
template <class T, class A, uint64_t... Coefs>
10321025
inline batch<T, A> horner(const batch<T, A>& self) noexcept
@@ -1976,6 +1969,14 @@ namespace xsimd
19761969
return div(batch_type(1), self);
19771970
}
19781971

1972+
// reduce_add
1973+
template <class A, class T>
1974+
inline std::complex<T> reduce_add(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
1975+
{
1976+
return { reduce_add(self.real()), reduce_add(self.imag()) };
1977+
}
1978+
1979+
19791980
// remainder
19801981
template <class A>
19811982
inline batch<float, A> remainder(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 40 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -711,46 +711,6 @@ namespace xsimd
711711
}
712712
}
713713

714-
// hadd
715-
template <class A>
716-
inline float hadd(batch<float, A> const& rhs, requires_arch<avx>) noexcept
717-
{
718-
// Warning about _mm256_hadd_ps:
719-
// _mm256_hadd_ps(a,b) gives
720-
// (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7). Hence we can't
721-
// rely on a naive use of this method
722-
// rhs = (x0, x1, x2, x3, x4, x5, x6, x7)
723-
// tmp = (x4, x5, x6, x7, x0, x1, x2, x3)
724-
__m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1);
725-
// tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7)
726-
tmp = _mm256_add_ps(rhs, tmp);
727-
// tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -)
728-
tmp = _mm256_hadd_ps(tmp, tmp);
729-
// tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -)
730-
tmp = _mm256_hadd_ps(tmp, tmp);
731-
return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0));
732-
}
733-
template <class A>
734-
inline double hadd(batch<double, A> const& rhs, requires_arch<avx>) noexcept
735-
{
736-
// rhs = (x0, x1, x2, x3)
737-
// tmp = (x2, x3, x0, x1)
738-
__m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1);
739-
// tmp = (x2+x0, x3+x1, -, -)
740-
tmp = _mm256_add_pd(rhs, tmp);
741-
// tmp = (x2+x0+x3+x1, -, -, -)
742-
tmp = _mm256_hadd_pd(tmp, tmp);
743-
return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0));
744-
}
745-
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
746-
inline T hadd(batch<T, A> const& self, requires_arch<avx>) noexcept
747-
{
748-
__m128i low, high;
749-
detail::split_avx(self, low, high);
750-
batch<T, sse4_2> blow(low), bhigh(high);
751-
return hadd(blow) + hadd(bhigh);
752-
}
753-
754714
// haddp
755715
template <class A>
756716
inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx>) noexcept
@@ -1098,6 +1058,46 @@ namespace xsimd
10981058
return _mm256_rcp_ps(self);
10991059
}
11001060

1061+
// reduce_add
1062+
template <class A>
1063+
inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx>) noexcept
1064+
{
1065+
// Warning about _mm256_hadd_ps:
1066+
// _mm256_hadd_ps(a,b) gives
1067+
// (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7). Hence we can't
1068+
// rely on a naive use of this method
1069+
// rhs = (x0, x1, x2, x3, x4, x5, x6, x7)
1070+
// tmp = (x4, x5, x6, x7, x0, x1, x2, x3)
1071+
__m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1);
1072+
// tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7)
1073+
tmp = _mm256_add_ps(rhs, tmp);
1074+
// tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -)
1075+
tmp = _mm256_hadd_ps(tmp, tmp);
1076+
// tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -)
1077+
tmp = _mm256_hadd_ps(tmp, tmp);
1078+
return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0));
1079+
}
1080+
template <class A>
1081+
inline double reduce_add(batch<double, A> const& rhs, requires_arch<avx>) noexcept
1082+
{
1083+
// rhs = (x0, x1, x2, x3)
1084+
// tmp = (x2, x3, x0, x1)
1085+
__m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1);
1086+
// tmp = (x2+x0, x3+x1, -, -)
1087+
tmp = _mm256_add_pd(rhs, tmp);
1088+
// tmp = (x2+x0+x3+x1, -, -, -)
1089+
tmp = _mm256_hadd_pd(tmp, tmp);
1090+
return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0));
1091+
}
1092+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1093+
inline T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
1094+
{
1095+
__m128i low, high;
1096+
detail::split_avx(self, low, high);
1097+
batch<T, sse4_2> blow(low), bhigh(high);
1098+
return reduce_add(blow) + reduce_add(bhigh);
1099+
}
1100+
11011101
// rsqrt
11021102
template <class A>
11031103
inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 34 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -440,39 +440,6 @@ namespace xsimd
440440
}
441441
}
442442

443-
// hadd
444-
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
445-
inline T hadd(batch<T, A> const& self, requires_arch<avx2>) noexcept
446-
{
447-
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
448-
{
449-
__m256i tmp1 = _mm256_hadd_epi32(self, self);
450-
__m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1);
451-
__m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
452-
__m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3);
453-
return _mm_cvtsi128_si32(tmp4);
454-
}
455-
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
456-
{
457-
__m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E);
458-
__m256i tmp2 = _mm256_add_epi64(self, tmp1);
459-
__m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
460-
__m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3);
461-
#if defined(__x86_64__)
462-
return _mm_cvtsi128_si64(res);
463-
#else
464-
__m128i m;
465-
_mm_storel_epi64(&m, res);
466-
int64_t i;
467-
std::memcpy(&i, &m, sizeof(i));
468-
return i;
469-
#endif
470-
}
471-
else
472-
{
473-
return hadd(self, avx {});
474-
}
475-
}
476443
// load_complex
477444
template <class A>
478445
inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx2>) noexcept
@@ -621,6 +588,40 @@ namespace xsimd
621588
}
622589
}
623590

591+
// reduce_add
592+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
593+
inline T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
594+
{
595+
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
596+
{
597+
__m256i tmp1 = _mm256_hadd_epi32(self, self);
598+
__m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1);
599+
__m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
600+
__m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3);
601+
return _mm_cvtsi128_si32(tmp4);
602+
}
603+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
604+
{
605+
__m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E);
606+
__m256i tmp2 = _mm256_add_epi64(self, tmp1);
607+
__m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
608+
__m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3);
609+
#if defined(__x86_64__)
610+
return _mm_cvtsi128_si64(res);
611+
#else
612+
__m128i m;
613+
_mm_storel_epi64(&m, res);
614+
int64_t i;
615+
std::memcpy(&i, &m, sizeof(i));
616+
return i;
617+
#endif
618+
}
619+
else
620+
{
621+
return reduce_add(self, avx {});
622+
}
623+
}
624+
624625
// sadd
625626
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
626627
inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -868,32 +868,6 @@ namespace xsimd
868868
return detail::compare_int_avx512f<A, T, _MM_CMPINT_GT>(self, other);
869869
}
870870

871-
// hadd
872-
template <class A>
873-
inline float hadd(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
874-
{
875-
__m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
876-
__m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
877-
__m256 res1 = _mm256_add_ps(tmp1, tmp2);
878-
return hadd(batch<float, avx2>(res1), avx2 {});
879-
}
880-
template <class A>
881-
inline double hadd(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
882-
{
883-
__m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1);
884-
__m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0);
885-
__m256d res1 = _mm256_add_pd(tmp1, tmp2);
886-
return hadd(batch<double, avx2>(res1), avx2 {});
887-
}
888-
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
889-
inline T hadd(batch<T, A> const& self, requires_arch<avx512f>) noexcept
890-
{
891-
__m256i low, high;
892-
detail::split_avx512(self, low, high);
893-
batch<T, avx2> blow(low), bhigh(high);
894-
return hadd(blow, avx2 {}) + hadd(bhigh, avx2 {});
895-
}
896-
897871
// haddp
898872
template <class A>
899873
inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512f>) noexcept
@@ -1299,6 +1273,33 @@ namespace xsimd
12991273
return _mm512_rcp14_pd(self);
13001274
}
13011275

1276+
// reduce_add
1277+
template <class A>
1278+
inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
1279+
{
1280+
__m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
1281+
__m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
1282+
__m256 res1 = _mm256_add_ps(tmp1, tmp2);
1283+
return reduce_add(batch<float, avx2>(res1), avx2 {});
1284+
}
1285+
template <class A>
1286+
inline double reduce_add(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
1287+
{
1288+
__m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1);
1289+
__m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0);
1290+
__m256d res1 = _mm256_add_pd(tmp1, tmp2);
1291+
return reduce_add(batch<double, avx2>(res1), avx2 {});
1292+
}
1293+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1294+
inline T reduce_add(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1295+
{
1296+
__m256i low, high;
1297+
detail::split_avx512(self, low, high);
1298+
batch<T, avx2> blow(low), bhigh(high);
1299+
return reduce_add(blow, avx2 {}) + reduce_add(bhigh, avx2 {});
1300+
}
1301+
1302+
13021303
// rsqrt
13031304
template <class A>
13041305
inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept

0 commit comments

Comments (0)