Skip to content

Commit 9dce801

Browse files
Rename hadd to reduce_add
1 parent 3dede97 commit 9dce801

File tree

15 files changed

+322
-314
lines changed

15 files changed

+322
-314
lines changed

docs/source/api/dispatching.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ architecture-agnostic description:
7878
const unsigned n = size / batch::size * batch::size;
7979
for(unsigned i = 0; i != n; i += batch::size)
8080
acc += batch::load_unaligned(data + i);
81-
T star_acc = xsimd::hadd(acc);
81+
T star_acc = xsimd::reduce_add(acc);
8282
for(unsigned i = n; i < size; ++i)
8383
star_acc += data[i];
8484
return star_acc;

include/xsimd/arch/generic/xsimd_generic_details.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,6 @@ namespace xsimd
4848
inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
4949
template <class T, class A>
5050
inline batch<T, A> frexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
51-
template <class T, class A>
52-
inline T hadd(batch<T, A> const&) noexcept;
5351
template <class T, class A, uint64_t... Coefs>
5452
inline batch<T, A> horner(const batch<T, A>& self) noexcept;
5553
template <class T, class A>
@@ -73,6 +71,8 @@ namespace xsimd
7371
template <class T, class A>
7472
inline batch<as_integer_t<T>, A> nearbyint_as_int(const batch<T, A>& x) noexcept;
7573
template <class T, class A>
74+
inline T reduce_add(batch<T, A> const&) noexcept;
75+
template <class T, class A>
7676
inline batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
7777
template <class T, class A>
7878
inline batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;

include/xsimd/arch/generic/xsimd_generic_math.hpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1020,13 +1020,6 @@ namespace xsimd
10201020
return batch<T, A>(self.data) & batch<T, A>(1);
10211021
}
10221022

1023-
// hadd
1024-
template <class A, class T>
1025-
inline std::complex<T> hadd(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
1026-
{
1027-
return { hadd(self.real()), hadd(self.imag()) };
1028-
}
1029-
10301023
// horner
10311024
template <class T, class A, uint64_t... Coefs>
10321025
inline batch<T, A> horner(const batch<T, A>& self) noexcept
@@ -1976,6 +1969,14 @@ namespace xsimd
19761969
return div(batch_type(1), self);
19771970
}
19781971

1972+
// reduce_add
1973+
template <class A, class T>
1974+
inline std::complex<T> reduce_add(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
1975+
{
1976+
return { reduce_add(self.real()), reduce_add(self.imag()) };
1977+
}
1978+
1979+
19791980
// remainder
19801981
template <class A>
19811982
inline batch<float, A> remainder(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 40 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -711,46 +711,6 @@ namespace xsimd
711711
}
712712
}
713713

714-
// hadd
715-
template <class A>
716-
inline float hadd(batch<float, A> const& rhs, requires_arch<avx>) noexcept
717-
{
718-
// Warning about _mm256_hadd_ps:
719-
// _mm256_hadd_ps(a,b) gives
720-
// (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7). Hence we can't
721-
// rely on a naive use of this method
722-
// rhs = (x0, x1, x2, x3, x4, x5, x6, x7)
723-
// tmp = (x4, x5, x6, x7, x0, x1, x2, x3)
724-
__m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1);
725-
// tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7)
726-
tmp = _mm256_add_ps(rhs, tmp);
727-
// tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -)
728-
tmp = _mm256_hadd_ps(tmp, tmp);
729-
// tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -)
730-
tmp = _mm256_hadd_ps(tmp, tmp);
731-
return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0));
732-
}
733-
template <class A>
734-
inline double hadd(batch<double, A> const& rhs, requires_arch<avx>) noexcept
735-
{
736-
// rhs = (x0, x1, x2, x3)
737-
// tmp = (x2, x3, x0, x1)
738-
__m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1);
739-
// tmp = (x2+x0, x3+x1, -, -)
740-
tmp = _mm256_add_pd(rhs, tmp);
741-
// tmp = (x2+x0+x3+x1, -, -, -)
742-
tmp = _mm256_hadd_pd(tmp, tmp);
743-
return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0));
744-
}
745-
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
746-
inline T hadd(batch<T, A> const& self, requires_arch<avx>) noexcept
747-
{
748-
__m128i low, high;
749-
detail::split_avx(self, low, high);
750-
batch<T, sse4_2> blow(low), bhigh(high);
751-
return hadd(blow) + hadd(bhigh);
752-
}
753-
754714
// haddp
755715
template <class A>
756716
inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx>) noexcept
@@ -1098,6 +1058,46 @@ namespace xsimd
10981058
return _mm256_rcp_ps(self);
10991059
}
11001060

1061+
// reduce_add
1062+
template <class A>
1063+
inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx>) noexcept
1064+
{
1065+
// Warning about _mm256_hadd_ps:
1066+
// _mm256_hadd_ps(a,b) gives
1067+
// (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7). Hence we can't
1068+
// rely on a naive use of this method
1069+
// rhs = (x0, x1, x2, x3, x4, x5, x6, x7)
1070+
// tmp = (x4, x5, x6, x7, x0, x1, x2, x3)
1071+
__m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1);
1072+
// tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7)
1073+
tmp = _mm256_add_ps(rhs, tmp);
1074+
// tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -)
1075+
tmp = _mm256_hadd_ps(tmp, tmp);
1076+
// tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -)
1077+
tmp = _mm256_hadd_ps(tmp, tmp);
1078+
return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0));
1079+
}
1080+
template <class A>
1081+
inline double reduce_add(batch<double, A> const& rhs, requires_arch<avx>) noexcept
1082+
{
1083+
// rhs = (x0, x1, x2, x3)
1084+
// tmp = (x2, x3, x0, x1)
1085+
__m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1);
1086+
// tmp = (x2+x0, x3+x1, -, -)
1087+
tmp = _mm256_add_pd(rhs, tmp);
1088+
// tmp = (x2+x0+x3+x1, -, -, -)
1089+
tmp = _mm256_hadd_pd(tmp, tmp);
1090+
return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0));
1091+
}
1092+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1093+
inline T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
1094+
{
1095+
__m128i low, high;
1096+
detail::split_avx(self, low, high);
1097+
batch<T, sse4_2> blow(low), bhigh(high);
1098+
return reduce_add(blow) + reduce_add(bhigh);
1099+
}
1100+
11011101
// rsqrt
11021102
template <class A>
11031103
inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 34 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -440,39 +440,6 @@ namespace xsimd
440440
}
441441
}
442442

443-
// hadd
444-
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
445-
inline T hadd(batch<T, A> const& self, requires_arch<avx2>) noexcept
446-
{
447-
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
448-
{
449-
__m256i tmp1 = _mm256_hadd_epi32(self, self);
450-
__m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1);
451-
__m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
452-
__m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3);
453-
return _mm_cvtsi128_si32(tmp4);
454-
}
455-
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
456-
{
457-
__m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E);
458-
__m256i tmp2 = _mm256_add_epi64(self, tmp1);
459-
__m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
460-
__m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3);
461-
#if defined(__x86_64__)
462-
return _mm_cvtsi128_si64(res);
463-
#else
464-
__m128i m;
465-
_mm_storel_epi64(&m, res);
466-
int64_t i;
467-
std::memcpy(&i, &m, sizeof(i));
468-
return i;
469-
#endif
470-
}
471-
else
472-
{
473-
return hadd(self, avx {});
474-
}
475-
}
476443
// load_complex
477444
template <class A>
478445
inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx2>) noexcept
@@ -621,6 +588,40 @@ namespace xsimd
621588
}
622589
}
623590

591+
// reduce_add
592+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
593+
inline T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
594+
{
595+
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
596+
{
597+
__m256i tmp1 = _mm256_hadd_epi32(self, self);
598+
__m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1);
599+
__m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
600+
__m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3);
601+
return _mm_cvtsi128_si32(tmp4);
602+
}
603+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
604+
{
605+
__m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E);
606+
__m256i tmp2 = _mm256_add_epi64(self, tmp1);
607+
__m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
608+
__m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3);
609+
#if defined(__x86_64__)
610+
return _mm_cvtsi128_si64(res);
611+
#else
612+
__m128i m;
613+
_mm_storel_epi64(&m, res);
614+
int64_t i;
615+
std::memcpy(&i, &m, sizeof(i));
616+
return i;
617+
#endif
618+
}
619+
else
620+
{
621+
return reduce_add(self, avx {});
622+
}
623+
}
624+
624625
// sadd
625626
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
626627
inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -868,32 +868,6 @@ namespace xsimd
868868
return detail::compare_int_avx512f<A, T, _MM_CMPINT_GT>(self, other);
869869
}
870870

871-
// hadd
872-
template <class A>
873-
inline float hadd(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
874-
{
875-
__m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
876-
__m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
877-
__m256 res1 = _mm256_add_ps(tmp1, tmp2);
878-
return hadd(batch<float, avx2>(res1), avx2 {});
879-
}
880-
template <class A>
881-
inline double hadd(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
882-
{
883-
__m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1);
884-
__m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0);
885-
__m256d res1 = _mm256_add_pd(tmp1, tmp2);
886-
return hadd(batch<double, avx2>(res1), avx2 {});
887-
}
888-
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
889-
inline T hadd(batch<T, A> const& self, requires_arch<avx512f>) noexcept
890-
{
891-
__m256i low, high;
892-
detail::split_avx512(self, low, high);
893-
batch<T, avx2> blow(low), bhigh(high);
894-
return hadd(blow, avx2 {}) + hadd(bhigh, avx2 {});
895-
}
896-
897871
// haddp
898872
template <class A>
899873
inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512f>) noexcept
@@ -1299,6 +1273,33 @@ namespace xsimd
12991273
return _mm512_rcp14_pd(self);
13001274
}
13011275

1276+
// reduce_add
1277+
template <class A>
1278+
inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
1279+
{
1280+
__m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
1281+
__m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
1282+
__m256 res1 = _mm256_add_ps(tmp1, tmp2);
1283+
return reduce_add(batch<float, avx2>(res1), avx2 {});
1284+
}
1285+
template <class A>
1286+
inline double reduce_add(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
1287+
{
1288+
__m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1);
1289+
__m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0);
1290+
__m256d res1 = _mm256_add_pd(tmp1, tmp2);
1291+
return reduce_add(batch<double, avx2>(res1), avx2 {});
1292+
}
1293+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1294+
inline T reduce_add(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1295+
{
1296+
__m256i low, high;
1297+
detail::split_avx512(self, low, high);
1298+
batch<T, avx2> blow(low), bhigh(high);
1299+
return reduce_add(blow, avx2 {}) + reduce_add(bhigh, avx2 {});
1300+
}
1301+
1302+
13021303
// rsqrt
13031304
template <class A>
13041305
inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept

0 commit comments

Comments (0)