@@ -33,6 +33,18 @@ namespace xsimd
3333 {
3434 using namespace types ;
3535
namespace detail
{
    // Packs four lane selectors into one shuffle immediate.
    //
    // The result is (z << 6) | (y << 4) | (x << 2) | w, i.e. w lands at
    // bit 0, x at bit 2, y at bit 4 and z at bit 6 -- the same layout the
    // _MM_SHUFFLE(z, y, x, w) macro produces. The arguments are not masked:
    // a value that needs more than two bits overlaps the next selector.
    //
    // constexpr so the result can be used where an intrinsic requires a
    // compile-time immediate; noexcept to match the noexcept kernels that
    // call it.
    constexpr uint32_t shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z) noexcept
    {
        return (z << 6) | (y << 4) | (x << 2) | w;
    }

    // Two-selector variant: returns (y << 1) | x, i.e. x at bit 0 and y at
    // bit 1. The arguments are not masked either.
    constexpr uint32_t shuffle(uint32_t x, uint32_t y) noexcept
    {
        return (y << 1) | x;
    }
}
47+
3648 // fwd
// Declaration only (note the trailing ';'): the overload of insert tagged
// requires_arch<generic>. Its definition is not part of this view.
// NOTE(review): presumably declared here so that kernels further down can
// call the generic version before its definition is seen -- confirm.
3749 template <class A , class T , size_t I>
3850 inline batch<T, A> insert (batch<T, A> const & self, T val, index<I>, requires_arch<generic>) noexcept ;
@@ -1155,6 +1167,50 @@ namespace xsimd
11551167 __m128 tmp1 = _mm_add_ss (tmp0, _mm_shuffle_ps (tmp0, tmp0, 1 ));
11561168 return _mm_cvtss_f32 (tmp1);
11571169 }
1170+
1171+ // reduce_max
1172+ template <class A , class T , class _ = typename std::enable_if<(sizeof (T) <= 2 ), void >::type>
1173+ inline T reduce_max (batch<T, A> const & self, requires_arch<sse2>) noexcept
1174+ {
1175+ constexpr auto mask0 = detail::shuffle (2 , 3 , 0 , 0 );
1176+ batch<T, A> step0 = _mm_shuffle_epi32 (self, mask0);
1177+ batch<T, A> acc0 = max (self, step0);
1178+
1179+ constexpr auto mask1 = detail::shuffle (1 , 0 , 0 , 0 );
1180+ batch<T, A> step1 = _mm_shuffle_epi32 (acc0, mask1);
1181+ batch<T, A> acc1 = max (acc0, step1);
1182+
1183+ constexpr auto mask2 = detail::shuffle (1 , 0 , 0 , 0 );
1184+ batch<T, A> step2 = _mm_shufflelo_epi16 (acc1, mask2);
1185+ batch<T, A> acc2 = max (acc1, step2);
1186+ if (sizeof (T) == 2 )
1187+ return acc2.get (0 );
1188+ batch<T, A> step3 = bitwise_cast<batch<T, A>>(bitwise_cast<batch<uint16_t , A>>(acc2) >> 8 );
1189+ batch<T, A> acc3 = max (acc2, step3);
1190+ return acc3.get (0 );
1191+ }
1192+
1193+ // reduce_min
1194+ template <class A , class T , class _ = typename std::enable_if<(sizeof (T) <= 2 ), void >::type>
1195+ inline T reduce_min (batch<T, A> const & self, requires_arch<sse2>) noexcept
1196+ {
1197+ constexpr auto mask0 = detail::shuffle (2 , 3 , 0 , 0 );
1198+ batch<T, A> step0 = _mm_shuffle_epi32 (self, mask0);
1199+ batch<T, A> acc0 = min (self, step0);
1200+
1201+ constexpr auto mask1 = detail::shuffle (1 , 0 , 0 , 0 );
1202+ batch<T, A> step1 = _mm_shuffle_epi32 (acc0, mask1);
1203+ batch<T, A> acc1 = min (acc0, step1);
1204+
1205+ constexpr auto mask2 = detail::shuffle (1 , 0 , 0 , 0 );
1206+ batch<T, A> step2 = _mm_shufflelo_epi16 (acc1, mask2);
1207+ batch<T, A> acc2 = min (acc1, step2);
1208+ if (sizeof (T) == 2 )
1209+ return acc2.get (0 );
1210+ batch<T, A> step3 = bitwise_cast<batch<T, A>>(bitwise_cast<batch<uint16_t , A>>(acc2) >> 8 );
1211+ batch<T, A> acc3 = min (acc2, step3);
1212+ return acc3.get (0 );
1213+ }
11581214 // TODO: move this in xsimd_generic
11591215 namespace detail
11601216 {
@@ -1207,7 +1263,6 @@ namespace xsimd
12071263 return _mm_cvtsd_f64 (_mm_add_sd (self, _mm_unpackhi_pd (self, self)));
12081264 }
12091265
1210-
12111266 // rsqrt
12121267 template <class A >
12131268 inline batch<float , A> rsqrt (batch<float , A> const & val, requires_arch<sse2>) noexcept
@@ -1541,18 +1596,6 @@ namespace xsimd
15411596
15421597 // swizzle
15431598
// Helpers that pack lane selectors into a single integer immediate.
1544- namespace detail
1545- {
// Four selectors: w at bit 0, x at bit 2, y at bit 4, z at bit 6.
// The arguments are not masked before being shifted.
1546- constexpr uint32_t shuffle (uint32_t w, uint32_t x, uint32_t y, uint32_t z)
1547- {
1548- return (z << 6 ) | (y << 4 ) | (x << 2 ) | w;
1549- }
// Two selectors: x at bit 0, y at bit 1.
1550- constexpr uint32_t shuffle (uint32_t x, uint32_t y)
1551- {
1552- return (y << 1 ) | x;
1553- }
1554- }
1555-
15561599 template <class A , uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
15571600 inline batch<float , A> swizzle (batch<float , A> const & self, batch_constant<batch<uint32_t , A>, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
15581601 {
0 commit comments