@@ -1502,88 +1502,74 @@ namespace xsimd
15021502 }
15031503 return split;
15041504 }
1505- // Duplicate lanes separately
1506- // 1) duplicate low and high lanes
1507- __m256 low_dup = _mm256_permute2f128_ps (self, self, 0x00 ); // [low | low]
1508- __m256 hi_dup = _mm256_permute2f128_ps (self, self, 0x11 ); // [high| high]
15091505
1510- // 2) build lane-local index vector (each element = source_index & 3)
1511- constexpr batch_constant< uint32_t , A, ( V0 % 4 ), ( V1 % 4 ), ( V2 % 4 ), ( V3 % 4 ), ( V4 % 4 ), ( V5 % 4 ), ( V6 % 4 ), ( V7 % 4 )> half_mask;
1506+ // Fallback to general algorithm. This is the same as the dynamic version with the exception
1507+ // that possible operations are done at compile time.
15121508
1513- __m256 r0 = _mm256_permutevar_ps (low_dup, half_mask.as_batch ()); // pick from low lane
1514- __m256 r1 = _mm256_permutevar_ps (hi_dup, half_mask.as_batch ()); // pick from high lane
1509+ // swap lanes
1510+ __m256 swapped = _mm256_permute2f128_ps (self, self, 0x01 ); // [high | low]
1511+
1512+ // normalize mask taking modulo 4
1513+ constexpr auto half_mask = mask % make_batch_constant<uint32_t , 4 , A>();
15151514
1516- constexpr batch_bool_constant<uint32_t , A, (V0 >= 4 ), (V1 >= 4 ), (V2 >= 4 ), (V3 >= 4 ), (V4 >= 4 ), (V5 >= 4 ), (V6 >= 4 ), (V7 >= 4 )> lane_mask {};
1515+ // permute within each lane
1516+ __m256 r0 = _mm256_permutevar_ps (self, half_mask.as_batch ());
1517+ __m256 r1 = _mm256_permutevar_ps (swapped, half_mask.as_batch ());
15171518
1518- return _mm256_blend_ps (r0, r1, lane_mask.mask ());
1519+ constexpr auto lane = batch_constant<uint32_t , A, 0 , 0 , 0 , 0 , 4 , 4 , 4 , 4 > {};
1520+ batch_bool<uint32_t , A> blend_mask = (mask & 0b100u ) != lane;
1521+ return _mm256_blendv_ps (r0, r1, batch_bool_cast<float >(blend_mask));
1522+
1523+ // constexpr int lane_mask = (mask >= make_batch_constant<uint32_t, 4, A>()).mask();
1524+ //
1525+ // return _mm256_blend_ps(r0, r1, lane_mask);
15191526 }
15201527
15211528 template <class A , uint64_t V0 , uint64_t V1 , uint64_t V2 , uint64_t V3 >
15221529 XSIMD_INLINE batch<double , A> swizzle (batch<double , A> const & self, batch_constant<uint64_t , A, V0 , V1 , V2 , V3 > mask, requires_arch<avx>) noexcept
15231530 {
15241531 // cannot use detail::mod_shuffle as the mod and shift are different in this case
1525- constexpr auto imm = ((V0 & 1 ) << 0 ) | ((V1 & 1 ) << 1 ) | ((V2 & 1 ) << 2 ) | ((V3 & 1 ) << 3 );
1532+ constexpr auto imm = ((V0 % 2 ) << 0 ) | ((V1 % 2 ) << 1 ) | ((V2 % 2 ) << 2 ) | ((V3 % 2 ) << 3 );
15261533 XSIMD_IF_CONSTEXPR (detail::is_identity (mask)) { return self; }
15271534 XSIMD_IF_CONSTEXPR (!detail::is_cross_lane (mask))
15281535 {
15291536 return _mm256_permute_pd (self, imm);
15301537 }
1531- // duplicate low and high part of input
1532- __m256d lo = _mm256_permute2f128_pd (self, self, 0x00 );
1533- __m256d hi = _mm256_permute2f128_pd (self, self, 0x11 );
1538+
1539+ // Fallback to general algorithm. This is the same as the dynamic version with the exception
1540+ // that possible operations are done at compile time.
1541+
1542+ // swap lanes
1543+ __m256d swapped = _mm256_permute2f128_pd (self, self, 0x01 ); // [high | low]
15341544
15351545 // permute within each lane
1536- __m256d r0 = _mm256_permute_pd (lo , imm);
1537- __m256d r1 = _mm256_permute_pd (hi , imm);
1546+ __m256d r0 = _mm256_permute_pd (self , imm);
1547+ __m256d r1 = _mm256_permute_pd (swapped , imm);
15381548
1539- // mask to choose the right lane
1540- constexpr batch_bool_constant<uint64_t , A, (V0 >= 2 ), (V1 >= 2 ), (V2 >= 2 ), (V3 >= 2 )> blend_mask;
1549+ constexpr int lane_mask = (mask >= make_batch_constant<uint64_t , 2 , A>()).mask ();
15411550
15421551 // blend the two permutes
1543- return _mm256_blend_pd (r0, r1, blend_mask.mask ());
1544- }
1545- template <class A ,
1546- typename T,
1547- uint32_t V0 ,
1548- uint32_t V1 ,
1549- uint32_t V2 ,
1550- uint32_t V3 ,
1551- uint32_t V4 ,
1552- uint32_t V5 ,
1553- uint32_t V6 ,
1554- uint32_t V7 ,
1555- detail::enable_sized_integral_t <T, 4 > = 0 >
1556- XSIMD_INLINE batch<T, A> swizzle (batch<T, A> const & self,
1557- batch_constant<uint32_t , A,
1558- V0 ,
1559- V1 ,
1560- V2 ,
1561- V3 ,
1562- V4 ,
1563- V5 ,
1564- V6 ,
1565- V7 > const & mask,
1566- requires_arch<avx>) noexcept
1552+ return _mm256_blend_pd (r0, r1, lane_mask);
1553+ }
1554+
1555+ template <
1556+ class A , typename T,
1557+ uint32_t V0 , uint32_t V1 , uint32_t V2 , uint32_t V3 , uint32_t V4 , uint32_t V5 , uint32_t V6 , uint32_t V7 ,
1558+ detail::enable_sized_integral_t <T, 4 > = 0 >
1559+ XSIMD_INLINE batch<T, A> swizzle (
1560+ batch<T, A> const & self,
1561+ batch_constant<uint32_t , A, V0 , V1 , V2 , V3 , V4 , V5 , V6 , V7 > const & mask,
1562+ requires_arch<avx>) noexcept
15671563 {
1568- return bitwise_cast<T>(
1569- swizzle (bitwise_cast<float >(self), mask));
1564+ return bitwise_cast<T>(swizzle (bitwise_cast<float >(self), mask));
15701565 }
15711566
1572- template <class A ,
1573- typename T,
1574- uint64_t V0 ,
1575- uint64_t V1 ,
1576- uint64_t V2 ,
1577- uint64_t V3 ,
1578- detail::enable_sized_integral_t <T, 8 > = 0 >
1579- XSIMD_INLINE batch<T, A>
1580- swizzle (batch<T, A> const & self,
1581- batch_constant<uint64_t , A, V0 , V1 , V2 , V3 > const & mask,
1582- requires_arch<avx>) noexcept
1567+ template <class A , typename T, uint64_t V0 , uint64_t V1 , uint64_t V2 , uint64_t V3 , detail::enable_sized_integral_t <T, 8 > = 0 >
1568+ XSIMD_INLINE batch<T, A> swizzle (batch<T, A> const & self, batch_constant<uint64_t , A, V0 , V1 , V2 , V3 > const & mask, requires_arch<avx>) noexcept
15831569 {
1584- return bitwise_cast<T>(
1585- swizzle (bitwise_cast<double >(self), mask));
1570+ return bitwise_cast<T>(swizzle (bitwise_cast<double >(self), mask));
15861571 }
1572+
15871573 // transpose
15881574 template <class A >
15891575 XSIMD_INLINE void transpose (batch<float , A>* matrix_begin, batch<float , A>* matrix_end, requires_arch<avx>) noexcept
0 commit comments