@@ -1502,88 +1502,74 @@ namespace xsimd
15021502 }
15031503 return split;
15041504 }
1505- // Duplicate lanes separately
1506- // 1) duplicate low and high lanes
1507- __m256 low_dup = _mm256_permute2f128_ps (self, self, 0x00 ); // [low | low]
1508- __m256 hi_dup = _mm256_permute2f128_ps (self, self, 0x11 ); // [high| high]
15091505
1510- // 2) build lane-local index vector (each element = source_index & 3)
1511- constexpr batch_constant< uint32_t , A, ( V0 % 4 ), ( V1 % 4 ), ( V2 % 4 ), ( V3 % 4 ), ( V4 % 4 ), ( V5 % 4 ), ( V6 % 4 ), ( V7 % 4 )> half_mask;
1506+ // Fall back to the general algorithm. This is the same as the dynamic version, except
1507+ // that operations are done at compile time where possible.
15121508
1513- __m256 r0 = _mm256_permutevar_ps (low_dup, half_mask.as_batch ()); // pick from low lane
1514- __m256 r1 = _mm256_permutevar_ps (hi_dup, half_mask.as_batch ()); // pick from high lane
1509+ // swap lanes
1510+ __m256 swapped = _mm256_permute2f128_ps (self, self, 0x01 ); // [high | low]
1511+
1512+ // normalize the mask by taking each index modulo 4 (index within a 128-bit lane)
1513+ constexpr auto half_mask = mask % make_batch_constant<uint32_t , 4 , A>();
1514+
1515+ // permute within each lane
1516+ __m256 r0 = _mm256_permutevar_ps (self, half_mask.as_batch ());
1517+ __m256 r1 = _mm256_permutevar_ps (swapped, half_mask.as_batch ());
15151518
1516- constexpr batch_bool_constant<uint32_t , A, (V0 >= 4 ), (V1 >= 4 ), (V2 >= 4 ), (V3 >= 4 ), (V4 >= 4 ), (V5 >= 4 ), (V6 >= 4 ), (V7 >= 4 )> lane_mask {};
1519+ // select the lane: take r1 (swapped lanes) where index / 4 differs from the destination lane
1520+ constexpr auto lane = batch_constant<uint32_t , A, 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 > {};
1521+ constexpr int lane_mask = ((mask / make_batch_constant<uint32_t , 4 , A>()) != lane).mask ();
15171522
1518- return _mm256_blend_ps (r0, r1, lane_mask. mask () );
1523+ return _mm256_blend_ps (r0, r1, lane_mask);
15191524 }
15201525
15211526 template <class A , uint64_t V0 , uint64_t V1 , uint64_t V2 , uint64_t V3 >
15221527 XSIMD_INLINE batch<double , A> swizzle (batch<double , A> const & self, batch_constant<uint64_t , A, V0 , V1 , V2 , V3 > mask, requires_arch<avx>) noexcept
15231528 {
15241529 // cannot use detail::mod_shuffle as the mod and shift are different in this case
1525- constexpr auto imm = ((V0 & 1 ) << 0 ) | ((V1 & 1 ) << 1 ) | ((V2 & 1 ) << 2 ) | ((V3 & 1 ) << 3 );
1530+ constexpr auto imm = ((V0 % 2 ) << 0 ) | ((V1 % 2 ) << 1 ) | ((V2 % 2 ) << 2 ) | ((V3 % 2 ) << 3 );
15261531 XSIMD_IF_CONSTEXPR (detail::is_identity (mask)) { return self; }
15271532 XSIMD_IF_CONSTEXPR (!detail::is_cross_lane (mask))
15281533 {
15291534 return _mm256_permute_pd (self, imm);
15301535 }
1531- // duplicate low and high part of input
1532- __m256d lo = _mm256_permute2f128_pd (self, self, 0x00 );
1533- __m256d hi = _mm256_permute2f128_pd (self, self, 0x11 );
1536+
1537+ // Fall back to the general algorithm. This is the same as the dynamic version, except
1538+ // that operations are done at compile time where possible.
1539+
1540+ // swap lanes
1541+ __m256d swapped = _mm256_permute2f128_pd (self, self, 0x01 ); // [high | low]
15341542
15351543 // permute within each lane
1536- __m256d r0 = _mm256_permute_pd (lo , imm);
1537- __m256d r1 = _mm256_permute_pd (hi , imm);
1544+ __m256d r0 = _mm256_permute_pd (self , imm);
1545+ __m256d r1 = _mm256_permute_pd (swapped , imm);
15381546
1539- // mask to choose the right lane
1540- constexpr batch_bool_constant<uint64_t , A, (V0 >= 2 ), (V1 >= 2 ), (V2 >= 2 ), (V3 >= 2 )> blend_mask;
1547+ // select the lane: take r1 (swapped lanes) where index / 2 differs from the destination lane
1548+ constexpr auto lane = batch_constant<uint64_t , A, 0 , 0 , 1 , 1 > {};
1549+ constexpr int lane_mask = ((mask / make_batch_constant<uint64_t , 2 , A>()) != lane).mask ();
15411550
15421551 // blend the two permutes
1543- return _mm256_blend_pd (r0, r1, blend_mask.mask ());
1544- }
1545- template <class A ,
1546- typename T,
1547- uint32_t V0 ,
1548- uint32_t V1 ,
1549- uint32_t V2 ,
1550- uint32_t V3 ,
1551- uint32_t V4 ,
1552- uint32_t V5 ,
1553- uint32_t V6 ,
1554- uint32_t V7 ,
1555- detail::enable_sized_integral_t <T, 4 > = 0 >
1556- XSIMD_INLINE batch<T, A> swizzle (batch<T, A> const & self,
1557- batch_constant<uint32_t , A,
1558- V0 ,
1559- V1 ,
1560- V2 ,
1561- V3 ,
1562- V4 ,
1563- V5 ,
1564- V6 ,
1565- V7 > const & mask,
1566- requires_arch<avx>) noexcept
1552+ return _mm256_blend_pd (r0, r1, lane_mask);
1553+ }
1554+
1555+ template <
1556+ class A , typename T,
1557+ uint32_t V0 , uint32_t V1 , uint32_t V2 , uint32_t V3 , uint32_t V4 , uint32_t V5 , uint32_t V6 , uint32_t V7 ,
1558+ detail::enable_sized_integral_t <T, 4 > = 0 >
1559+ XSIMD_INLINE batch<T, A> swizzle (
1560+ batch<T, A> const & self,
1561+ batch_constant<uint32_t , A, V0 , V1 , V2 , V3 , V4 , V5 , V6 , V7 > const & mask,
1562+ requires_arch<avx>) noexcept
15671563 {
1568- return bitwise_cast<T>(
1569- swizzle (bitwise_cast<float >(self), mask));
1564+ return bitwise_cast<T>(swizzle (bitwise_cast<float >(self), mask));
15701565 }
15711566
1572- template <class A ,
1573- typename T,
1574- uint64_t V0 ,
1575- uint64_t V1 ,
1576- uint64_t V2 ,
1577- uint64_t V3 ,
1578- detail::enable_sized_integral_t <T, 8 > = 0 >
1579- XSIMD_INLINE batch<T, A>
1580- swizzle (batch<T, A> const & self,
1581- batch_constant<uint64_t , A, V0 , V1 , V2 , V3 > const & mask,
1582- requires_arch<avx>) noexcept
1567+ template <class A , typename T, uint64_t V0 , uint64_t V1 , uint64_t V2 , uint64_t V3 , detail::enable_sized_integral_t <T, 8 > = 0 >
1568+ XSIMD_INLINE batch<T, A> swizzle (batch<T, A> const & self, batch_constant<uint64_t , A, V0 , V1 , V2 , V3 > const & mask, requires_arch<avx>) noexcept
15831569 {
1584- return bitwise_cast<T>(
1585- swizzle (bitwise_cast<double >(self), mask));
1570+ return bitwise_cast<T>(swizzle (bitwise_cast<double >(self), mask));
15861571 }
1572+
15871573 // transpose
15881574 template <class A >
15891575 XSIMD_INLINE void transpose (batch<float , A>* matrix_begin, batch<float , A>* matrix_end, requires_arch<avx>) noexcept
0 commit comments