Skip to content

Commit 0904da2

Browse files
committed
Swap instead of duplicate
1 parent 58e1b0c commit 0904da2

1 file changed

Lines changed: 43 additions & 57 deletions

File tree

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 43 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1502,88 +1502,74 @@ namespace xsimd
15021502
}
15031503
return split;
15041504
}
1505-
// Duplicate lanes separately
1506-
// 1) duplicate low and high lanes
1507-
__m256 low_dup = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
1508-
__m256 hi_dup = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]
15091505

1510-
// 2) build lane-local index vector (each element = source_index & 3)
1511-
constexpr batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
1506+
// Fall back to the general algorithm. This is the same as the dynamic version, except
1507+
// that whatever can be computed at compile time is computed at compile time.
15121508

1513-
__m256 r0 = _mm256_permutevar_ps(low_dup, half_mask.as_batch()); // pick from low lane
1514-
__m256 r1 = _mm256_permutevar_ps(hi_dup, half_mask.as_batch()); // pick from high lane
1509+
// swap lanes
1510+
__m256 swapped = _mm256_permute2f128_ps(self, self, 0x01); // [high | low]
1511+
1512+
// normalize the mask by taking each index modulo 4
1513+
constexpr auto half_mask = mask % make_batch_constant<uint32_t, 4, A>();
1514+
1515+
// permute within each lane
1516+
__m256 r0 = _mm256_permutevar_ps(self, half_mask.as_batch());
1517+
__m256 r1 = _mm256_permutevar_ps(swapped, half_mask.as_batch());
15151518

1516-
constexpr batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> lane_mask {};
1519+
// select lane by the mask index divided by 4
1520+
constexpr auto lane = batch_constant<uint32_t, A, 0, 0, 0, 0, 1, 1, 1, 1> {};
1521+
constexpr int lane_mask = ((mask / make_batch_constant<uint32_t, 4, A>()) != lane).mask();
15171522

1518-
return _mm256_blend_ps(r0, r1, lane_mask.mask());
1523+
return _mm256_blend_ps(r0, r1, lane_mask);
15191524
}
15201525

15211526
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
15221527
XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx>) noexcept
15231528
{
15241529
// cannot use detail::mod_shuffle as the mod and shift are different in this case
1525-
constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
1530+
constexpr auto imm = ((V0 % 2) << 0) | ((V1 % 2) << 1) | ((V2 % 2) << 2) | ((V3 % 2) << 3);
15261531
XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; }
15271532
XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
15281533
{
15291534
return _mm256_permute_pd(self, imm);
15301535
}
1531-
// duplicate low and high part of input
1532-
__m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
1533-
__m256d hi = _mm256_permute2f128_pd(self, self, 0x11);
1536+
1537+
// Fall back to the general algorithm. This is the same as the dynamic version, except
1538+
// that whatever can be computed at compile time is computed at compile time.
1539+
1540+
// swap lanes
1541+
__m256d swapped = _mm256_permute2f128_pd(self, self, 0x01); // [high | low]
15341542

15351543
// permute within each lane
1536-
__m256d r0 = _mm256_permute_pd(lo, imm);
1537-
__m256d r1 = _mm256_permute_pd(hi, imm);
1544+
__m256d r0 = _mm256_permute_pd(self, imm);
1545+
__m256d r1 = _mm256_permute_pd(swapped, imm);
15381546

1539-
// mask to choose the right lane
1540-
constexpr batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
1547+
// select lane by the mask index divided by 2
1548+
constexpr auto lane = batch_constant<uint64_t, A, 0, 0, 1, 1> {};
1549+
constexpr int lane_mask = ((mask / make_batch_constant<uint64_t, 2, A>()) != lane).mask();
15411550

15421551
// blend the two permutes
1543-
return _mm256_blend_pd(r0, r1, blend_mask.mask());
1544-
}
1545-
template <class A,
1546-
typename T,
1547-
uint32_t V0,
1548-
uint32_t V1,
1549-
uint32_t V2,
1550-
uint32_t V3,
1551-
uint32_t V4,
1552-
uint32_t V5,
1553-
uint32_t V6,
1554-
uint32_t V7,
1555-
detail::enable_sized_integral_t<T, 4> = 0>
1556-
XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self,
1557-
batch_constant<uint32_t, A,
1558-
V0,
1559-
V1,
1560-
V2,
1561-
V3,
1562-
V4,
1563-
V5,
1564-
V6,
1565-
V7> const& mask,
1566-
requires_arch<avx>) noexcept
1552+
return _mm256_blend_pd(r0, r1, lane_mask);
1553+
}
1554+
1555+
template <
1556+
class A, typename T,
1557+
uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7,
1558+
detail::enable_sized_integral_t<T, 4> = 0>
1559+
XSIMD_INLINE batch<T, A> swizzle(
1560+
batch<T, A> const& self,
1561+
batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> const& mask,
1562+
requires_arch<avx>) noexcept
15671563
{
1568-
return bitwise_cast<T>(
1569-
swizzle(bitwise_cast<float>(self), mask));
1564+
return bitwise_cast<T>(swizzle(bitwise_cast<float>(self), mask));
15701565
}
15711566

1572-
template <class A,
1573-
typename T,
1574-
uint64_t V0,
1575-
uint64_t V1,
1576-
uint64_t V2,
1577-
uint64_t V3,
1578-
detail::enable_sized_integral_t<T, 8> = 0>
1579-
XSIMD_INLINE batch<T, A>
1580-
swizzle(batch<T, A> const& self,
1581-
batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask,
1582-
requires_arch<avx>) noexcept
1567+
template <class A, typename T, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3, detail::enable_sized_integral_t<T, 8> = 0>
1568+
XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask, requires_arch<avx>) noexcept
15831569
{
1584-
return bitwise_cast<T>(
1585-
swizzle(bitwise_cast<double>(self), mask));
1570+
return bitwise_cast<T>(swizzle(bitwise_cast<double>(self), mask));
15861571
}
1572+
15871573
// transpose
15881574
template <class A>
15891575
XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<avx>) noexcept

0 commit comments

Comments
 (0)