Skip to content

Commit 7d30b9c

Browse files
Tentative support for avx512vl extensions to 128 bit registers
Very similar to the implementation for avx512vl_256, though with a few fewer instructions supported. As a side effect, fix an out-of-bounds (OOB) access in tobitset for avx512vl_256.
1 parent b707616 commit 7d30b9c

8 files changed

Lines changed: 713 additions & 13 deletions

File tree

.github/workflows/linux.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ jobs:
3232
- { compiler: 'clang', version: '18', flags: 'avx512' }
3333
- { compiler: 'clang', version: '18', flags: 'avx_128' }
3434
- { compiler: 'clang', version: '18', flags: 'avx2_128' }
35+
- { compiler: 'clang', version: '18', flags: 'avx512vl_128' }
3536
- { compiler: 'clang', version: '18', flags: 'avx512vl_256' }
3637
steps:
3738
- name: Setup compiler
@@ -97,6 +98,10 @@ jobs:
9798
if [[ '${{ matrix.sys.flags }}' == 'avx512' ]]; then
9899
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512"
99100
fi
101+
if [[ '${{ matrix.sys.flags }}' == 'avx512vl_128' ]]; then
102+
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512"
103+
CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128"
104+
fi
100105
if [[ '${{ matrix.sys.flags }}' == 'avx512vl_256' ]]; then
101106
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512"
102107
CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_256"

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1518,15 +1518,40 @@ namespace xsimd
15181518
{
15191519
// Adapted from https://github.com/serge-sans-paille/fast-bitset-from-bool-array
15201520
// Generate a bitset from an array of boolean.
1521-
XSIMD_INLINE unsigned char tobitset(unsigned char unpacked[8])
1521+
template <size_t N>
1522+
XSIMD_INLINE unsigned char tobitset(unsigned char unpacked[N])
15221523
{
1523-
uint64_t data;
1524-
memcpy(&data, unpacked, sizeof(uint64_t));
1524+
static_assert(N == 8 || N == 4 || N == 2, "valid pack size");
1525+
XSIMD_IF_CONSTEXPR(N == 8)
1526+
{
1527+
uint64_t data;
1528+
memcpy(&data, unpacked, sizeof(uint64_t));
1529+
1530+
const uint64_t magic = (0x80 + 0x4000 + 0x200000 + 0x10000000 + 0x0800000000 + 0x040000000000 + 0x02000000000000 + 0x0100000000000000);
1531+
1532+
unsigned char res = ((data * magic) >> 56) & 0xFF;
1533+
return res;
1534+
}
1535+
else XSIMD_IF_CONSTEXPR(N == 4)
1536+
{
1537+
uint32_t data;
1538+
memcpy(&data, unpacked, sizeof(uint32_t));
15251539

1526-
const uint64_t magic = (0x80 + 0x4000 + 0x200000 + 0x10000000 + 0x0800000000 + 0x040000000000 + 0x02000000000000 + 0x0100000000000000);
1540+
const uint32_t magic = (0x80 + 0x4000 + 0x200000 + 0x10000000);
15271541

1528-
unsigned char res = ((data * magic) >> 56) & 0xFF;
1529-
return res;
1542+
unsigned char res = ((data * magic) >> 24) & 0xFF;
1543+
return res;
1544+
}
1545+
else XSIMD_IF_CONSTEXPR(N == 2)
1546+
{
1547+
uint16_t data;
1548+
memcpy(&data, unpacked, sizeof(uint16_t));
1549+
1550+
const uint16_t magic = (0x80 + 0x4000);
1551+
1552+
unsigned char res = ((data * magic) >> 8) & 0xFF;
1553+
return res;
1554+
}
15301555
}
15311556
}
15321557

@@ -1541,7 +1566,7 @@ namespace xsimd
15411566
register_type mask = 0;
15421567
for (std::size_t i = 0; i < iter; ++i)
15431568
{
1544-
unsigned char block = detail::tobitset((unsigned char*)mem + i * 8);
1569+
unsigned char block = detail::tobitset<8>((unsigned char*)mem + i * 8);
15451570
mask |= (register_type(block) << (i * 8));
15461571
}
15471572
return mask;

include/xsimd/arch/xsimd_avx512vl_128.hpp

Lines changed: 647 additions & 0 deletions
Large diffs are not rendered by default.

include/xsimd/arch/xsimd_avx512vl_256.hpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -119,13 +119,14 @@ namespace xsimd
119119
{
120120
using register_type = typename batch_bool<T, A>::register_type;
121121
constexpr auto size = batch_bool<T, A>::size;
122-
constexpr auto iter = size / 4;
123-
static_assert((size % 4) == 0, "incorrect size of bool batch");
122+
constexpr auto chunk_size = size >= 8 ? 8 : 4;
123+
constexpr auto iter = size / chunk_size;
124+
static_assert((size % chunk_size) == 0, "incorrect size of bool batch");
124125
register_type mask = 0;
125126
for (std::size_t i = 0; i < iter; ++i)
126127
{
127-
unsigned char block = detail::tobitset((unsigned char*)mem + i * 4);
128-
mask |= (register_type(block) << (i * 4));
128+
unsigned char block = detail::tobitset<chunk_size>((unsigned char*)mem + i * chunk_size);
129+
mask |= (register_type(block) << (i * chunk_size));
129130
}
130131
return mask;
131132
}

include/xsimd/arch/xsimd_isa.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@
7575

7676
#if XSIMD_WITH_AVX512VL
7777
#include "./xsimd_avx512vl.hpp"
78+
#include "./xsimd_avx512vl_128.hpp"
7879
#include "./xsimd_avx512vl_256.hpp"
7980
#endif
8081

include/xsimd/config/xsimd_arch.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ namespace xsimd
163163

164164
using all_x86_architectures = arch_list<
165165
avx512vnni<avx512vbmi2>, avx512vbmi2, avx512vbmi, avx512ifma, avx512pf, avx512vnni<avx512bw>, avx512bw, avx512er, avx512dq, avx512vl, avx512cd, avx512f,
166-
avxvnni, avx512vl_256, fma3<avx2>, avx2, fma3<avx>, avx, avx2_128, avx_128, fma4, fma3<sse4_2>,
166+
avxvnni, avx512vl_256, fma3<avx2>, avx2, fma3<avx>, avx, avx512vl_128, avx2_128, avx_128, fma4, fma3<sse4_2>,
167167
sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>;
168168

169169
using all_sve_architectures = arch_list<detail::sve<512>, detail::sve<256>, detail::sve<128>>;

include/xsimd/config/xsimd_cpu_features_x86.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -895,7 +895,9 @@ namespace xsimd
895895

896896
inline bool avx512vl() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ebx::avx512vl>(); }
897897

898-
inline bool avx512vl_256() const noexcept { return avx512_enabled() && osxsave() && leaf7().all_bits_set<x86_cpuid_leaf7::ebx::avx512vl>(); }
898+
inline bool avx512vl_128() const noexcept { return avx512vl() && osxsave(); }
899+
900+
inline bool avx512vl_256() const noexcept { return avx512vl_128(); }
899901

900902
inline bool avx512vbmi() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ecx::avx512vbmi>(); }
901903

include/xsimd/types/xsimd_avx512vl_register.hpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,18 @@ namespace xsimd
2929
static constexpr char const* name() noexcept { return "avx512vl"; }
3030
};
3131

32+
/**
33+
* @ingroup architectures
34+
*
35+
* AVX512VL instructions extension for 128 bits registers
36+
*/
37+
struct avx512vl_128 : avx2_128
38+
{
39+
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VL; }
40+
static constexpr bool available() noexcept { return true; }
41+
static constexpr char const* name() noexcept { return "avx512vl/128"; }
42+
};
43+
3244
/**
3345
* @ingroup architectures
3446
*
@@ -57,6 +69,13 @@ namespace xsimd
5769

5870
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vl, avx512cd);
5971

72+
template <class T>
73+
struct get_bool_simd_register<T, avx512vl_128>
74+
{
75+
using type = simd_avx512_bool_register<T>;
76+
};
77+
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vl_128, avx2_128);
78+
6079
template <class T>
6180
struct get_bool_simd_register<T, avx512vl_256>
6281
{

0 commit comments

Comments
 (0)