Skip to content

Commit 012fb98

Browse files
committed
Fix sse2
1 parent 3425fd0 commit 012fb98

2 files changed

Lines changed: 17 additions & 9 deletions

File tree

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -181,10 +181,12 @@ namespace xsimd
181181
static_assert(shift < bits, "Shift must be less than the number of bits in T");
182182
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
183183
{
184+
// 8-bit left shift via 16-bit shift + mask
185+
__m256i shifted = _mm256_slli_epi16(self, shift);
184186
// TODO(C++17): without `if constexpr` we must ensure the compile-time shift does not overflow
185187
constexpr uint8_t mask8 = static_cast<uint8_t>(sizeof(T) == 1 ? (~0u << shift) : 0);
186-
__m256i const mask = _mm256_set1_epi8(mask8);
187-
return _mm256_and_si256(_mm256_slli_epi16(self, shift), mask);
188+
const __m256i mask = _mm256_set1_epi8(mask8);
189+
return _mm256_and_si256(shifted, mask);
188190
}
189191
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
190192
{
@@ -315,10 +317,12 @@ namespace xsimd
315317
{
316318
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
317319
{
320+
// 8-bit left shift via 16-bit shift + mask
321+
const __m256i shifted = _mm256_srli_epi16(self, shift);
318322
// TODO(C++17): without `if constexpr` we must ensure the compile-time shift does not overflow
319323
constexpr uint8_t mask8 = static_cast<uint8_t>(sizeof(T) == 1 ? ((1u << shift) - 1u) : 0);
320-
__m256i const mask = _mm256_set1_epi8(mask8);
321-
return _mm256_and_si256(_mm256_srli_epi16(self, shift), mask);
324+
const __m256i mask = _mm256_set1_epi8(mask8);
325+
return _mm256_and_si256(shifted, mask);
322326
}
323327
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
324328
{

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,9 @@ namespace xsimd
305305
{
306306
// 8-bit left shift via 16-bit shift + mask
307307
__m128i shifted = _mm_slli_epi16(self, static_cast<int>(shift));
308-
__m128i mask = _mm_set1_epi8(static_cast<char>(0xFF << shift));
308+
// TODO(C++17): without `if constexpr` we must ensure the compile-time shift does not overflow
309+
constexpr uint8_t mask8 = static_cast<uint8_t>(sizeof(T) == 1 ? (~0u << shift) : 0);
310+
const __m128i mask = _mm_set1_epi8(mask8);
309311
return _mm_and_si128(shifted, mask);
310312
}
311313
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
@@ -488,10 +490,12 @@ namespace xsimd
488490
{
489491
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
490492
{
491-
// Emulate byte-wise logical right shift using 16-bit shifts + per-byte mask.
492-
__m128i s16 = _mm_srli_epi16(self, static_cast<int>(shift));
493-
__m128i mask = _mm_set1_epi8(static_cast<char>(0xFFu >> shift));
494-
return _mm_and_si128(s16, mask);
493+
// 8-bit left shift via 16-bit shift + mask
494+
__m128i shifted = _mm_srli_epi16(self, static_cast<int>(shift));
495+
// TODO(C++17): without `if constexpr` we must ensure the compile-time shift does not overflow
496+
constexpr uint8_t mask8 = static_cast<uint8_t>(sizeof(T) == 1 ? ((1u << shift) - 1u) : 0);
497+
const __m128i mask = _mm_set1_epi8(mask8);
498+
return _mm_and_si128(shifted, mask);
495499
}
496500
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
497501
{

0 commit comments

Comments
 (0)