@@ -181,10 +181,12 @@ namespace xsimd
181181 static_assert (shift < bits, " Shift must be less than the number of bits in T" );
182182 XSIMD_IF_CONSTEXPR (sizeof (T) == 1 )
183183 {
184+ // 8-bit left shift via 16-bit shift + mask
185+ __m256i shifted = _mm256_slli_epi16 (self, shift);
184186 // TODO(C++17): without `if constexpr` we must ensure the compile-time shift does not overflow
185187 constexpr uint8_t mask8 = static_cast <uint8_t >(sizeof (T) == 1 ? (~0u << shift) : 0 );
186- __m256i const mask = _mm256_set1_epi8 (mask8);
187- return _mm256_and_si256 (_mm256_slli_epi16 (self, shift) , mask);
188+ const __m256i mask = _mm256_set1_epi8 (mask8);
189+ return _mm256_and_si256 (shifted , mask);
188190 }
189191 XSIMD_IF_CONSTEXPR (sizeof (T) == 2 )
190192 {
@@ -315,10 +317,12 @@ namespace xsimd
315317 {
316318 XSIMD_IF_CONSTEXPR (sizeof (T) == 1 )
317319 {
320+ // 8-bit right shift via 16-bit shift + mask
321+ const __m256i shifted = _mm256_srli_epi16 (self, shift);
318322 // TODO(C++17): without `if constexpr` we must ensure the compile-time shift does not overflow
319323 constexpr uint8_t mask8 = static_cast <uint8_t >(sizeof (T) == 1 ? ((1u << shift) - 1u ) : 0 );
320- __m256i const mask = _mm256_set1_epi8 (mask8);
321- return _mm256_and_si256 (_mm256_srli_epi16 (self, shift) , mask);
324+ const __m256i mask = _mm256_set1_epi8 (mask8);
325+ return _mm256_and_si256 (shifted , mask);
322326 }
323327 XSIMD_IF_CONSTEXPR (sizeof (T) == 2 )
324328 {
0 commit comments