@@ -8,6 +8,20 @@ use super::super::no_std_floats::NoStdFloatExt;
88use core:: arch:: wasm32 as wasm;
99#[ cfg( target_arch = "wasm64" ) ]
1010use core:: arch:: wasm64 as wasm;
11+ #[ cfg( all(
12+ feature = "simd-x86" ,
13+ target_arch = "x86_64" ,
14+ target_feature = "sse4.2" ,
15+ target_feature = "avx" ,
16+ target_feature = "avx2" ,
17+ target_feature = "bmi1" ,
18+ target_feature = "bmi2" ,
19+ target_feature = "fma" ,
20+ target_feature = "lzcnt" ,
21+ target_feature = "movbe" ,
22+ target_feature = "popcnt"
23+ ) ) ]
24+ use core:: arch:: x86_64 as x86;
1125
1226impl Value128 {
1327 #[ doc( alias = "v128.any_true" ) ]
@@ -132,20 +146,41 @@ impl Value128 {
132146
133147 #[ doc( alias = "i8x16.swizzle" ) ]
134148 pub fn i8x16_swizzle ( self , s : Self ) -> Self {
135- #[ cfg( any( target_arch = "wasm32" , target_arch = "wasm64" ) ) ]
136- return Self :: from_wasm_v128 ( wasm:: i8x16_swizzle ( self . to_wasm_v128 ( ) , s. to_wasm_v128 ( ) ) ) ;
137-
138- let a = self . to_le_bytes ( ) ;
139- let idx = s. to_le_bytes ( ) ;
140- let mut out = [ 0u8 ; 16 ] ;
141- let mut i = 0 ;
142- while i < 16 {
143- let j = idx[ i] ;
144- let lane = a[ ( j & 0x0f ) as usize ] ;
145- out[ i] = if j < 16 { lane } else { 0 } ;
146- i += 1 ;
149+ simd_impl ! {
150+ wasm => { Self :: from_wasm_v128( wasm:: i8x16_swizzle( self . to_wasm_v128( ) , s. to_wasm_v128( ) ) ) }
151+ x86 => {
152+ let a = self . to_le_bytes( ) ;
153+ let idx = s. to_le_bytes( ) ;
154+ let mut mask = [ 0u8 ; 16 ] ;
155+ for i in 0 ..16 {
156+ let j = idx[ i] ;
157+ mask[ i] = if j < 16 { j & 0x0f } else { 0x80 } ;
158+ }
159+
160+ // SAFETY: `a`, `mask`, and `out` are valid 16-byte buffers, and `_mm_loadu/_mm_storeu` support unaligned accesses.
161+ #[ allow( unsafe_code) ]
162+ let out = unsafe {
163+ let a_vec = x86:: _mm_loadu_si128( a. as_ptr( ) . cast:: <x86:: __m128i>( ) ) ;
164+ let mask_vec = x86:: _mm_loadu_si128( mask. as_ptr( ) . cast:: <x86:: __m128i>( ) ) ;
165+ let result = x86:: _mm_shuffle_epi8( a_vec, mask_vec) ;
166+ let mut out = [ 0u8 ; 16 ] ;
167+ x86:: _mm_storeu_si128( out. as_mut_ptr( ) . cast:: <x86:: __m128i>( ) , result) ;
168+ out
169+ } ;
170+ Self :: from_le_bytes( out)
171+ }
172+ generic => {
173+ let a = self . to_le_bytes( ) ;
174+ let idx = s. to_le_bytes( ) ;
175+ let mut out = [ 0u8 ; 16 ] ;
176+ for i in 0 ..16 {
177+ let j = idx[ i] ;
178+ let lane = a[ ( j & 0x0f ) as usize ] ;
179+ out[ i] = if j < 16 { lane } else { 0 } ;
180+ }
181+ Self :: from_le_bytes( out)
182+ }
147183 }
148- Self :: from_le_bytes ( out)
149184 }
150185
151186 #[ doc( alias = "i8x16.relaxed_swizzle" ) ]
@@ -155,14 +190,45 @@ impl Value128 {
155190
156191 #[ doc( alias = "i8x16.shuffle" ) ]
157192 pub fn i8x16_shuffle ( a : Self , b : Self , idx : [ u8 ; 16 ] ) -> Self {
158- let mut src = [ 0u8 ; 32 ] ;
159- src[ ..16 ] . copy_from_slice ( & a. to_le_bytes ( ) ) ;
160- src[ 16 ..] . copy_from_slice ( & b. to_le_bytes ( ) ) ;
161- let mut out = [ 0u8 ; 16 ] ;
162- for i in 0 ..16 {
163- out[ i] = src[ ( idx[ i] & 31 ) as usize ] ;
193+ simd_impl ! {
194+ x86 => {
195+ let a_bytes = a. to_le_bytes( ) ;
196+ let b_bytes = b. to_le_bytes( ) ;
197+ let mut mask_a = [ 0u8 ; 16 ] ;
198+ let mut mask_b = [ 0u8 ; 16 ] ;
199+ for i in 0 ..16 {
200+ let j = idx[ i] & 31 ;
201+ mask_a[ i] = if j < 16 { j } else { 0x80 } ;
202+ mask_b[ i] = if j < 16 { 0x80 } else { j & 0x0f } ;
203+ }
204+
205+ // SAFETY: all inputs are valid 16-byte buffers, and `_mm_loadu/_mm_storeu` support unaligned accesses.
206+ #[ allow( unsafe_code) ]
207+ let out = unsafe {
208+ let a_vec = x86:: _mm_loadu_si128( a_bytes. as_ptr( ) . cast:: <x86:: __m128i>( ) ) ;
209+ let b_vec = x86:: _mm_loadu_si128( b_bytes. as_ptr( ) . cast:: <x86:: __m128i>( ) ) ;
210+ let mask_a_vec = x86:: _mm_loadu_si128( mask_a. as_ptr( ) . cast:: <x86:: __m128i>( ) ) ;
211+ let mask_b_vec = x86:: _mm_loadu_si128( mask_b. as_ptr( ) . cast:: <x86:: __m128i>( ) ) ;
212+ let a_part = x86:: _mm_shuffle_epi8( a_vec, mask_a_vec) ;
213+ let b_part = x86:: _mm_shuffle_epi8( b_vec, mask_b_vec) ;
214+ let result = x86:: _mm_or_si128( a_part, b_part) ;
215+ let mut out = [ 0u8 ; 16 ] ;
216+ x86:: _mm_storeu_si128( out. as_mut_ptr( ) . cast:: <x86:: __m128i>( ) , result) ;
217+ out
218+ } ;
219+ Self :: from_le_bytes( out)
220+ }
221+ generic => {
222+ let a_bytes = a. to_le_bytes( ) ;
223+ let b_bytes = b. to_le_bytes( ) ;
224+ let mut out = [ 0u8 ; 16 ] ;
225+ for i in 0 ..16 {
226+ let j = idx[ i] & 31 ;
227+ out[ i] = if j < 16 { a_bytes[ j as usize ] } else { b_bytes[ ( j & 0x0f ) as usize ] } ;
228+ }
229+ Self :: from_le_bytes( out)
230+ }
164231 }
165- Self :: from_le_bytes ( out)
166232 }
167233
168234 #[ doc( alias = "i8x16.splat" ) ]
0 commit comments