Skip to content

Commit 097dd12

Browse files
committed
sse{,2,3,4.1},avx: more WASM shuffle implementations
1 parent 26d6343 commit 097dd12

5 files changed

Lines changed: 174 additions & 51 deletions

File tree

simde/x86/avx.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4637,6 +4637,8 @@ simde_mm_permute_ps (simde__m128 a, const int imm8)
46374637
}
46384638
#if defined(SIMDE_X86_AVX_NATIVE)
46394639
# define simde_mm_permute_ps(a, imm8) _mm_permute_ps(a, imm8)
4640+
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
4641+
/* WebAssembly SIMD128 variant: one wasm_i32x4_shuffle with `a` passed as both
 * inputs. The four 2-bit fields of imm8 (bits 1:0, 3:2, 5:4, 7:6) are the lane
 * indices used for result lanes 0..3, in that order. Note that `a` appears
 * twice in the expansion, so an argument expression is evaluated twice. */
# define simde_mm_permute_ps(a, imm8) simde__m128_from_wasm_v128(wasm_i32x4_shuffle(simde__m128_to_wasm_v128(a), simde__m128_to_wasm_v128(a), ((imm8) & 3), (((imm8) >> 2) & 3 ), (((imm8) >> 4) & 3), (((imm8) >> 6) & 3)))
46404642
#endif
46414643
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
46424644
#undef _mm_permute_ps
@@ -4661,6 +4663,8 @@ simde_mm_permute_pd (simde__m128d a, const int imm8)
46614663
}
46624664
#if defined(SIMDE_X86_AVX_NATIVE)
46634665
# define simde_mm_permute_pd(a, imm8) _mm_permute_pd(a, imm8)
4666+
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
4667+
/* WebAssembly SIMD128 variant: one wasm_i64x2_shuffle with `a` passed as both
 * inputs. Bit 0 of imm8 is the lane index for result lane 0 and bit 1 is the
 * lane index for result lane 1. Note that `a` appears twice in the expansion,
 * so an argument expression is evaluated twice. */
# define simde_mm_permute_pd(a, imm8) simde__m128d_from_wasm_v128(wasm_i64x2_shuffle(simde__m128d_to_wasm_v128(a), simde__m128d_to_wasm_v128(a), ((imm8) & 1), (((imm8) >> 1) & 1 )))
46644668
#endif
46654669
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
46664670
#undef _mm_permute_pd

simde/x86/sse.h

Lines changed: 33 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -761,9 +761,7 @@ simde_mm_move_ss (simde__m128 a, simde__m128 b) {
761761
a_ = simde__m128_to_private(a),
762762
b_ = simde__m128_to_private(b);
763763

764-
#if defined(SIMDE_SHUFFLE_VECTOR_)
765-
r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 4, 1, 2, 3);
766-
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
764+
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
767765
r_.neon_f32 = vsetq_lane_f32(vgetq_lane_f32(b_.neon_f32, 0), a_.neon_f32, 0);
768766
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
769767
static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) m = { ~0U, 0U, 0U, 0U };
@@ -772,6 +770,8 @@ simde_mm_move_ss (simde__m128 a, simde__m128 b) {
772770
r_.wasm_v128 = wasm_i8x16_shuffle(b_.wasm_v128, a_.wasm_v128, 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
773771
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
774772
r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, b_.lsx_i64, 0);
773+
#elif defined(SIMDE_SHUFFLE_VECTOR_)
774+
r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 4, 1, 2, 3);
775775
#else
776776
r_.f32[0] = b_.f32[0];
777777
r_.f32[1] = a_.f32[1];
@@ -3238,9 +3238,7 @@ simde_mm_movelh_ps (simde__m128 a, simde__m128 b) {
32383238
a_ = simde__m128_to_private(a),
32393239
b_ = simde__m128_to_private(b);
32403240

3241-
#if defined(SIMDE_SHUFFLE_VECTOR_)
3242-
r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 1, 4, 5);
3243-
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3241+
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
32443242
float32x2_t a10 = vget_low_f32(a_.neon_f32);
32453243
float32x2_t b10 = vget_low_f32(b_.neon_f32);
32463244
r_.neon_f32 = vcombine_f32(a10, b10);
@@ -3249,6 +3247,8 @@ simde_mm_movelh_ps (simde__m128 a, simde__m128 b) {
32493247
vec_mergeh(a_.altivec_i64, b_.altivec_i64));
32503248
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
32513249
r_.lsx_i64 = __lsx_vilvl_d(b_.lsx_i64, a_.lsx_i64);
3250+
#elif defined(SIMDE_SHUFFLE_VECTOR_)
3251+
r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 1, 4, 5);
32523252
#else
32533253
r_.f32[0] = a_.f32[0];
32543254
r_.f32[1] = a_.f32[1];
@@ -4081,28 +4081,38 @@ simde_mm_shuffle_ps (simde__m128 a, simde__m128 b, const int imm8)
40814081
}
40824082
#if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
40834083
# define simde_mm_shuffle_ps(a, b, imm8) _mm_shuffle_ps(a, b, imm8)
4084-
#elif defined(SIMDE_SHUFFLE_VECTOR_)
4084+
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
40854085
#define simde_mm_shuffle_ps(a, b, imm8) (__extension__ ({ \
4086-
simde__m128_from_private((simde__m128_private) { .f32 = \
4087-
SIMDE_SHUFFLE_VECTOR_(32, 16, \
4088-
simde__m128_to_private(a).f32, \
4089-
simde__m128_to_private(b).f32, \
4090-
(((imm8) ) & 3), \
4091-
(((imm8) >> 2) & 3), \
4092-
(((imm8) >> 4) & 3) + 4, \
4093-
(((imm8) >> 6) & 3) + 4) }); }))
4086+
simde__m128_from_private((simde__m128_private) { .wasm_v128 = \
4087+
wasm_i32x4_shuffle( \
4088+
simde__m128_to_private(a).wasm_v128, \
4089+
simde__m128_to_private(b).wasm_v128, \
4090+
(((imm8) ) & 3), \
4091+
(((imm8) >> 2) & 3), \
4092+
(((imm8) >> 4) & 3) + 4, \
4093+
(((imm8) >> 6) & 3) + 4) }); }))
40944094
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_)
40954095
#define simde_mm_shuffle_ps(a, b, imm8) \
40964096
(__extension__({ \
4097-
float32x4_t simde_mm_shuffle_ps_a_ = simde__m128i_to_neon_f32(a); \
4098-
float32x4_t simde_mm_shuffle_ps_b_ = simde__m128i_to_neon_f32(b); \
4097+
float32x4_t simde_mm_shuffle_ps_a_ = simde__m128_to_neon_f32(a); \
4098+
float32x4_t simde_mm_shuffle_ps_b_ = simde__m128_to_neon_f32(b); \
40994099
float32x4_t simde_mm_shuffle_ps_r_; \
41004100
\
41014101
simde_mm_shuffle_ps_r_ = vmovq_n_f32(vgetq_lane_f32(simde_mm_shuffle_ps_a_, (imm8) & (0x3))); \
41024102
simde_mm_shuffle_ps_r_ = vsetq_lane_f32(vgetq_lane_f32(simde_mm_shuffle_ps_a_, ((imm8) >> 2) & 0x3), simde_mm_shuffle_ps_r_, 1); \
41034103
simde_mm_shuffle_ps_r_ = vsetq_lane_f32(vgetq_lane_f32(simde_mm_shuffle_ps_b_, ((imm8) >> 4) & 0x3), simde_mm_shuffle_ps_r_, 2); \
41044104
vsetq_lane_f32(vgetq_lane_f32(simde_mm_shuffle_ps_b_, ((imm8) >> 6) & 0x3), simde_mm_shuffle_ps_r_, 3); \
41054105
}))
4106+
#elif defined(SIMDE_SHUFFLE_VECTOR_)
4107+
/* Generic vector-extension variant, used when SIMDE_SHUFFLE_VECTOR_ is defined:
 * builds a simde__m128_private whose .f32 member is the shuffle of the .f32
 * views of `a` and `b`. The two low 2-bit fields of imm8 (bits 1:0 and 3:2)
 * index result lanes 0 and 1; the two high fields (bits 5:4 and 7:6) are offset
 * by 4 for result lanes 2 and 3 -- presumably indices 4..7 address the second
 * operand (`b`); confirm against the SIMDE_SHUFFLE_VECTOR_ definition. */
#define simde_mm_shuffle_ps(a, b, imm8) (__extension__ ({ \
4108+
simde__m128_from_private((simde__m128_private) { .f32 = \
4109+
SIMDE_SHUFFLE_VECTOR_(32, 16, \
4110+
simde__m128_to_private(a).f32, \
4111+
simde__m128_to_private(b).f32, \
4112+
(((imm8) ) & 3), \
4113+
(((imm8) >> 2) & 3), \
4114+
(((imm8) >> 4) & 3) + 4, \
4115+
(((imm8) >> 6) & 3) + 4) }); }))
41064116
#endif
41074117
#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
41084118
# define _mm_shuffle_ps(a, b, imm8) simde_mm_shuffle_ps((a), (b), imm8)
@@ -4675,6 +4685,8 @@ simde_mm_unpackhi_ps (simde__m128 a, simde__m128 b) {
46754685
r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
46764686
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
46774687
r_.lsx_i64 = __lsx_vilvh_w(b_.lsx_i64, a_.lsx_i64);
4688+
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
4689+
r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 6, 3, 7);
46784690
#elif defined(SIMDE_SHUFFLE_VECTOR_)
46794691
r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 2, 6, 3, 7);
46804692
#else
@@ -4708,13 +4720,15 @@ simde_mm_unpacklo_ps (simde__m128 a, simde__m128 b) {
47084720
r_.altivec_f32 = vec_mergeh(a_.altivec_f32, b_.altivec_f32);
47094721
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
47104722
r_.lsx_i64 = __lsx_vilvl_w(b_.lsx_i64, a_.lsx_i64);
4711-
#elif defined(SIMDE_SHUFFLE_VECTOR_)
4712-
r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 4, 1, 5);
4723+
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
4724+
r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 4, 1, 5);
47134725
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
47144726
float32x2_t a1 = vget_low_f32(a_.neon_f32);
47154727
float32x2_t b1 = vget_low_f32(b_.neon_f32);
47164728
float32x2x2_t result = vzip_f32(a1, b1);
47174729
r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
4730+
#elif defined(SIMDE_SHUFFLE_VECTOR_)
4731+
r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 4, 1, 5);
47184732
#else
47194733
r_.f32[0] = a_.f32[0];
47204734
r_.f32[1] = b_.f32[0];

0 commit comments

Comments
 (0)