Skip to content

Commit adba2c3

Browse files
authored
Merge pull request #5685 from teddygood/wasm-intrin-backend-exp
Add a WebAssembly SIMD backend for reusable intrinsics kernels
2 parents c9dae4c + 99d0557 commit adba2c3

File tree

5 files changed

+86
-3
lines changed

5 files changed

+86
-3
lines changed

kernel/simd/intrin.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,11 @@ extern "C" {
5656
#include <arm_neon.h>
5757
#endif
5858

59+
/** WASM SIMD **/
60+
#if defined(ARCH_WASM) && defined(__wasm_simd128__)
61+
#include <wasm_simd128.h>
62+
#endif
63+
5964
// distribute
6065
#if defined(HAVE_AVX512VL) || defined(HAVE_AVX512BF16)
6166
#include "intrin_avx512.h"
@@ -69,6 +74,10 @@ extern "C" {
6974
#include "intrin_neon.h"
7075
#endif
7176

77+
#if defined(ARCH_WASM) && defined(__wasm_simd128__)
78+
#include "intrin_wasm.h"
79+
#endif
80+
7281
#ifndef V_SIMD
7382
#define V_SIMD 0
7483
#define V_SIMD_F64 0

kernel/simd/intrin_wasm.h

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#include <wasm_simd128.h>
2+
3+
#define V_SIMD 128
4+
#define V_SIMD_F64 1
5+
6+
/***************************
7+
* Data Type
8+
***************************/
9+
typedef v128_t v_f32;
10+
typedef v128_t v_f64;
11+
#define v_nlanes_f32 4
12+
#define v_nlanes_f64 2
13+
14+
/***************************
15+
* Arithmetic
16+
***************************/
17+
#define v_add_f32 wasm_f32x4_add
18+
#define v_add_f64 wasm_f64x2_add
19+
#define v_sub_f32 wasm_f32x4_sub
20+
#define v_sub_f64 wasm_f64x2_sub
21+
#define v_mul_f32 wasm_f32x4_mul
22+
#define v_mul_f64 wasm_f64x2_mul
23+
24+
BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
25+
{ return v_add_f32(v_mul_f32(a, b), c); }
26+
27+
BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c)
28+
{ return v_add_f64(v_mul_f64(a, b), c); }
29+
30+
BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c)
31+
{ return v_sub_f32(v_mul_f32(a, b), c); }
32+
33+
BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c)
34+
{ return v_sub_f64(v_mul_f64(a, b), c); }
35+
36+
/***************************
37+
* reduction
38+
***************************/
39+
BLAS_FINLINE float v_sum_f32(v_f32 a)
40+
{
41+
return wasm_f32x4_extract_lane(a, 0)
42+
+ wasm_f32x4_extract_lane(a, 1)
43+
+ wasm_f32x4_extract_lane(a, 2)
44+
+ wasm_f32x4_extract_lane(a, 3);
45+
}
46+
47+
BLAS_FINLINE double v_sum_f64(v_f64 a)
48+
{
49+
return wasm_f64x2_extract_lane(a, 0)
50+
+ wasm_f64x2_extract_lane(a, 1);
51+
}
52+
53+
/***************************
54+
* memory
55+
***************************/
56+
#define v_loadu_f32(a) wasm_v128_load((const float*)a)
57+
#define v_loadu_f64(a) wasm_v128_load((const double*)a)
58+
#define v_storeu_f32(a, v) wasm_v128_store((float*)a, v)
59+
#define v_storeu_f64(a, v) wasm_v128_store((double*)a, v)
60+
#define v_setall_f32(VAL) wasm_f32x4_splat(VAL)
61+
#define v_setall_f64(VAL) wasm_f64x2_splat(VAL)
62+
#define v_zero_f32() wasm_f32x4_splat(0.0f)
63+
#define v_zero_f64() wasm_f64x2_splat(0.0)

kernel/wasm/KERNEL.WASM128_GENERIC

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ DSUMKERNEL = ../arm/sum.c
4040
CSUMKERNEL = ../arm/zsum.c
4141
ZSUMKERNEL = ../arm/zsum.c
4242

43-
SAXPYKERNEL = ../riscv64/axpy.c
43+
SAXPYKERNEL = ../x86_64/saxpy.c
4444
DAXPYKERNEL = ../x86_64/daxpy.c
4545
CAXPYKERNEL = ../riscv64/zaxpy.c
4646
ZAXPYKERNEL = ../riscv64/zaxpy.c

kernel/x86_64/saxpy.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4343

4444

4545
#ifndef HAVE_KERNEL_16
46+
#include"../simd/intrin.h"
4647

4748
static void saxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
4849
{
4950
BLASLONG register i = 0;
5051
FLOAT a = *alpha;
5152

53+
#if V_SIMD
54+
v_f32 __alpha, tmp;
55+
__alpha = v_setall_f32(*alpha);
56+
const int vstep = v_nlanes_f32;
57+
58+
for (; i < n; i += vstep) {
59+
tmp = v_muladd_f32(__alpha, v_loadu_f32(x + i), v_loadu_f32(y + i));
60+
v_storeu_f32(y + i, tmp);
61+
}
62+
#else
5263
while(i < n)
5364
{
5465
y[i] += a * x[i];
@@ -62,6 +73,7 @@ static void saxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
6273
i+=8 ;
6374

6475
}
76+
#endif
6577

6678
}
6779

@@ -131,4 +143,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
131143

132144
}
133145

134-

kernel/x86_64/srot.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
1313
{
1414
BLASLONG i = 0;
1515

16-
#if V_SIMD && !defined(C_PGI) && (defined(HAVE_FMA3) || V_SIMD > 128)
16+
#if V_SIMD && !defined(C_PGI) && (defined(HAVE_FMA3) || V_SIMD > 128 || defined(ARCH_WASM))
1717
const int vstep = v_nlanes_f32;
1818
const int unrollx4 = n & (-vstep * 4);
1919
const int unrollx = n & -vstep;

0 commit comments

Comments
 (0)