Skip to content

Commit fda2410

Browse files
committed
Merge pull request #9940 from yolanda15:add_relaxed_rminmax
PiperOrigin-RevId: 901018723
2 parents 1883212 + ff2d6b8 commit fda2410

12 files changed

Lines changed: 498 additions & 8 deletions

cmake/gen/wasmrelaxedsimd_microkernels.cmake

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ SET(PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS
5050
src/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8-minmax-wasmrelaxedsimd-fma-splat.c
5151
src/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8-wasmrelaxedsimd-fma-splat.c
5252
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u16-acc2.c
53+
src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u16-acc4.c
5354
src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmrelaxedsimd-arm.c
5455
src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmrelaxedsimd-x86.c
5556
src/f32-vclamp/gen/f32-vclamp-wasmrelaxedsimd.c
@@ -279,6 +280,11 @@ SET(NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS
279280
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u4.c
280281
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u8-acc2.c
281282
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u16-acc4.c
283+
src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u4.c
284+
src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u8-acc2.c
285+
src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u12-acc3.c
286+
src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u16-acc2.c
287+
src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u32-acc4.c
282288
src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmrelaxedsimd-arm-pipelined-u2.c
283289
src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmrelaxedsimd-arm-pipelined.c
284290
src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmrelaxedsimd-arm-u2.c

gen/wasmrelaxedsimd_microkernels.bzl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [
4646
"src/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8-minmax-wasmrelaxedsimd-fma-splat.c",
4747
"src/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8-wasmrelaxedsimd-fma-splat.c",
4848
"src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u16-acc2.c",
49+
"src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u16-acc4.c",
4950
"src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmrelaxedsimd-arm.c",
5051
"src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmrelaxedsimd-x86.c",
5152
"src/f32-vclamp/gen/f32-vclamp-wasmrelaxedsimd.c",
@@ -276,6 +277,11 @@ NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [
276277
"src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u4.c",
277278
"src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u8-acc2.c",
278279
"src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-u16-acc4.c",
280+
"src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u4.c",
281+
"src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u8-acc2.c",
282+
"src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u12-acc3.c",
283+
"src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u16-acc2.c",
284+
"src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u32-acc4.c",
279285
"src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmrelaxedsimd-arm-pipelined-u2.c",
280286
"src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmrelaxedsimd-arm-pipelined.c",
281287
"src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmrelaxedsimd-arm-u2.c",

scripts/generate-f32-rminmax.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,13 @@ tools/xngen src/f32-rminmax/wasmsimd.c.in -D BATCH_TILE=12 -D ACCUMULATORS=3 -D
133133
tools/xngen src/f32-rminmax/wasmsimd.c.in -D BATCH_TILE=16 -D ACCUMULATORS=2 -D OP=MINMAX -D MINMAX=PMINMAX -o src/f32-rminmax/gen/f32-rminmax-wasmsimd-pminmax-u16-acc2.c &
134134
tools/xngen src/f32-rminmax/wasmsimd.c.in -D BATCH_TILE=16 -D ACCUMULATORS=4 -D OP=MINMAX -D MINMAX=PMINMAX -o src/f32-rminmax/gen/f32-rminmax-wasmsimd-pminmax-u16-acc4.c &
135135

136+
tools/xngen src/f32-rminmax/wasmsimd.c.in -D BATCH_TILE=4 -D ACCUMULATORS=1 -D OP=MINMAX -D MINMAX=RELAXED_MINMAX -o src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u4.c &
137+
tools/xngen src/f32-rminmax/wasmsimd.c.in -D BATCH_TILE=8 -D ACCUMULATORS=2 -D OP=MINMAX -D MINMAX=RELAXED_MINMAX -o src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u8-acc2.c &
138+
tools/xngen src/f32-rminmax/wasmsimd.c.in -D BATCH_TILE=12 -D ACCUMULATORS=3 -D OP=MINMAX -D MINMAX=RELAXED_MINMAX -o src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u12-acc3.c &
139+
tools/xngen src/f32-rminmax/wasmsimd.c.in -D BATCH_TILE=16 -D ACCUMULATORS=2 -D OP=MINMAX -D MINMAX=RELAXED_MINMAX -o src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u16-acc2.c &
140+
tools/xngen src/f32-rminmax/wasmsimd.c.in -D BATCH_TILE=16 -D ACCUMULATORS=4 -D OP=MINMAX -D MINMAX=RELAXED_MINMAX -o src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u16-acc4.c &
141+
tools/xngen src/f32-rminmax/wasmsimd.c.in -D BATCH_TILE=32 -D ACCUMULATORS=4 -D OP=MINMAX -D MINMAX=RELAXED_MINMAX -o src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u32-acc4.c &
142+
136143
#################################### Scalar ###################################
137144
### Generic C micro-kernels
138145
tools/xngen src/f32-rminmax/simd.c.in -D ARCH=scalar -D BATCH_TILE=1 -D DATATYPE=F32 -D ACCUMULATORS=1 -D OP=MINMAX -D WASM=0 -o src/f32-rminmax/gen/f32-rminmax-scalar-u1.c &

src/configs/reduce-config.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1040,7 +1040,9 @@ static void init_f32_rminmax_config(void) {
10401040
{
10411041
f32_rminmax_config.ukernel = XNN_INIT_REDUCE_UKERNEL(xnn_f32_rminmax_ukernel__scalar_u4_acc4);
10421042
}
1043-
#elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1043+
#elif XNN_ARCH_WASMRELAXEDSIMD
1044+
f32_rminmax_config.ukernel = XNN_INIT_REDUCE_UKERNEL(xnn_f32_rminmax_ukernel__wasmrelaxedsimd_minmax_u16_acc4);
1045+
#elif XNN_ARCH_WASMSIMD
10441046
f32_rminmax_config.ukernel = XNN_INIT_REDUCE_UKERNEL(xnn_f32_rminmax_ukernel__wasmsimd_minmax_u16_acc4);
10451047
#elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
10461048
f32_rminmax_config.ukernel = XNN_INIT_REDUCE_UKERNEL(xnn_f32_rminmax_ukernel__rvv_u8v);

src/f32-rminmax/f32-rminmax.inc

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_rminmax_ukernel__avx512f_u64_acc2, 64,
4343
XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_rminmax_ukernel__avx512f_u64_acc4, 64, false, float, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL))
4444
#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
4545

46-
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
46+
#if XNN_ARCH_WASMSIMD
4747
XNN_UKERNEL(xnn_arch_none, xnn_f32_rminmax_ukernel__wasmsimd_minmax_u4, 4, false, float, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL))
4848
XNN_UKERNEL(xnn_arch_none, xnn_f32_rminmax_ukernel__wasmsimd_minmax_u8_acc2, 8, false, float, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL))
4949
XNN_UKERNEL(xnn_arch_none, xnn_f32_rminmax_ukernel__wasmsimd_minmax_u12_acc3, 12, false, float, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL))
@@ -54,7 +54,16 @@ XNN_UKERNEL(xnn_arch_none, xnn_f32_rminmax_ukernel__wasmsimd_pminmax_u8_acc2, 8,
5454
XNN_UKERNEL(xnn_arch_none, xnn_f32_rminmax_ukernel__wasmsimd_pminmax_u12_acc3, 12, false, float, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL))
5555
XNN_UKERNEL(xnn_arch_none, xnn_f32_rminmax_ukernel__wasmsimd_pminmax_u16_acc2, 16, false, float, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL))
5656
XNN_UKERNEL(xnn_arch_none, xnn_f32_rminmax_ukernel__wasmsimd_pminmax_u16_acc4, 16, false, float, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL))
57-
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
57+
#endif // XNN_ARCH_WASMSIMD
58+
59+
// WAsm relaxed-SIMD f32 rminmax kernels. The third XNN_UKERNEL argument follows each kernel's uN suffix (4, 8, 12, 16, 32).
#if XNN_ARCH_WASMRELAXEDSIMD
60+
XNN_UKERNEL(xnn_arch_none, xnn_f32_rminmax_ukernel__wasmrelaxedsimd_minmax_u4, 4, false, float, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL))
61+
XNN_UKERNEL(xnn_arch_none, xnn_f32_rminmax_ukernel__wasmrelaxedsimd_minmax_u8_acc2, 8, false, float, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL))
62+
XNN_UKERNEL(xnn_arch_none, xnn_f32_rminmax_ukernel__wasmrelaxedsimd_minmax_u12_acc3, 12, false, float, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL))
63+
XNN_UKERNEL(xnn_arch_none, xnn_f32_rminmax_ukernel__wasmrelaxedsimd_minmax_u16_acc2, 16, false, float, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL))
64+
XNN_UKERNEL(xnn_arch_none, xnn_f32_rminmax_ukernel__wasmrelaxedsimd_minmax_u16_acc4, 16, false, float, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL))
65+
XNN_UKERNEL(xnn_arch_none, xnn_f32_rminmax_ukernel__wasmrelaxedsimd_minmax_u32_acc4, 32, false, float, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL))
66+
#endif // XNN_ARCH_WASMRELAXEDSIMD
5867

5968
#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
6069
XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_rminmax_ukernel__rvv_u1v, 1, true, float, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL))
src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u12-acc3.c

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
// clang-format off
2+
// Auto-generated file. Do not edit!
3+
// Template: src/f32-rminmax/wasmsimd.c.in
4+
// Generator: tools/xngen
5+
//
6+
// Copyright 2023 Google LLC
7+
//
8+
// This source code is licensed under the BSD-style license found in the
9+
// LICENSE file in the root directory of this source tree.
10+
11+
#include <assert.h>
12+
13+
#include <wasm_simd128.h>
14+
15+
#include "src/xnnpack/common.h"
16+
#include "src/xnnpack/reduce.h"
17+
18+
19+
// Running min/max reduction over a float array using WebAssembly relaxed-SIMD
// min/max, with a main loop that consumes 12 floats per iteration into three
// independent (min, max) accumulator pairs.
//
//   batch  - input size in BYTES; asserted non-zero and a multiple of sizeof(float).
//   input  - floats to reduce; asserted non-NULL.
//   output - output[0] / output[1] are read as the starting min / max and are
//            overwritten with the final min / max; asserted non-NULL.
//   params - not referenced by this kernel.
//
// NOTE(review): relaxed min/max results for NaN and mixed-sign zero operands are
// implementation-defined per the relaxed-SIMD proposal - confirm callers tolerate that.
void xnn_f32_rminmax_ukernel__wasmrelaxedsimd_minmax_u12_acc3(
20+
size_t batch,
21+
const float* input,
22+
float* output,
23+
const struct xnn_f32_default_params* restrict params)
24+
{
25+
assert(batch != 0);
26+
assert(batch % sizeof(float) == 0);
27+
assert(input != NULL);
28+
assert(output != NULL);
29+
30+
v128_t vmin0 = wasm_v128_load32_splat(output);  // seed every lane with the incoming min (output[0])
31+
v128_t vmax0 = wasm_v128_load32_splat(output + 1);  // seed every lane with the incoming max (output[1])
32+
v128_t vmin1 = vmin0;
33+
v128_t vmax1 = vmax0;
34+
v128_t vmin2 = vmin0;
35+
v128_t vmax2 = vmax0;
36+
for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) {  // main loop: 3 vectors x 4 floats
37+
const v128_t vt0 = wasm_v128_load(input);
38+
const v128_t vt1 = wasm_v128_load(input + 4);
39+
const v128_t vt2 = wasm_v128_load(input + 8);
40+
input += 12;
41+
42+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vt0);  // each loaded vector updates its own accumulator pair
43+
vmin1 = wasm_f32x4_relaxed_min(vmin1, vt1);
44+
vmin2 = wasm_f32x4_relaxed_min(vmin2, vt2);
45+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vt0);
46+
vmax1 = wasm_f32x4_relaxed_max(vmax1, vt1);
47+
vmax2 = wasm_f32x4_relaxed_max(vmax2, vt2);
48+
}
49+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vmin1);  // merge accumulators 1 and 2 into accumulator 0
50+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vmax1);
51+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vmin2);
52+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vmax2);
53+
for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {  // remaining whole vectors, one at a time
54+
const v128_t vt = wasm_v128_load(input);
55+
input += 4;
56+
57+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vt);
58+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vt);
59+
}
60+
vmin0 = wasm_f32x4_relaxed_min(vmin0, wasm_v64x2_shuffle(vmin0, vmin0, 1, 1));  // fold the upper 64 bits onto lanes 0-1
61+
vmax0 = wasm_f32x4_relaxed_max(vmax0, wasm_v64x2_shuffle(vmax0, vmax0, 1, 1));
62+
if XNN_UNLIKELY(batch & (2 * sizeof(float))) {  // two leftover floats
63+
const v128_t vt = wasm_v128_load64_zero(input);  // lanes 2-3 of vt are not consumed below: only lanes 0-1 feed the result
64+
input += 2;
65+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vt);
66+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vt);
67+
}
68+
vmin0 = wasm_f32x4_relaxed_min(vmin0, wasm_v32x4_shuffle(vmin0, vmin0, 1, 1, 1, 1));  // fold lane 1 onto lane 0
69+
vmax0 = wasm_f32x4_relaxed_max(vmax0, wasm_v32x4_shuffle(vmax0, vmax0, 1, 1, 1, 1));
70+
if XNN_UNLIKELY(batch & (1 * sizeof(float))) {  // one leftover float
71+
const v128_t vt = wasm_v128_load32_zero(input);  // only lane 0 is stored afterwards
72+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vt);
73+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vt);
74+
}
75+
wasm_v128_store32_lane(output, vmin0, 0);  // lane 0 holds the final min
76+
wasm_v128_store32_lane(output + 1, vmax0, 0);  // lane 0 holds the final max
77+
}
src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u16-acc2.c

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
// clang-format off
2+
// Auto-generated file. Do not edit!
3+
// Template: src/f32-rminmax/wasmsimd.c.in
4+
// Generator: tools/xngen
5+
//
6+
// Copyright 2023 Google LLC
7+
//
8+
// This source code is licensed under the BSD-style license found in the
9+
// LICENSE file in the root directory of this source tree.
10+
11+
#include <assert.h>
12+
13+
#include <wasm_simd128.h>
14+
15+
#include "src/xnnpack/common.h"
16+
#include "src/xnnpack/reduce.h"
17+
18+
19+
// Running min/max reduction over a float array using WebAssembly relaxed-SIMD
// min/max, with a main loop that consumes 16 floats per iteration into two
// (min, max) accumulator pairs (each pair absorbs two of the four loaded vectors).
//
//   batch  - input size in BYTES; asserted non-zero and a multiple of sizeof(float).
//   input  - floats to reduce; asserted non-NULL.
//   output - output[0] / output[1] are read as the starting min / max and are
//            overwritten with the final min / max; asserted non-NULL.
//   params - not referenced by this kernel.
//
// NOTE(review): relaxed min/max results for NaN and mixed-sign zero operands are
// implementation-defined per the relaxed-SIMD proposal - confirm callers tolerate that.
void xnn_f32_rminmax_ukernel__wasmrelaxedsimd_minmax_u16_acc2(
20+
size_t batch,
21+
const float* input,
22+
float* output,
23+
const struct xnn_f32_default_params* restrict params)
24+
{
25+
assert(batch != 0);
26+
assert(batch % sizeof(float) == 0);
27+
assert(input != NULL);
28+
assert(output != NULL);
29+
30+
v128_t vmin0 = wasm_v128_load32_splat(output);  // seed every lane with the incoming min (output[0])
31+
v128_t vmax0 = wasm_v128_load32_splat(output + 1);  // seed every lane with the incoming max (output[1])
32+
v128_t vmin1 = vmin0;
33+
v128_t vmax1 = vmax0;
34+
for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {  // main loop: 4 vectors x 4 floats
35+
const v128_t vt0 = wasm_v128_load(input);
36+
const v128_t vt1 = wasm_v128_load(input + 4);
37+
const v128_t vt2 = wasm_v128_load(input + 8);
38+
const v128_t vt3 = wasm_v128_load(input + 12);
39+
input += 16;
40+
41+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vt0);  // vt0/vt2 go to accumulator 0, vt1/vt3 to accumulator 1
42+
vmin1 = wasm_f32x4_relaxed_min(vmin1, vt1);
43+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vt2);
44+
vmin1 = wasm_f32x4_relaxed_min(vmin1, vt3);
45+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vt0);
46+
vmax1 = wasm_f32x4_relaxed_max(vmax1, vt1);
47+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vt2);
48+
vmax1 = wasm_f32x4_relaxed_max(vmax1, vt3);
49+
}
50+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vmin1);  // merge accumulator 1 into accumulator 0
51+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vmax1);
52+
for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {  // remaining whole vectors, one at a time
53+
const v128_t vt = wasm_v128_load(input);
54+
input += 4;
55+
56+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vt);
57+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vt);
58+
}
59+
vmin0 = wasm_f32x4_relaxed_min(vmin0, wasm_v64x2_shuffle(vmin0, vmin0, 1, 1));  // fold the upper 64 bits onto lanes 0-1
60+
vmax0 = wasm_f32x4_relaxed_max(vmax0, wasm_v64x2_shuffle(vmax0, vmax0, 1, 1));
61+
if XNN_UNLIKELY(batch & (2 * sizeof(float))) {  // two leftover floats
62+
const v128_t vt = wasm_v128_load64_zero(input);  // lanes 2-3 of vt are not consumed below: only lanes 0-1 feed the result
63+
input += 2;
64+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vt);
65+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vt);
66+
}
67+
vmin0 = wasm_f32x4_relaxed_min(vmin0, wasm_v32x4_shuffle(vmin0, vmin0, 1, 1, 1, 1));  // fold lane 1 onto lane 0
68+
vmax0 = wasm_f32x4_relaxed_max(vmax0, wasm_v32x4_shuffle(vmax0, vmax0, 1, 1, 1, 1));
69+
if XNN_UNLIKELY(batch & (1 * sizeof(float))) {  // one leftover float
70+
const v128_t vt = wasm_v128_load32_zero(input);  // only lane 0 is stored afterwards
71+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vt);
72+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vt);
73+
}
74+
wasm_v128_store32_lane(output, vmin0, 0);  // lane 0 holds the final min
75+
wasm_v128_store32_lane(output + 1, vmax0, 0);  // lane 0 holds the final max
76+
}
src/f32-rminmax/gen/f32-rminmax-wasmrelaxedsimd-minmax-u16-acc4.c

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
// clang-format off
2+
// Auto-generated file. Do not edit!
3+
// Template: src/f32-rminmax/wasmsimd.c.in
4+
// Generator: tools/xngen
5+
//
6+
// Copyright 2023 Google LLC
7+
//
8+
// This source code is licensed under the BSD-style license found in the
9+
// LICENSE file in the root directory of this source tree.
10+
11+
#include <assert.h>
12+
13+
#include <wasm_simd128.h>
14+
15+
#include "src/xnnpack/common.h"
16+
#include "src/xnnpack/reduce.h"
17+
18+
19+
// Running min/max reduction over a float array using WebAssembly relaxed-SIMD
// min/max, with a main loop that consumes 16 floats per iteration into four
// independent (min, max) accumulator pairs.
//
//   batch  - input size in BYTES; asserted non-zero and a multiple of sizeof(float).
//   input  - floats to reduce; asserted non-NULL.
//   output - output[0] / output[1] are read as the starting min / max and are
//            overwritten with the final min / max; asserted non-NULL.
//   params - not referenced by this kernel.
//
// NOTE(review): relaxed min/max results for NaN and mixed-sign zero operands are
// implementation-defined per the relaxed-SIMD proposal - confirm callers tolerate that.
void xnn_f32_rminmax_ukernel__wasmrelaxedsimd_minmax_u16_acc4(
20+
size_t batch,
21+
const float* input,
22+
float* output,
23+
const struct xnn_f32_default_params* restrict params)
24+
{
25+
assert(batch != 0);
26+
assert(batch % sizeof(float) == 0);
27+
assert(input != NULL);
28+
assert(output != NULL);
29+
30+
v128_t vmin0 = wasm_v128_load32_splat(output);  // seed every lane with the incoming min (output[0])
31+
v128_t vmax0 = wasm_v128_load32_splat(output + 1);  // seed every lane with the incoming max (output[1])
32+
v128_t vmin1 = vmin0;
33+
v128_t vmax1 = vmax0;
34+
v128_t vmin2 = vmin0;
35+
v128_t vmax2 = vmax0;
36+
v128_t vmin3 = vmin0;
37+
v128_t vmax3 = vmax0;
38+
for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {  // main loop: 4 vectors x 4 floats
39+
const v128_t vt0 = wasm_v128_load(input);
40+
const v128_t vt1 = wasm_v128_load(input + 4);
41+
const v128_t vt2 = wasm_v128_load(input + 8);
42+
const v128_t vt3 = wasm_v128_load(input + 12);
43+
input += 16;
44+
45+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vt0);  // each loaded vector updates its own accumulator pair
46+
vmin1 = wasm_f32x4_relaxed_min(vmin1, vt1);
47+
vmin2 = wasm_f32x4_relaxed_min(vmin2, vt2);
48+
vmin3 = wasm_f32x4_relaxed_min(vmin3, vt3);
49+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vt0);
50+
vmax1 = wasm_f32x4_relaxed_max(vmax1, vt1);
51+
vmax2 = wasm_f32x4_relaxed_max(vmax2, vt2);
52+
vmax3 = wasm_f32x4_relaxed_max(vmax3, vt3);
53+
}
54+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vmin1);  // pairwise merge: (0,1) and (2,3), then the two partial results
55+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vmax1);
56+
vmin2 = wasm_f32x4_relaxed_min(vmin2, vmin3);
57+
vmax2 = wasm_f32x4_relaxed_max(vmax2, vmax3);
58+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vmin2);
59+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vmax2);
60+
for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {  // remaining whole vectors, one at a time
61+
const v128_t vt = wasm_v128_load(input);
62+
input += 4;
63+
64+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vt);
65+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vt);
66+
}
67+
vmin0 = wasm_f32x4_relaxed_min(vmin0, wasm_v64x2_shuffle(vmin0, vmin0, 1, 1));  // fold the upper 64 bits onto lanes 0-1
68+
vmax0 = wasm_f32x4_relaxed_max(vmax0, wasm_v64x2_shuffle(vmax0, vmax0, 1, 1));
69+
if XNN_UNLIKELY(batch & (2 * sizeof(float))) {  // two leftover floats
70+
const v128_t vt = wasm_v128_load64_zero(input);  // lanes 2-3 of vt are not consumed below: only lanes 0-1 feed the result
71+
input += 2;
72+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vt);
73+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vt);
74+
}
75+
vmin0 = wasm_f32x4_relaxed_min(vmin0, wasm_v32x4_shuffle(vmin0, vmin0, 1, 1, 1, 1));  // fold lane 1 onto lane 0
76+
vmax0 = wasm_f32x4_relaxed_max(vmax0, wasm_v32x4_shuffle(vmax0, vmax0, 1, 1, 1, 1));
77+
if XNN_UNLIKELY(batch & (1 * sizeof(float))) {  // one leftover float
78+
const v128_t vt = wasm_v128_load32_zero(input);  // only lane 0 is stored afterwards
79+
vmin0 = wasm_f32x4_relaxed_min(vmin0, vt);
80+
vmax0 = wasm_f32x4_relaxed_max(vmax0, vt);
81+
}
82+
wasm_v128_store32_lane(output, vmin0, 0);  // lane 0 holds the final min
83+
wasm_v128_store32_lane(output + 1, vmax0, 0);  // lane 0 holds the final max
84+
}

0 commit comments

Comments
 (0)