Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions cmake/gen/rvv_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ SET(PROD_RVV_MICROKERNEL_SRCS
src/f32-spmm/gen/f32-spmm-4vx4-minmax-rvv.c
src/f32-spmm/gen/f32-spmm-8vx1-minmax-rvv.c
src/f32-spmm/gen/f32-spmm-8vx2-minmax-rvv.c
src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c
src/f32-vbinary/gen/f32-vadd-rvv-u8v.c
src/f32-vbinary/gen/f32-vaddc-rvv-u8v.c
src/f32-vbinary/gen/f32-vdiv-rvv-u8v.c
Expand All @@ -69,6 +70,8 @@ SET(PROD_RVV_MICROKERNEL_SRCS
src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c
src/f32-vcos/gen/f32-vcos-rvv-rational-5-4-div-u8v.c
src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u8v.c
src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c
src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c
src/f32-vhswish/gen/f32-vhswish-rvv-u4v.c
src/f32-vlog/gen/f32-vlog-rvv-rational-3-3-div-u8v.c
src/f32-vlrelu/gen/f32-vlrelu-rvv-u4v.c
Expand Down Expand Up @@ -198,6 +201,13 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS
src/f32-spmm/gen/f32-spmm-4vx1-minmax-rvv.c
src/f32-spmm/gen/f32-spmm-4vx2-minmax-rvv.c
src/f32-spmm/gen/f32-spmm-8vx4-minmax-rvv.c
src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c
src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c
src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c
src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c
src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c
src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c
src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c
src/f32-vbinary/gen/f32-vadd-rvv-u4v.c
src/f32-vbinary/gen/f32-vaddc-rvv-u4v.c
src/f32-vbinary/gen/f32-vdiv-rvv-u4v.c
Expand Down Expand Up @@ -231,6 +241,16 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS
src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u1v.c
src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u2v.c
src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u4v.c
src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c
src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c
src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c
src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c
src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c
src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c
src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c
src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c
src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c
src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c
src/f32-vhswish/gen/f32-vhswish-rvv-u1v.c
src/f32-vhswish/gen/f32-vhswish-rvv-u2v.c
src/f32-vhswish/gen/f32-vhswish-rvv-u8v.c
Expand Down
20 changes: 20 additions & 0 deletions gen/rvv_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ PROD_RVV_MICROKERNEL_SRCS = [
"src/f32-spmm/gen/f32-spmm-4vx4-minmax-rvv.c",
"src/f32-spmm/gen/f32-spmm-8vx1-minmax-rvv.c",
"src/f32-spmm/gen/f32-spmm-8vx2-minmax-rvv.c",
"src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c",
"src/f32-vbinary/gen/f32-vadd-rvv-u8v.c",
"src/f32-vbinary/gen/f32-vaddc-rvv-u8v.c",
"src/f32-vbinary/gen/f32-vdiv-rvv-u8v.c",
Expand All @@ -65,6 +66,8 @@ PROD_RVV_MICROKERNEL_SRCS = [
"src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c",
"src/f32-vcos/gen/f32-vcos-rvv-rational-5-4-div-u8v.c",
"src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u8v.c",
"src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c",
"src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c",
"src/f32-vhswish/gen/f32-vhswish-rvv-u4v.c",
"src/f32-vlog/gen/f32-vlog-rvv-rational-3-3-div-u8v.c",
"src/f32-vlrelu/gen/f32-vlrelu-rvv-u4v.c",
Expand Down Expand Up @@ -195,6 +198,13 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [
"src/f32-spmm/gen/f32-spmm-4vx1-minmax-rvv.c",
"src/f32-spmm/gen/f32-spmm-4vx2-minmax-rvv.c",
"src/f32-spmm/gen/f32-spmm-8vx4-minmax-rvv.c",
"src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c",
"src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c",
"src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c",
"src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c",
"src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c",
"src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c",
"src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c",
"src/f32-vbinary/gen/f32-vadd-rvv-u4v.c",
"src/f32-vbinary/gen/f32-vaddc-rvv-u4v.c",
"src/f32-vbinary/gen/f32-vdiv-rvv-u4v.c",
Expand Down Expand Up @@ -228,6 +238,16 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [
"src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u1v.c",
"src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u2v.c",
"src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u4v.c",
"src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c",
"src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c",
"src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c",
"src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c",
"src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c",
"src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c",
"src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c",
"src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c",
"src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c",
"src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c",
"src/f32-vhswish/gen/f32-vhswish-rvv-u1v.c",
"src/f32-vhswish/gen/f32-vhswish-rvv-u2v.c",
"src/f32-vhswish/gen/f32-vhswish-rvv-u8v.c",
Expand Down
10 changes: 10 additions & 0 deletions scripts/generate-f32-vapproxgelu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,14 @@ tools/xngen src/f32-vapproxgelu/rational-12-10.c.in -D ARCH=hvx -D BATCH_TILES=3

tools/xngen src/f32-vapproxgelu/rational-12-10.c.in -D ARCH=avx512f -D BATCH_TILES=16,32,48,64 -D DIV=NR -o src/f32-vapproxgelu/gen/f32-vapproxgelu-avx512f-rational-12-10-nr.c &

################################## RISC-V RVV #################################
tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=1 -D DIV=DIV -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c &
tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=2 -D DIV=DIV -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c &
tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=4 -D DIV=DIV -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c &
tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=8 -D DIV=DIV -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c &
tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=1 -D DIV=NR -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c &
tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=2 -D DIV=NR -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c &
tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=4 -D DIV=NR -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c &
tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=8 -D DIV=NR -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c &

wait
6 changes: 6 additions & 0 deletions scripts/generate-f32-velu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -149,4 +149,10 @@ tools/xngen src/f32-velu/avx512f-rr1-p6.c.in -D BATCH_TILE=32 -o src/f32-velu/g
tools/xngen src/f32-velu/avx512f-rr1-p6.c.in -D BATCH_TILE=48 -o src/f32-velu/gen/f32-velu-avx512f-rr1-p6-u48.c &
tools/xngen src/f32-velu/avx512f-rr1-p6.c.in -D BATCH_TILE=64 -o src/f32-velu/gen/f32-velu-avx512f-rr1-p6-u64.c &

################################## RISC-V RVV #################################
tools/xngen src/f32-velu/rvv-rr2-p6.c.in -D LMUL=1 -o src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c &
tools/xngen src/f32-velu/rvv-rr2-p6.c.in -D LMUL=2 -o src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c &
tools/xngen src/f32-velu/rvv-rr2-p6.c.in -D LMUL=4 -o src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c &
tools/xngen src/f32-velu/rvv-rr2-p6.c.in -D LMUL=8 -o src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c &

wait
10 changes: 10 additions & 0 deletions scripts/generate-f32-vgelu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,14 @@ tools/xngen src/f32-vgelu/rational-12-10.c.in -D ARCH=hvx -D BATCH_TILES=32
tools/xngen src/f32-vgelu/rational-12-10.c.in -D ARCH=avx512f -D BATCH_TILES=16,32,48,64 -D DIV=NR -o src/f32-vgelu/gen/f32-vgelu-avx512f-rational-12-10-nr.c &
tools/xngen src/f32-vgelu/rational-12-10.c.in -D ARCH=hvx -D BATCH_TILES=32,64,128 -D DIV=NR -o src/f32-vgelu/gen/f32-vgelu-hvx-rational-12-10-nr.c &

################################## RISC-V RVV #################################
tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=1 -D DIV=DIV -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c &
tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=2 -D DIV=DIV -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c &
tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=4 -D DIV=DIV -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c &
tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=8 -D DIV=DIV -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c &
tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=1 -D DIV=NR -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c &
tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=2 -D DIV=NR -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c &
tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=4 -D DIV=NR -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c &
tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=8 -D DIV=NR -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c &

wait
43 changes: 40 additions & 3 deletions src/configs/unary-elementwise-config.c
Original file line number Diff line number Diff line change
Expand Up @@ -1090,6 +1090,19 @@ static void init_f32_approxgelu_config_impl(struct xnn_unary_elementwise_config*
config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vapproxgelu_ukernel__hvx_rational_12_10_div_u128);
config->element_tile = 128;
}
#elif XNN_ARCH_RISCV
#if XNN_ENABLE_RISCV_VECTOR
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
assert(hardware_config != NULL);
if (hardware_config->arch_flags & xnn_arch_riscv_vector) {
config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u4v);
config->element_tile = 4 * hardware_config->vlenb / sizeof(float);
} else
#endif
{
config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vapproxgelu_ukernel__scalar_rational_12_10_div_u1);
config->element_tile = 1;
}
#else
config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vapproxgelu_ukernel__scalar_rational_12_10_div_u1);
config->element_tile = 1;
Expand Down Expand Up @@ -1348,9 +1361,20 @@ static void init_f32_elu_config(void) {
}
#endif
#elif XNN_ARCH_RISCV
f32_elu_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u4);
f32_elu_config.element_tile = 4;
f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params;
#if XNN_ENABLE_RISCV_VECTOR
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
assert(hardware_config != NULL);
if (hardware_config->arch_flags & xnn_arch_riscv_vector) {
f32_elu_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_velu_ukernel__rvv_rr2_p6_u4v);
f32_elu_config.element_tile = 4 * hardware_config->vlenb / sizeof(float);
f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params;
} else
#endif
{
f32_elu_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u4);
f32_elu_config.element_tile = 4;
f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params;
}
#else
f32_elu_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u4);
f32_elu_config.element_tile = 4;
Expand Down Expand Up @@ -1423,6 +1447,19 @@ static void init_f32_gelu_config_impl(struct xnn_unary_elementwise_config* confi
config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vgelu_ukernel__hvx_rational_12_10_div_u128);
config->element_tile = 128;
}
#elif XNN_ARCH_RISCV
#if XNN_ENABLE_RISCV_VECTOR
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
assert(hardware_config != NULL);
if (hardware_config->arch_flags & xnn_arch_riscv_vector) {
config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u4v);
config->element_tile = 4 * hardware_config->vlenb / sizeof(float);
} else
#endif
{
config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vgelu_ukernel__scalar_rational_12_10_div_u1);
config->element_tile = 1;
}
#else
config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vgelu_ukernel__scalar_rational_12_10_div_u1);
config->element_tile = 1;
Expand Down
7 changes: 7 additions & 0 deletions src/f32-vapproxgelu/f32-vapproxgelu.inc
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,10 @@ XNN_UKERNEL(xnn_arch_none, xnn_f32_vapproxgelu_ukernel__wasmsimd_rational_12_10_
XNN_UKERNEL(xnn_arch_none, xnn_f32_vapproxgelu_ukernel__wasmsimd_rational_12_10_div_u12, 12, false, float, struct xnn_f32_default_params, NULL)
XNN_UKERNEL(xnn_arch_none, xnn_f32_vapproxgelu_ukernel__wasmsimd_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, NULL)
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD

#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
// RISC-V Vector (RVV) rational_12_10 kernels, `div` variant: one entry for
// each of the u1v/u2v/u4v/u8v kernels. These entries are only expanded when
// targeting RISC-V with XNN_ENABLE_RISCV_VECTOR set.
// NOTE(review): the build lists and the generator script touched by this
// change also produce `nr` variants of these kernels, but no `nr` entries
// appear here -- confirm whether that omission is intentional.
XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u1v, 1, true, float, struct xnn_f32_default_params, NULL)
XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u2v, 2, true, float, struct xnn_f32_default_params, NULL)
XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u4v, 4, true, float, struct xnn_f32_default_params, NULL)
XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u8v, 8, true, float, struct xnn_f32_default_params, NULL)
#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// clang-format off
// Auto-generated file. Do not edit!
// Template: src/f32-vapproxgelu/rvv-rational-12-10.c.in
// Generator: tools/xngen
//
// Copyright 2024 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <riscv_vector.h>

#include "src/xnnpack/common.h"
#include "src/xnnpack/vunary.h"


// Elementwise f32 kernel using RVV intrinsics at LMUL=1.
//
// For every input element x this writes
//     y = x * 0.5 * (1 + p(xc) / q(xc)),
// where xc is x clamped to [-4.84974098, +4.84974098], p is an odd polynomial
// of degree 11 and q is an even polynomial of degree 10 with constant term 1.
//
//   batch          size of the input in BYTES; must be a non-zero multiple of
//                  sizeof(float).
//   input          source elements (must not be NULL).
//   output         destination elements (must not be NULL).
//   unused_params  not read by this kernel.
void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u1v(
    size_t batch,
    const float* input,
    float* output,
    const struct xnn_f32_default_params* unused_params)
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input != NULL);
  assert(output != NULL);

  // Clamping bounds. Past these the rational interpolant p/q already
  // evaluates to +/-1.0f, so clamping keeps the polynomials inside the range
  // they were fitted for.
  const float clamp_hi = 4.84974098e+00f;
  const float clamp_lo = -4.84974098e+00f;

  // Numerator p: coefficients of x^1, x^3, ..., x^11.
  const float num_c1 = 7.9788458347e-01f;
  const float num_c3 = 6.0803253204e-02f;
  const float num_c5 = 7.2898347862e-03f;
  const float num_c7 = 2.6887017884e-04f;
  const float num_c9 = 1.4302649106e-05f;
  const float num_c11 = 4.9544411240e-08f;

  // Denominator q: coefficients of x^2, x^4, ..., x^10 (constant term is 1).
  const float den_c2 = 2.4369759858e-01f;
  const float den_c4 = 2.4381054565e-02f;
  const float den_c6 = 1.3060354395e-03f;
  const float den_c8 = 7.6477612311e-05f;
  const float den_c10 = 1.3433452750e-06f;

  // Work in elements rather than bytes from here on.
  size_t remaining = batch >> XNN_LOG2_SIZEOF_FLOAT;
  do {
    // Number of elements processed this iteration.
    const size_t vl = __riscv_vsetvl_e32m1(remaining);

    vfloat32m1_t vin = __riscv_vle32_v_f32m1(input, vl);

    // xc = max(min(x, clamp_hi), clamp_lo)
    vfloat32m1_t vclamped = __riscv_vfmin_vf_f32m1(vin, clamp_hi, vl);
    vclamped = __riscv_vfmax_vf_f32m1(vclamped, clamp_lo, vl);

    // Both polynomials are evaluated with Horner's scheme in xc^2.
    vfloat32m1_t vsq = __riscv_vfmul_vv_f32m1(vclamped, vclamped, vl);

    // p(xc) = xc * (c1 + xc^2 * (c3 + xc^2 * (... (c9 + xc^2 * c11))))
    vfloat32m1_t vnum = __riscv_vfmv_v_f_f32m1(num_c9, vl);
    vnum = __riscv_vfmacc_vf_f32m1(vnum, num_c11, vsq, vl);
    vnum = __riscv_vfmadd_vv_f32m1(vnum, vsq, __riscv_vfmv_v_f_f32m1(num_c7, vl), vl);
    vnum = __riscv_vfmadd_vv_f32m1(vnum, vsq, __riscv_vfmv_v_f_f32m1(num_c5, vl), vl);
    vnum = __riscv_vfmadd_vv_f32m1(vnum, vsq, __riscv_vfmv_v_f_f32m1(num_c3, vl), vl);
    vnum = __riscv_vfmadd_vv_f32m1(vnum, vsq, __riscv_vfmv_v_f_f32m1(num_c1, vl), vl);
    vnum = __riscv_vfmul_vv_f32m1(vnum, vclamped, vl);

    // q(xc) = 1 + xc^2 * (c2 + xc^2 * (... (c8 + xc^2 * c10)))
    vfloat32m1_t vden = __riscv_vfmv_v_f_f32m1(den_c8, vl);
    vden = __riscv_vfmacc_vf_f32m1(vden, den_c10, vsq, vl);
    vden = __riscv_vfmadd_vv_f32m1(vden, vsq, __riscv_vfmv_v_f_f32m1(den_c6, vl), vl);
    vden = __riscv_vfmadd_vv_f32m1(vden, vsq, __riscv_vfmv_v_f_f32m1(den_c4, vl), vl);
    vden = __riscv_vfmadd_vv_f32m1(vden, vsq, __riscv_vfmv_v_f_f32m1(den_c2, vl), vl);
    vden = __riscv_vfmadd_vv_f32m1(vden, vsq, __riscv_vfmv_v_f_f32m1(1.0f, vl), vl);

    // This is the `div` variant: a true vector division is used for p / q.
    vfloat32m1_t vratio = __riscv_vfdiv_vv_f32m1(vnum, vden, vl);

    // y = ((ratio + 1) * 0.5) * x, using the unclamped input for the final
    // multiplication.
    vfloat32m1_t vone_plus = __riscv_vfadd_vf_f32m1(vratio, 1.0f, vl);
    vfloat32m1_t vhalf = __riscv_vfmul_vf_f32m1(vone_plus, 0.5f, vl);
    vfloat32m1_t vout = __riscv_vfmul_vv_f32m1(vhalf, vin, vl);

    __riscv_vse32_v_f32m1(output, vout, vl);

    // Advance both streams past the elements just handled.
    input += vl;
    output += vl;
    remaining -= vl;
  } while (remaining != 0);
}
Loading
Loading