diff --git a/src/layer/riscv/exp_riscv.cpp b/src/layer/riscv/exp_riscv.cpp new file mode 100644 index 000000000000..364a975f3e69 --- /dev/null +++ b/src/layer/riscv/exp_riscv.cpp @@ -0,0 +1,112 @@ +// Copyright 2026 ihb2032 +// SPDX-License-Identifier: BSD-3-Clause + +#include "exp_riscv.h" + +#if __riscv_vector +#include +#include "rvv_mathfun.h" +#endif // __riscv_vector + +#include "cpu.h" + +namespace ncnn { + +Exp_riscv::Exp_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif +} + +int Exp_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ +#if NCNN_ZFH + int elembits = bottom_top_blob.elembits(); + + if (opt.use_fp16_storage && elembits == 16) + { + if (opt.use_fp16_arithmetic) + return forward_inplace_fp16sa(bottom_top_blob, opt); + else + return forward_inplace_fp16s(bottom_top_blob, opt); + } +#endif + + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + if (base == -1.f) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + +#if __riscv_vector + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + _p = __riscv_vfmul_vf_f32m8(_p, scale, vl); + _p = __riscv_vfadd_vf_f32m8(_p, shift, vl); + _p = exp_ps(_p, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_vector + for (int i = 0; i < size; i++) + { + ptr[i] = expf(shift + ptr[i] * scale); + } +#endif // __riscv_vector + } + + return 0; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + +#if __riscv_vector + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + _p = __riscv_vfmul_vf_f32m8(_p, scale, vl); + _p = __riscv_vfadd_vf_f32m8(_p, shift, vl); + _p = pow_ps(__riscv_vfmv_v_f_f32m8(base, vl), _p, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_vector + for (int i = 0; i < size; i++) + { + ptr[i] = powf(base, shift + ptr[i] * scale); + } +#endif // __riscv_vector + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/exp_riscv.h b/src/layer/riscv/exp_riscv.h new file mode 100644 index 000000000000..ffed8a26922d --- /dev/null +++ b/src/layer/riscv/exp_riscv.h @@ -0,0 +1,27 @@ +// Copyright 2026 ihb2032 +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef LAYER_EXP_RISCV_H +#define LAYER_EXP_RISCV_H + +#include "exp.h" + +namespace ncnn { + +class Exp_riscv : public Exp +{ +public: + Exp_riscv(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +protected: +#if NCNN_ZFH + int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; + int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; +#endif +}; + +} // namespace ncnn + +#endif // LAYER_EXP_RISCV_H diff --git a/src/layer/riscv/exp_riscv_zfh.cpp b/src/layer/riscv/exp_riscv_zfh.cpp new file mode 100644 index 000000000000..f1771e1a6141 --- /dev/null +++ b/src/layer/riscv/exp_riscv_zfh.cpp @@ -0,0 +1,162 @@ +// Copyright 2026 ihb2032 +// SPDX-License-Identifier: BSD-3-Clause + +#include "exp_riscv.h" + +#if __riscv_vector +#include +#include "rvv_mathfun.h" +#if __riscv_zvfh +#include "rvv_mathfun_fp16s.h" +#endif +#endif // __riscv_vector + +namespace ncnn { + +#if NCNN_ZFH +int Exp_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + if (base == -1.f) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); + _p = __riscv_vfmul_vf_f32m8(_p, scale, vl); + _p = __riscv_vfadd_vf_f32m8(_p, shift, vl); + _p = exp_ps(_p, vl); + __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + ptr[i] = (__fp16)expf(shift + (float)ptr[i] * scale); + } +#endif // __riscv_zvfh + } + + return 0; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); + _p = __riscv_vfmul_vf_f32m8(_p, scale, vl); + _p = __riscv_vfadd_vf_f32m8(_p, shift, vl); + _p = pow_ps(__riscv_vfmv_v_f_f32m8(base, vl), _p, vl); + __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + ptr[i] = (__fp16)powf(base, shift + (float)ptr[i] * scale); + } +#endif // __riscv_zvfh + } + + return 0; +} + +int Exp_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + __fp16 _scale = (__fp16)scale; + __fp16 _shift = (__fp16)shift; + + if (base == -1.f) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + _p = __riscv_vfmul_vf_f16m8(_p, _scale, vl); + _p = __riscv_vfadd_vf_f16m8(_p, _shift, vl); + _p = exp_ps(_p, vl); + __riscv_vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + ptr[i] = (__fp16)expf(shift + (float)ptr[i] * scale); + } +#endif // __riscv_zvfh + } + + return 0; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + _p = __riscv_vfmul_vf_f16m8(_p, _scale, vl); + _p = __riscv_vfadd_vf_f16m8(_p, _shift, vl); + _p = pow_ps(__riscv_vfmv_v_f_f16m8((__fp16)base, vl), _p, vl); + __riscv_vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + ptr[i] = (__fp16)powf(base, shift + (float)ptr[i] * scale); + } +#endif // __riscv_zvfh + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 730cf919a192..1b7515a1ac77 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -105,6 +105,7 @@ ncnn_add_layer_test(Eltwise) ncnn_add_layer_test(ELU) ncnn_add_layer_test(Embed) ncnn_add_layer_test(Erf) +ncnn_add_layer_test(Exp) ncnn_add_layer_test(ExpandDims) ncnn_add_layer_test(Flatten) ncnn_add_layer_test(Flip) diff --git a/tests/test_exp.cpp b/tests/test_exp.cpp new file mode 100644 index 000000000000..f9c86bbfdbf1 --- /dev/null +++ b/tests/test_exp.cpp @@ -0,0 +1,56 @@ +// Copyright 2026 ihb2032 +// SPDX-License-Identifier: BSD-3-Clause + +#include "testutil.h" + +static int test_exp(const ncnn::Mat& a, float base, float scale, float shift) +{ + ncnn::ParamDict pd; + pd.set(0, base); + pd.set(1, scale); + pd.set(2, shift); + + std::vector weights(0); + + int ret = test_layer("Exp", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_exp failed a.dims=%d a=(%d %d %d %d) base=%f scale=%f shift=%f\n", a.dims, a.w, a.h, a.d, a.c, base, scale, shift); + } + + return ret; +} + +static int test_exp_0() +{ + return 0 + || test_exp(RandomMat(5, 7, 24, -1.f, 1.f), -1.f, 1.f, 0.f) + || test_exp(RandomMat(7, 9, 12, -1.f, 1.f), -1.f, 0.75f, -0.25f) + || test_exp(RandomMat(3, 5, 13, -1.f, 1.f), 2.f, 0.5f, 0.125f); +} + +static int test_exp_1() +{ + return 0 + || test_exp(RandomMat(15, 24, -1.f, 1.f), -1.f, 1.f, 0.f) + || test_exp(RandomMat(17, 12, -1.f, 1.f), -1.f, 1.25f, 0.5f) + || test_exp(RandomMat(19, 15, -1.f, 1.f), 2.f, 0.75f, -0.5f); +} + +static int test_exp_2() +{ + return 0 + || test_exp(RandomMat(128, -1.f, 1.f), -1.f, 1.f, 0.f) + || test_exp(RandomMat(124, -1.f, 1.f), -1.f, 0.5f, 0.25f) + || test_exp(RandomMat(127, -1.f, 1.f), 2.f, 1.5f, -0.75f); +} + +int main() +{ + SRAND(7767517); + + return 0 + || test_exp_0() + || test_exp_1() + || test_exp_2(); +}