rvv: add exp

ihb2032 · ihb2032 · commit d67b723239e4 · 2026-04-05T09:45:57.000+08:00
Signed-off-by: ihb2032 &lt;hebome@foxmail.com&gt;
diff --git a/src/layer/riscv/exp_riscv.cpp b/src/layer/riscv/exp_riscv.cpp
@@ -0,0 +1,112 @@
+// Copyright 2026 ihb2032 <hebome@foxmail.com>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "exp_riscv.h"
+
+#if __riscv_vector
+#include <riscv_vector.h>
+#include "rvv_mathfun.h"
+#endif // __riscv_vector
+
+#include "cpu.h"
+
+namespace ncnn {
+
+Exp_riscv::Exp_riscv()
+{
+#if __riscv_vector
+    support_packing = true;
+#endif // __riscv_vector
+#if NCNN_ZFH
+#if __riscv_vector
+    support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
+#endif
+}
+
+int Exp_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+#if NCNN_ZFH
+    int elembits = bottom_top_blob.elembits();
+
+    if (opt.use_fp16_storage && elembits == 16)
+    {
+        if (opt.use_fp16_arithmetic)
+            return forward_inplace_fp16sa(bottom_top_blob, opt);
+        else
+            return forward_inplace_fp16s(bottom_top_blob, opt);
+    }
+#endif
+
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int d = bottom_top_blob.d;
+    int channels = bottom_top_blob.c;
+    int elempack = bottom_top_blob.elempack;
+    int size = w * h * d * elempack;
+
+    if (base == -1.f)
+    {
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            float* ptr = bottom_top_blob.channel(q);
+
+#if __riscv_vector
+            int n = size;
+            while (n > 0)
+            {
+                size_t vl = __riscv_vsetvl_e32m8(n);
+                vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
+                _p = __riscv_vfmul_vf_f32m8(_p, scale, vl);
+                _p = __riscv_vfadd_vf_f32m8(_p, shift, vl);
+                _p = exp_ps(_p, vl);
+                __riscv_vse32_v_f32m8(ptr, _p, vl);
+
+                ptr += vl;
+                n -= vl;
+            }
+#else  // __riscv_vector
+            for (int i = 0; i < size; i++)
+            {
+                ptr[i] = expf(shift + ptr[i] * scale);
+            }
+#endif // __riscv_vector
+        }
+
+        return 0;
+    }
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+
+#if __riscv_vector
+        int n = size;
+        while (n > 0)
+        {
+            size_t vl = __riscv_vsetvl_e32m8(n);
+            vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
+            _p = __riscv_vfmul_vf_f32m8(_p, scale, vl);
+            _p = __riscv_vfadd_vf_f32m8(_p, shift, vl);
+            _p = pow_ps(__riscv_vfmv_v_f_f32m8(base, vl), _p, vl);
+            __riscv_vse32_v_f32m8(ptr, _p, vl);
+
+            ptr += vl;
+            n -= vl;
+        }
+#else  // __riscv_vector
+        for (int i = 0; i < size; i++)
+        {
+            ptr[i] = powf(base, shift + ptr[i] * scale);
+        }
+#endif // __riscv_vector
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/riscv/exp_riscv.h b/src/layer/riscv/exp_riscv.h
@@ -0,0 +1,27 @@
+// Copyright 2026 ihb2032 <hebome@foxmail.com>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef LAYER_EXP_RISCV_H
+#define LAYER_EXP_RISCV_H
+
+#include "exp.h"
+
+namespace ncnn {
+
+class Exp_riscv : public Exp
+{
+public:
+    Exp_riscv();
+
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+
+protected:
+#if NCNN_ZFH
+    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
+    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
+#endif
+};
+
+} // namespace ncnn
+
+#endif // LAYER_EXP_RISCV_H
diff --git a/src/layer/riscv/exp_riscv_zfh.cpp b/src/layer/riscv/exp_riscv_zfh.cpp
@@ -0,0 +1,162 @@
+// Copyright 2026 ihb2032 <hebome@foxmail.com>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "exp_riscv.h"
+
+#if __riscv_vector
+#include <riscv_vector.h>
+#include "rvv_mathfun.h"
+#if __riscv_zvfh
+#include "rvv_mathfun_fp16s.h"
+#endif
+#endif // __riscv_vector
+
+namespace ncnn {
+
+#if NCNN_ZFH
+int Exp_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int d = bottom_top_blob.d;
+    int channels = bottom_top_blob.c;
+    int elempack = bottom_top_blob.elempack;
+    int size = w * h * d * elempack;
+
+    if (base == -1.f)
+    {
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            __fp16* ptr = bottom_top_blob.channel(q);
+
+#if __riscv_zvfh
+            int n = size;
+            while (n > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m4(n);
+                vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl);
+                _p = __riscv_vfmul_vf_f32m8(_p, scale, vl);
+                _p = __riscv_vfadd_vf_f32m8(_p, shift, vl);
+                _p = exp_ps(_p, vl);
+                __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl);
+
+                ptr += vl;
+                n -= vl;
+            }
+#else  // __riscv_zvfh
+            for (int i = 0; i < size; i++)
+            {
+                ptr[i] = (__fp16)expf(shift + (float)ptr[i] * scale);
+            }
+#endif // __riscv_zvfh
+        }
+
+        return 0;
+    }
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        __fp16* ptr = bottom_top_blob.channel(q);
+
+#if __riscv_zvfh
+        int n = size;
+        while (n > 0)
+        {
+            size_t vl = __riscv_vsetvl_e16m4(n);
+            vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl);
+            _p = __riscv_vfmul_vf_f32m8(_p, scale, vl);
+            _p = __riscv_vfadd_vf_f32m8(_p, shift, vl);
+            _p = pow_ps(__riscv_vfmv_v_f_f32m8(base, vl), _p, vl);
+            __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl);
+
+            ptr += vl;
+            n -= vl;
+        }
+#else  // __riscv_zvfh
+        for (int i = 0; i < size; i++)
+        {
+            ptr[i] = (__fp16)powf(base, shift + (float)ptr[i] * scale);
+        }
+#endif // __riscv_zvfh
+    }
+
+    return 0;
+}
+
+int Exp_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int d = bottom_top_blob.d;
+    int channels = bottom_top_blob.c;
+    int elempack = bottom_top_blob.elempack;
+    int size = w * h * d * elempack;
+    __fp16 _scale = (__fp16)scale;
+    __fp16 _shift = (__fp16)shift;
+
+    if (base == -1.f)
+    {
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            __fp16* ptr = bottom_top_blob.channel(q);
+
+#if __riscv_zvfh
+            int n = size;
+            while (n > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m8(n);
+                vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
+                _p = __riscv_vfmul_vf_f16m8(_p, _scale, vl);
+                _p = __riscv_vfadd_vf_f16m8(_p, _shift, vl);
+                _p = exp_ps(_p, vl);
+                __riscv_vse16_v_f16m8(ptr, _p, vl);
+
+                ptr += vl;
+                n -= vl;
+            }
+#else  // __riscv_zvfh
+            for (int i = 0; i < size; i++)
+            {
+                ptr[i] = (__fp16)expf(shift + (float)ptr[i] * scale);
+            }
+#endif // __riscv_zvfh
+        }
+
+        return 0;
+    }
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        __fp16* ptr = bottom_top_blob.channel(q);
+
+#if __riscv_zvfh
+        int n = size;
+        while (n > 0)
+        {
+            size_t vl = __riscv_vsetvl_e16m8(n);
+            vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
+            _p = __riscv_vfmul_vf_f16m8(_p, _scale, vl);
+            _p = __riscv_vfadd_vf_f16m8(_p, _shift, vl);
+            _p = pow_ps(__riscv_vfmv_v_f_f16m8((__fp16)base, vl), _p, vl);
+            __riscv_vse16_v_f16m8(ptr, _p, vl);
+
+            ptr += vl;
+            n -= vl;
+        }
+#else  // __riscv_zvfh
+        for (int i = 0; i < size; i++)
+        {
+            ptr[i] = (__fp16)powf(base, shift + (float)ptr[i] * scale);
+        }
+#endif // __riscv_zvfh
+    }
+
+    return 0;
+}
+#endif // NCNN_ZFH
+
+} // namespace ncnn