From 53fc15889d0ac7a7f626640fbe797131d0744123 Mon Sep 17 00:00:00 2001 From: futz12 <1391525377@qq.com> Date: Tue, 17 Mar 2026 22:14:16 +0800 Subject: [PATCH 1/6] support loongarch elu erf gelu selu --- src/layer/loongarch/elu_loongarch.cpp | 60 ++++++++++++++++ src/layer/loongarch/elu_loongarch.h | 21 ++++++ src/layer/loongarch/erf_loongarch.cpp | 56 +++++++++++++++ src/layer/loongarch/erf_loongarch.h | 21 ++++++ src/layer/loongarch/gelu_loongarch.cpp | 98 ++++++++++++++++++++++++++ src/layer/loongarch/gelu_loongarch.h | 21 ++++++ src/layer/loongarch/lsx_mathfun.h | 48 +++++++++++++ src/layer/loongarch/selu_loongarch.cpp | 73 +++++++++++++++++++ src/layer/loongarch/selu_loongarch.h | 21 ++++++ 9 files changed, 419 insertions(+) create mode 100644 src/layer/loongarch/elu_loongarch.cpp create mode 100644 src/layer/loongarch/elu_loongarch.h create mode 100644 src/layer/loongarch/erf_loongarch.cpp create mode 100644 src/layer/loongarch/erf_loongarch.h create mode 100644 src/layer/loongarch/gelu_loongarch.cpp create mode 100644 src/layer/loongarch/gelu_loongarch.h create mode 100644 src/layer/loongarch/selu_loongarch.cpp create mode 100644 src/layer/loongarch/selu_loongarch.h diff --git a/src/layer/loongarch/elu_loongarch.cpp b/src/layer/loongarch/elu_loongarch.cpp new file mode 100644 index 000000000000..9fae41006def --- /dev/null +++ b/src/layer/loongarch/elu_loongarch.cpp @@ -0,0 +1,60 @@ +// Copyright 2024 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#include "elu_loongarch.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#include "loongarch_activation.h" +#endif // __loongarch_sx + +namespace ncnn { + +ELU_loongarch::ELU_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int ELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(alpha); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = elu_ps(_p, _alpha); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < 0.f) + *ptr = alpha * (expf(*ptr) - 1.f); + + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/elu_loongarch.h b/src/layer/loongarch/elu_loongarch.h new file mode 100644 index 000000000000..9e698099501a --- /dev/null +++ b/src/layer/loongarch/elu_loongarch.h @@ -0,0 +1,21 @@ +// Copyright 2024 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef LAYER_ELU_LOONGARCH_H +#define LAYER_ELU_LOONGARCH_H + +#include "elu.h" + +namespace ncnn { + +class ELU_loongarch : public ELU +{ +public: + ELU_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_ELU_LOONGARCH_H diff --git a/src/layer/loongarch/erf_loongarch.cpp b/src/layer/loongarch/erf_loongarch.cpp new file mode 100644 index 000000000000..0a07f2b7f1b2 --- /dev/null +++ b/src/layer/loongarch/erf_loongarch.cpp @@ -0,0 +1,56 @@ +// Copyright 2026 Futz12 +// SPDX-License-Identifier: BSD-3-Clause + +#include "erf_loongarch.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +namespace ncnn { + +Erf_loongarch::Erf_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Erf_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = erf_ps(_p); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = erff(*ptr); + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/erf_loongarch.h b/src/layer/loongarch/erf_loongarch.h new file mode 100644 index 000000000000..fb46a6375764 --- /dev/null +++ b/src/layer/loongarch/erf_loongarch.h @@ -0,0 +1,21 @@ +// Copyright 2026 Futz12 +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef LAYER_ERF_LOONGARCH_H +#define LAYER_ERF_LOONGARCH_H + +#include "erf.h" + +namespace ncnn { + +class Erf_loongarch : public Erf +{ +public: + Erf_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_ERF_LOONGARCH_H diff --git a/src/layer/loongarch/gelu_loongarch.cpp b/src/layer/loongarch/gelu_loongarch.cpp new file mode 100644 index 000000000000..1be8345099ae --- /dev/null +++ b/src/layer/loongarch/gelu_loongarch.cpp @@ -0,0 +1,98 @@ +// Copyright 2024 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#include "gelu_loongarch.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +namespace ncnn { + +GELU_loongarch::GELU_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int GELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + if (fast_gelu) + { + __m128 _half = (__m128)__lsx_vreplfr2vr_s(0.5f); + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + __m128 _fast1c = (__m128)__lsx_vreplfr2vr_s(0.79788452f); + __m128 _fast2c = (__m128)__lsx_vreplfr2vr_s(0.044715f); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + + __m128 _cube = __lsx_vfmul_s(_p, _p); + _cube = __lsx_vfmul_s(_p, _cube); + __m128 _blob = __lsx_vfmul_s(_fast2c, _cube); + _blob = __lsx_vfadd_s(_p, _blob); + _blob = __lsx_vfmul_s(_fast1c, _blob); + _blob = tanh_ps(_blob); + _blob = __lsx_vfadd_s(_one, _blob); + _blob = __lsx_vfmul_s(_half, __lsx_vfmul_s(_blob, _p)); + __lsx_vst(_blob, ptr, 0); + + ptr += 4; + } + } + else + { + __m128 _half = (__m128)__lsx_vreplfr2vr_s(0.5f); + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + __m128 _inv_sqrt2 = (__m128)__lsx_vreplfr2vr_s(0.70710678f); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + + __m128 _blob = __lsx_vfmul_s(_inv_sqrt2, _p); + _blob = erf_ps(_blob); + _blob = __lsx_vfadd_s(_one, _blob); + _blob = __lsx_vfmul_s(_half, __lsx_vfmul_s(_blob, _p)); + __lsx_vst(_blob, ptr, 0); + + ptr += 4; + } + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (fast_gelu) + { + *ptr = 0.5f * *ptr * (1.0f + tanhf(0.79788452f * (*ptr + 0.044715f * *ptr * *ptr * *ptr))); + } + else + { + *ptr = 0.5f * *ptr * (1.0f + erff(0.70710678f * *ptr)); + } + + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/gelu_loongarch.h b/src/layer/loongarch/gelu_loongarch.h new file mode 100644 index 000000000000..46fb873f4ad6 --- /dev/null +++ b/src/layer/loongarch/gelu_loongarch.h @@ -0,0 +1,21 @@ +// Copyright 2024 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef LAYER_GELU_LOONGARCH_H +#define LAYER_GELU_LOONGARCH_H + +#include "gelu.h" + +namespace ncnn { + +class GELU_loongarch : public GELU +{ +public: + GELU_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_GELU_LOONGARCH_H diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h index aeb7f188ce4b..e8fdfc44958c 100644 --- a/src/layer/loongarch/lsx_mathfun.h +++ b/src/layer/loongarch/lsx_mathfun.h @@ -626,6 +626,54 @@ static inline __m128 atan2_ps(__m128 y, __m128 x) return final_result; } +_LOONGARCH_FLOAT_CONST(c_erf_threshold, 0.927734375f); +_LOONGARCH_FLOAT_CONST(c_erf_hi_c0, -1.72853470e-5f); +_LOONGARCH_FLOAT_CONST(c_erf_hi_c1, 3.83197126e-4f); +_LOONGARCH_FLOAT_CONST(c_erf_hi_c2, -3.88396438e-3f); +_LOONGARCH_FLOAT_CONST(c_erf_hi_c3, 2.42546219e-2f); +_LOONGARCH_FLOAT_CONST(c_erf_hi_c4, -1.06777877e-1f); +_LOONGARCH_FLOAT_CONST(c_erf_hi_c5, -6.34846687e-1f); +_LOONGARCH_FLOAT_CONST(c_erf_hi_c6, -1.28717512e-1f); +_LOONGARCH_FLOAT_CONST(c_erf_lo_c0, -5.96761703e-4f); +_LOONGARCH_FLOAT_CONST(c_erf_lo_c1, 4.99119423e-3f); +_LOONGARCH_FLOAT_CONST(c_erf_lo_c2, -2.67681349e-2f); +_LOONGARCH_FLOAT_CONST(c_erf_lo_c3, 1.12819925e-1f); +_LOONGARCH_FLOAT_CONST(c_erf_lo_c4, -3.76125336e-1f); +_LOONGARCH_FLOAT_CONST(c_erf_lo_c5, 1.28379166e-1f); + +static inline __m128 erf_ps(__m128 a) +{ + __m128 t = (__m128)__lsx_vbitclri_w((__m128i)a, 31); + __m128 s = __lsx_vfmul_s(a, a); + + __m128i hi_mask = __lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(c_erf_threshold.i), t); + + __m128 r_hi = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erf_hi_c0.i), (__m128)__lsx_vreplgr2vr_w(c_erf_hi_c1.i)); + __m128 u = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erf_hi_c2.i), (__m128)__lsx_vreplgr2vr_w(c_erf_hi_c3.i)); + r_hi = __lsx_vfmadd_s(r_hi, s, u); + r_hi = __lsx_vfmadd_s(r_hi, t, (__m128)__lsx_vreplgr2vr_w(c_erf_hi_c4.i)); + r_hi = __lsx_vfmadd_s(r_hi, t, (__m128)__lsx_vreplgr2vr_w(c_erf_hi_c5.i)); + r_hi = __lsx_vfmadd_s(r_hi, t, (__m128)__lsx_vreplgr2vr_w(c_erf_hi_c6.i)); + r_hi = __lsx_vfmadd_s(r_hi, t, __lsx_vfsub_s((__m128)__lsx_vreplgr2vr_w(c_0.i), t)); + r_hi = __lsx_vfsub_s((__m128)__lsx_vreplgr2vr_w(c_1.i), exp_ps(r_hi)); + + __m128 r_lo = (__m128)__lsx_vreplgr2vr_w(c_erf_lo_c0.i); + r_lo = __lsx_vfmadd_s(r_lo, s, (__m128)__lsx_vreplgr2vr_w(c_erf_lo_c1.i)); + r_lo = __lsx_vfmadd_s(r_lo, s, (__m128)__lsx_vreplgr2vr_w(c_erf_lo_c2.i)); + r_lo = __lsx_vfmadd_s(r_lo, s, (__m128)__lsx_vreplgr2vr_w(c_erf_lo_c3.i)); + r_lo = __lsx_vfmadd_s(r_lo, s, (__m128)__lsx_vreplgr2vr_w(c_erf_lo_c4.i)); + r_lo = __lsx_vfmadd_s(r_lo, s, (__m128)__lsx_vreplgr2vr_w(c_erf_lo_c5.i)); + r_lo = __lsx_vfmadd_s(r_lo, a, a); + + __m128 r = (__m128)__lsx_vbitsel_v((__m128i)r_lo, (__m128i)r_hi, hi_mask); + + __m128i sign_mask = __lsx_vreplgr2vr_w(0x80000000); + __m128i sign_a = __lsx_vand_v((__m128i)a, sign_mask); + r = (__m128)__lsx_vor_v((__m128i)r, sign_a); + + return r; +} + static inline __m128 fmod_ps(__m128 a, __m128 b) { // fmod(a,b) = a - trunc(a/b)*b (trunc toward 0) diff --git a/src/layer/loongarch/selu_loongarch.cpp b/src/layer/loongarch/selu_loongarch.cpp new file mode 100644 index 000000000000..7354fcdd6e2b --- /dev/null +++ b/src/layer/loongarch/selu_loongarch.cpp @@ -0,0 +1,73 @@ +// Copyright 2024 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#include "selu_loongarch.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +namespace ncnn { + +SELU_loongarch::SELU_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int SELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + float alphaxlambda = alpha * lambda; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + __m128 _alphaxlambda = (__m128)__lsx_vreplfr2vr_s(alphaxlambda); + __m128 _lambda = (__m128)__lsx_vreplfr2vr_s(lambda); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + + __m128 _nps = exp_ps(_p); + _nps = __lsx_vfsub_s(_nps, _one); + _nps = __lsx_vfmul_s(_nps, _alphaxlambda); + + _p = __lsx_vfmul_s(_p, _lambda); + + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_nps, (__m128i)_lemask); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < 0.f) + *ptr = (expf(*ptr) - 1.f) * alphaxlambda; + else + *ptr *= lambda; + + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/selu_loongarch.h b/src/layer/loongarch/selu_loongarch.h new file mode 100644 index 000000000000..119f2cdccafa --- /dev/null +++ b/src/layer/loongarch/selu_loongarch.h @@ -0,0 +1,21 @@ +// Copyright 2024 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef LAYER_SELU_LOONGARCH_H +#define LAYER_SELU_LOONGARCH_H + +#include "selu.h" + +namespace ncnn { + +class SELU_loongarch : public SELU +{ +public: + SELU_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_SELU_LOONGARCH_H From 28c568dcdfd5f1d46f8986f21c115af4bc7a8b36 Mon Sep 17 00:00:00 2001 From: futz12 <1391525377@qq.com> Date: Wed, 18 Mar 2026 12:23:41 +0800 Subject: [PATCH 2/6] add elu --- src/layer/loongarch/elu_loongarch.cpp | 21 ++++++++++++++++++--- src/layer/loongarch/lasx_mathfun.h | 10 ++++++++++ src/layer/loongarch/lsx_mathfun.h | 10 ++++++++++ 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/src/layer/loongarch/elu_loongarch.cpp b/src/layer/loongarch/elu_loongarch.cpp index 9fae41006def..674f10202899 100644 --- a/src/layer/loongarch/elu_loongarch.cpp +++ b/src/layer/loongarch/elu_loongarch.cpp @@ -6,7 +6,10 @@ #if __loongarch_sx #include #include "lsx_mathfun.h" -#include "loongarch_activation.h" +#if __loongarch_asx +#include +#include "lasx_mathfun.h" +#endif // __loongarch_asx #endif // __loongarch_sx namespace ncnn { @@ -34,12 +37,24 @@ int ELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int i = 0; #if __loongarch_sx - __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(alpha); +#if __loongarch_asx + __m256 _alpha_lasx = (__m256)__lasx_xvreplfr2vr_s(alpha); + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(ptr + 32); + __m256 _p = (__m256)__lasx_xvld(ptr, 0); + _p = elu_ps(_p, _alpha_lasx); + __lasx_xvst(_p, ptr, 0); + + ptr += 8; + } +#endif // __loongarch_asx + __m128 _alpha_lsx = (__m128)__lsx_vreplfr2vr_s(alpha); for (; i + 3 < size; i += 4) { __builtin_prefetch(ptr + 16); __m128 _p = (__m128)__lsx_vld(ptr, 0); - _p = elu_ps(_p, _alpha); + _p = elu_ps(_p, _alpha_lsx); __lsx_vst(_p, ptr, 0); ptr += 4; diff --git a/src/layer/loongarch/lasx_mathfun.h b/src/layer/loongarch/lasx_mathfun.h index 28a9e6fd96a9..56b0a69cf767 100644 --- a/src/layer/loongarch/lasx_mathfun.h +++ b/src/layer/loongarch/lasx_mathfun.h @@ -601,4 +601,14 @@ static inline __m256 sigmoid256_ps(__m256 _v) return __lasx_xvfdiv_s(_one, _v); } +static inline __m256 elu_ps(__m256 _v, __m256 _alpha) +{ + __m256 _zero = (__m256)__lasx_xvreplgr2vr_w(0); + __m256 _one = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_1.i); + __m256 _pos = __lasx_xvfmax_s(_v, _zero); + __m256 _neg = __lasx_xvfmin_s(_v, _zero); + _neg = __lasx_xvfsub_s(exp256_ps(_neg), _one); + return __lasx_xvfadd_s(_pos, __lasx_xvfmul_s(_alpha, _neg)); +} + #endif // LASX_MATHFUN_H diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h index e8fdfc44958c..7e0b58314107 100644 --- a/src/layer/loongarch/lsx_mathfun.h +++ b/src/layer/loongarch/lsx_mathfun.h @@ -735,4 +735,14 @@ static inline __m128 remainder_ps(__m128 a, __m128 b) return __lsx_vfsub_s(a, __lsx_vfmul_s(rq, b)); } +static inline __m128 elu_ps(__m128 _v, __m128 _alpha) +{ + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _one = (__m128)__lsx_vreplgr2vr_w(c_1.i); + __m128 _pos = __lsx_vfmax_s(_v, _zero); + __m128 _neg = __lsx_vfmin_s(_v, _zero); + _neg = __lsx_vfsub_s(exp_ps(_neg), _one); + return __lsx_vfadd_s(_pos, __lsx_vfmul_s(_alpha, _neg)); +} + #endif // LSX_MATHFUN_H From b166ecdc0929d88ea5745a4ac21004878950a910 Mon Sep 17 00:00:00 2001 From: futz12 <1391525377@qq.com> Date: Thu, 19 Mar 2026 20:18:27 +0800 Subject: [PATCH 3/6] 1. use erfc instead of erf + 1 2. avoid exp overflow --- src/layer/loongarch/gelu_loongarch.cpp | 8 +- src/layer/loongarch/lsx_mathfun.h | 170 +++++++++++++++++++++++++ src/layer/loongarch/selu_loongarch.cpp | 16 +-- 3 files changed, 181 insertions(+), 13 deletions(-) diff --git a/src/layer/loongarch/gelu_loongarch.cpp b/src/layer/loongarch/gelu_loongarch.cpp index 1be8345099ae..11bca3cebdc2 100644 --- a/src/layer/loongarch/gelu_loongarch.cpp +++ b/src/layer/loongarch/gelu_loongarch.cpp @@ -60,16 +60,14 @@ int GELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) con else { __m128 _half = (__m128)__lsx_vreplfr2vr_s(0.5f); - __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); - __m128 _inv_sqrt2 = (__m128)__lsx_vreplfr2vr_s(0.70710678f); + __m128 _inv_sqrt2 = (__m128)__lsx_vreplfr2vr_s(-0.70710678f); for (; i + 3 < size; i += 4) { __builtin_prefetch(ptr + 16); __m128 _p = (__m128)__lsx_vld(ptr, 0); __m128 _blob = __lsx_vfmul_s(_inv_sqrt2, _p); - _blob = erf_ps(_blob); - _blob = __lsx_vfadd_s(_one, _blob); + _blob = erfc_ps(_blob); _blob = __lsx_vfmul_s(_half, __lsx_vfmul_s(_blob, _p)); __lsx_vst(_blob, ptr, 0); @@ -85,7 +83,7 @@ int GELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) con } else { - *ptr = 0.5f * *ptr * (1.0f + erff(0.70710678f * *ptr)); + *ptr = 0.5f * *ptr * erfcf(-0.70710678f * *ptr); } ptr++; diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h index 7e0b58314107..1ffdc703f7a8 100644 --- a/src/layer/loongarch/lsx_mathfun.h +++ b/src/layer/loongarch/lsx_mathfun.h @@ -745,4 +745,174 @@ static inline __m128 elu_ps(__m128 _v, __m128 _alpha) return __lsx_vfadd_s(_pos, __lsx_vfmul_s(_alpha, _neg)); } +_LOONGARCH_FLOAT_CONST(c_erfc_erx, 8.4506291151e-1f); +_LOONGARCH_FLOAT_CONST(c_erfc_pp0, 1.2837916613e-1f); +_LOONGARCH_FLOAT_CONST(c_erfc_pp1, -3.2504209876e-1f); +_LOONGARCH_FLOAT_CONST(c_erfc_pp2, -2.8481749818e-2f); +_LOONGARCH_FLOAT_CONST(c_erfc_pp3, -5.7702702470e-3f); +_LOONGARCH_FLOAT_CONST(c_erfc_pp4, -2.3763017452e-05f); +_LOONGARCH_FLOAT_CONST(c_erfc_qq1, 3.9791721106e-1f); +_LOONGARCH_FLOAT_CONST(c_erfc_qq2, 6.5022252500e-2f); +_LOONGARCH_FLOAT_CONST(c_erfc_qq3, 5.0813062117e-3f); +_LOONGARCH_FLOAT_CONST(c_erfc_qq4, 1.3249473704e-4f); +_LOONGARCH_FLOAT_CONST(c_erfc_qq5, -3.9602282413e-6f); +_LOONGARCH_FLOAT_CONST(c_erfc_pa0, -2.3621185683e-3f); +_LOONGARCH_FLOAT_CONST(c_erfc_pa1, 4.1485610604e-1f); +_LOONGARCH_FLOAT_CONST(c_erfc_pa2, -3.7220788002e-1f); +_LOONGARCH_FLOAT_CONST(c_erfc_pa3, 3.1834661961e-1f); +_LOONGARCH_FLOAT_CONST(c_erfc_pa4, -1.1089469492e-1f); +_LOONGARCH_FLOAT_CONST(c_erfc_pa5, 3.5478305072e-2f); +_LOONGARCH_FLOAT_CONST(c_erfc_pa6, -2.1663755178e-3f); +_LOONGARCH_FLOAT_CONST(c_erfc_qa1, 1.0642088205e-1f); +_LOONGARCH_FLOAT_CONST(c_erfc_qa2, 5.4039794207e-1f); +_LOONGARCH_FLOAT_CONST(c_erfc_qa3, 7.1828655899e-2f); +_LOONGARCH_FLOAT_CONST(c_erfc_qa4, 1.2617121637e-1f); +_LOONGARCH_FLOAT_CONST(c_erfc_qa5, 1.3637083583e-2f); +_LOONGARCH_FLOAT_CONST(c_erfc_qa6, 1.1984500103e-2f); +_LOONGARCH_FLOAT_CONST(c_erfc_ra0, -9.8649440333e-3f); +_LOONGARCH_FLOAT_CONST(c_erfc_ra1, -6.9385856390e-1f); +_LOONGARCH_FLOAT_CONST(c_erfc_ra2, -1.0558626175e+1f); +_LOONGARCH_FLOAT_CONST(c_erfc_ra3, -6.2375331879e+1f); +_LOONGARCH_FLOAT_CONST(c_erfc_ra4, -1.6239666748e+2f); +_LOONGARCH_FLOAT_CONST(c_erfc_ra5, -1.8460508728e+2f); +_LOONGARCH_FLOAT_CONST(c_erfc_ra6, -8.1287437439e+1f); +_LOONGARCH_FLOAT_CONST(c_erfc_ra7, -9.8143291473e+00f); +_LOONGARCH_FLOAT_CONST(c_erfc_sa1, 1.9651271820e+1f); +_LOONGARCH_FLOAT_CONST(c_erfc_sa2, 1.3765776062e+2f); +_LOONGARCH_FLOAT_CONST(c_erfc_sa3, 4.3456588745e+2f); +_LOONGARCH_FLOAT_CONST(c_erfc_sa4, 6.4538726807e+2f); +_LOONGARCH_FLOAT_CONST(c_erfc_sa5, 4.2900814819e+2f); +_LOONGARCH_FLOAT_CONST(c_erfc_sa6, 1.0863500214e+2f); +_LOONGARCH_FLOAT_CONST(c_erfc_sa7, 6.5702495575e+00f); +_LOONGARCH_FLOAT_CONST(c_erfc_sa8, -6.0424413532e-02f); +_LOONGARCH_FLOAT_CONST(c_erfc_rb0, -9.8649431020e-3f); +_LOONGARCH_FLOAT_CONST(c_erfc_rb1, -7.9928326607e-1f); +_LOONGARCH_FLOAT_CONST(c_erfc_rb2, -1.7757955551e+1f); +_LOONGARCH_FLOAT_CONST(c_erfc_rb3, -1.6063638306e+2f); +_LOONGARCH_FLOAT_CONST(c_erfc_rb4, -6.3756646729e+2f); +_LOONGARCH_FLOAT_CONST(c_erfc_rb5, -1.0250950928e+3f); +_LOONGARCH_FLOAT_CONST(c_erfc_rb6, -4.8351919556e+2f); +_LOONGARCH_FLOAT_CONST(c_erfc_sb1, 3.0338060379e+1f); +_LOONGARCH_FLOAT_CONST(c_erfc_sb2, 3.2579251099e+2f); +_LOONGARCH_FLOAT_CONST(c_erfc_sb3, 1.5367296143e+3f); +_LOONGARCH_FLOAT_CONST(c_erfc_sb4, 3.1998581543e+3f); +_LOONGARCH_FLOAT_CONST(c_erfc_sb5, 2.5530502930e+3f); +_LOONGARCH_FLOAT_CONST(c_erfc_sb6, 4.7452853394e+2f); +_LOONGARCH_FLOAT_CONST(c_erfc_sb7, -2.2440952301e+1f); + +static inline __m128 erfc_ps(__m128 x) +{ + __m128 one = (__m128)__lsx_vreplgr2vr_w(c_1.i); + __m128 two = (__m128)__lsx_vreplgr2vr_w(c_2.i); + __m128 zero = (__m128)__lsx_vreplgr2vr_w(c_0.i); + + __m128 absx = (__m128)__lsx_vbitclri_w((__m128i)x, 31); + __m128 x2 = __lsx_vfmul_s(x, x); + __m128 t = __lsx_vfdiv_s(x2, one); + __m128 tt = __lsx_vfsub_s(absx, one); + + __m128i mask_ge_1_25 = __lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(0x3fa00000), absx); + t = (__m128)__lsx_vbitsel_v((__m128i)tt, (__m128i)t, mask_ge_1_25); + + __m128i mask_ge_0_84375 = __lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(0x3f580000), absx); + t = (__m128)__lsx_vbitsel_v((__m128i)x2, (__m128i)t, mask_ge_0_84375); + + __m128 u = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erfc_rb6.i), (__m128)__lsx_vreplgr2vr_w(c_erfc_rb5.i)); + u = __lsx_vfmadd_s(t, u, (__m128)__lsx_vreplgr2vr_w(c_erfc_rb4.i)); + u = __lsx_vfmadd_s(t, u, (__m128)__lsx_vreplgr2vr_w(c_erfc_rb3.i)); + u = __lsx_vfmadd_s(t, u, (__m128)__lsx_vreplgr2vr_w(c_erfc_rb2.i)); + u = __lsx_vfmadd_s(t, u, (__m128)__lsx_vreplgr2vr_w(c_erfc_rb1.i)); + u = __lsx_vfmadd_s(t, u, (__m128)__lsx_vreplgr2vr_w(c_erfc_rb0.i)); + + __m128 v = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erfc_sb7.i), (__m128)__lsx_vreplgr2vr_w(c_erfc_sb6.i)); + v = __lsx_vfmadd_s(t, v, (__m128)__lsx_vreplgr2vr_w(c_erfc_sb5.i)); + v = __lsx_vfmadd_s(t, v, (__m128)__lsx_vreplgr2vr_w(c_erfc_sb4.i)); + v = __lsx_vfmadd_s(t, v, (__m128)__lsx_vreplgr2vr_w(c_erfc_sb3.i)); + v = __lsx_vfmadd_s(t, v, (__m128)__lsx_vreplgr2vr_w(c_erfc_sb2.i)); + v = __lsx_vfmadd_s(t, v, (__m128)__lsx_vreplgr2vr_w(c_erfc_sb1.i)); + + __m128 tu = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erfc_ra7.i), (__m128)__lsx_vreplgr2vr_w(c_erfc_ra6.i)); + tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_ra5.i)); + tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_ra4.i)); + tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_ra3.i)); + tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_ra2.i)); + tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_ra1.i)); + tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_ra0.i)); + + __m128 tv = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erfc_sa8.i), (__m128)__lsx_vreplgr2vr_w(c_erfc_sa7.i)); + tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_sa6.i)); + tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_sa5.i)); + tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_sa4.i)); + tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_sa3.i)); + tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_sa2.i)); + tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_sa1.i)); + + __m128i mask_ge_2_857143 = __lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(0x4036db6d), absx); + u = (__m128)__lsx_vbitsel_v((__m128i)tu, (__m128i)u, mask_ge_2_857143); + v = (__m128)__lsx_vbitsel_v((__m128i)tv, (__m128i)v, mask_ge_2_857143); + + tu = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erfc_pa6.i), (__m128)__lsx_vreplgr2vr_w(c_erfc_pa5.i)); + tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_pa4.i)); + tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_pa3.i)); + tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_pa2.i)); + tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_pa1.i)); + tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_pa0.i)); + + tv = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erfc_qa6.i), (__m128)__lsx_vreplgr2vr_w(c_erfc_qa5.i)); + tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_qa4.i)); + tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_qa3.i)); + tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_qa2.i)); + tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_qa1.i)); + + u = (__m128)__lsx_vbitsel_v((__m128i)tu, (__m128i)u, mask_ge_1_25); + v = (__m128)__lsx_vbitsel_v((__m128i)tv, (__m128i)v, mask_ge_1_25); + + tu = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erfc_pp4.i), (__m128)__lsx_vreplgr2vr_w(c_erfc_pp3.i)); + tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_pp2.i)); + tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_pp1.i)); + tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_pp0.i)); + + tv = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erfc_qq5.i), (__m128)__lsx_vreplgr2vr_w(c_erfc_qq4.i)); + tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_qq3.i)); + tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_qq2.i)); + tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_qq1.i)); + + u = (__m128)__lsx_vbitsel_v((__m128i)tu, (__m128i)u, mask_ge_0_84375); + v = (__m128)__lsx_vbitsel_v((__m128i)tv, (__m128i)v, mask_ge_0_84375); + + v = __lsx_vfmadd_s(t, v, one); + + __m128 q = __lsx_vfdiv_s(u, v); + __m128 ret = zero; + + __m128i z_i = __lsx_vand_v((__m128i)absx, __lsx_vreplgr2vr_w(0xfffff000)); + __m128 z = (__m128)z_i; + + __m128 r = exp_ps(__lsx_vfmadd_s(__lsx_vfneg_s(z), z, (__m128)__lsx_vreplgr2vr_w(0xbf100000))); + __m128 tmp = __lsx_vfmadd_s(__lsx_vfsub_s(z, absx), __lsx_vfadd_s(z, absx), q); + r = __lsx_vfmul_s(r, exp_ps(tmp)); + r = __lsx_vfdiv_s(r, absx); + t = __lsx_vfsub_s(two, r); + __m128i mask_x_ge_0 = __lsx_vfcmp_clt_s(zero, x); + r = (__m128)__lsx_vbitsel_v((__m128i)t, (__m128i)r, mask_x_ge_0); + __m128i mask_absx_ge_28 = __lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(0x41e00000), absx); + ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_absx_ge_28); + + r = __lsx_vfsub_s(q, (__m128)__lsx_vreplgr2vr_w(0x3f58560b)); + t = __lsx_vfadd_s(q, (__m128)__lsx_vreplgr2vr_w(0x3f98560b)); + r = (__m128)__lsx_vbitsel_v((__m128i)t, (__m128i)r, mask_x_ge_0); + ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_ge_1_25); + + r = __lsx_vfsub_s(__lsx_vfmadd_s(x, q, __lsx_vfsub_s(x, (__m128)__lsx_vreplgr2vr_w(c_0p5.i))), (__m128)__lsx_vreplgr2vr_w(c_0p5.i)); + ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_ge_0_84375); + + __m128i mask_x_lt_m6 = __lsx_vfcmp_clt_s(x, (__m128)__lsx_vreplgr2vr_w(0xc0c00000)); + ret = (__m128)__lsx_vbitsel_v((__m128i)ret, two, mask_x_lt_m6); + + __m128i mask_nan = __lsx_vfcmp_cne_s(x, x); + ret = (__m128)__lsx_vbitsel_v((__m128i)ret, (__m128i)x, mask_nan); + + return ret; +} + #endif // LSX_MATHFUN_H diff --git a/src/layer/loongarch/selu_loongarch.cpp b/src/layer/loongarch/selu_loongarch.cpp index 7354fcdd6e2b..b240cddfbf4c 100644 --- a/src/layer/loongarch/selu_loongarch.cpp +++ b/src/layer/loongarch/selu_loongarch.cpp @@ -36,22 +36,22 @@ int SELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) con #if __loongarch_sx __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); - __m128 _alphaxlambda = (__m128)__lsx_vreplfr2vr_s(alphaxlambda); + __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(alpha); __m128 _lambda = (__m128)__lsx_vreplfr2vr_s(lambda); for (; i + 3 < size; i += 4) { __builtin_prefetch(ptr + 16); __m128 _p = (__m128)__lsx_vld(ptr, 0); - __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); - __m128 _nps = exp_ps(_p); - _nps = __lsx_vfsub_s(_nps, _one); - _nps = __lsx_vfmul_s(_nps, _alphaxlambda); + __m128 _pos = __lsx_vfmax_s(_p, _zero); + __m128 _neg = __lsx_vfmin_s(_p, _zero); - _p = __lsx_vfmul_s(_p, _lambda); + __m128 _blob = exp_ps(_neg); + _blob = __lsx_vfsub_s(_blob, _one); + _blob = __lsx_vfmul_s(_alpha, _blob); + _blob = __lsx_vfmul_s(_lambda, __lsx_vfadd_s(_pos, _blob)); - _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_nps, (__m128i)_lemask); - __lsx_vst(_p, ptr, 0); + __lsx_vst(_blob, ptr, 0); ptr += 4; } From e3a2e8d90b755e1d1ba48cc2ae0d08336ec24569 Mon Sep 17 00:00:00 2001 From: futz12 <1391525377@qq.com> Date: Fri, 20 Mar 2026 11:07:06 +0800 Subject: [PATCH 4/6] fix erfc --- src/layer/loongarch/lsx_mathfun.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h index 1ffdc703f7a8..b396b39dca48 100644 --- a/src/layer/loongarch/lsx_mathfun.h +++ b/src/layer/loongarch/lsx_mathfun.h @@ -888,7 +888,8 @@ static inline __m128 erfc_ps(__m128 x) __m128i z_i = __lsx_vand_v((__m128i)absx, __lsx_vreplgr2vr_w(0xfffff000)); __m128 z = (__m128)z_i; - __m128 r = exp_ps(__lsx_vfmadd_s(__lsx_vfneg_s(z), z, (__m128)__lsx_vreplgr2vr_w(0xbf100000))); + __m128 neg_z = (__m128)__lsx_vxor_v((__m128i)z, __lsx_vreplgr2vr_w(0x80000000)); + __m128 r = exp_ps(__lsx_vfmadd_s(neg_z, z, (__m128)__lsx_vreplgr2vr_w(0xbf100000))); __m128 tmp = __lsx_vfmadd_s(__lsx_vfsub_s(z, absx), __lsx_vfadd_s(z, absx), q); r = __lsx_vfmul_s(r, exp_ps(tmp)); r = __lsx_vfdiv_s(r, absx); @@ -907,7 +908,7 @@ static inline __m128 erfc_ps(__m128 x) ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_ge_0_84375); __m128i mask_x_lt_m6 = __lsx_vfcmp_clt_s(x, (__m128)__lsx_vreplgr2vr_w(0xc0c00000)); - ret = (__m128)__lsx_vbitsel_v((__m128i)ret, two, mask_x_lt_m6); + ret = (__m128)__lsx_vbitsel_v((__m128i)ret, (__m128i)two, mask_x_lt_m6); __m128i mask_nan = __lsx_vfcmp_cne_s(x, x); ret = (__m128)__lsx_vbitsel_v((__m128i)ret, (__m128i)x, mask_nan); From f6d1889d7376b5087096997ee0c163d7f4c214ca Mon Sep 17 00:00:00 2001 From: futz12 <1391525377@qq.com> Date: Sat, 21 Mar 2026 10:41:59 +0800 Subject: [PATCH 5/6] fix erfc --- src/layer/loongarch/lsx_mathfun.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h index b396b39dca48..0c3075097c18 100644 --- a/src/layer/loongarch/lsx_mathfun.h +++ b/src/layer/loongarch/lsx_mathfun.h @@ -899,8 +899,8 @@ static inline __m128 erfc_ps(__m128 x) __m128i mask_absx_ge_28 = __lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(0x41e00000), absx); ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_absx_ge_28); - r = __lsx_vfsub_s(q, (__m128)__lsx_vreplgr2vr_w(0x3f58560b)); - t = __lsx_vfadd_s(q, (__m128)__lsx_vreplgr2vr_w(0x3f98560b)); + r = __lsx_vfsub_s(__lsx_vfsub_s(one, (__m128)__lsx_vreplgr2vr_w(c_erfc_erx.i)), q); + t = __lsx_vfadd_s(q, __lsx_vfadd_s(one, (__m128)__lsx_vreplgr2vr_w(c_erfc_erx.i))); r = (__m128)__lsx_vbitsel_v((__m128i)t, (__m128i)r, mask_x_ge_0); ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_ge_1_25); From 7d9918632a023aba6113af769921a12abf1e458b Mon Sep 17 00:00:00 2001 From: futz12 <1391525377@qq.com> Date: Tue, 14 Apr 2026 00:43:26 +0800 Subject: [PATCH 6/6] flip the eqn sign --- src/layer/loongarch/lsx_mathfun.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h index 0c3075097c18..e8e491d35708 100644 --- a/src/layer/loongarch/lsx_mathfun.h +++ b/src/layer/loongarch/lsx_mathfun.h @@ -894,7 +894,7 @@ static inline __m128 erfc_ps(__m128 x) r = __lsx_vfmul_s(r, exp_ps(tmp)); r = __lsx_vfdiv_s(r, absx); t = __lsx_vfsub_s(two, r); - __m128i mask_x_ge_0 = __lsx_vfcmp_clt_s(zero, x); + __m128i mask_x_ge_0 = __lsx_vfcmp_cle_s(zero, x); r = (__m128)__lsx_vbitsel_v((__m128i)t, (__m128i)r, mask_x_ge_0); __m128i mask_absx_ge_28 = __lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(0x41e00000), absx); ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_absx_ge_28); @@ -904,7 +904,7 @@ static inline __m128 erfc_ps(__m128 x) r = (__m128)__lsx_vbitsel_v((__m128i)t, (__m128i)r, mask_x_ge_0); ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_ge_1_25); - r = __lsx_vfsub_s(__lsx_vfmadd_s(x, q, __lsx_vfsub_s(x, (__m128)__lsx_vreplgr2vr_w(c_0p5.i))), (__m128)__lsx_vreplgr2vr_w(c_0p5.i)); + r = __lsx_vfsub_s((__m128)__lsx_vreplgr2vr_w(c_0p5.i), __lsx_vfmadd_s(x, q, __lsx_vfsub_s(x, (__m128)__lsx_vreplgr2vr_w(c_0p5.i)))); ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_ge_0_84375); __m128i mask_x_lt_m6 = __lsx_vfcmp_clt_s(x, (__m128)__lsx_vreplgr2vr_w(0xc0c00000));