From 53fc15889d0ac7a7f626640fbe797131d0744123 Mon Sep 17 00:00:00 2001
From: futz12 <1391525377@qq.com>
Date: Tue, 17 Mar 2026 22:14:16 +0800
Subject: [PATCH 1/6] support loongarch elu erf gelu selu

---
 src/layer/loongarch/elu_loongarch.cpp  | 60 ++++++++++++++++
 src/layer/loongarch/elu_loongarch.h    | 21 ++++++
 src/layer/loongarch/erf_loongarch.cpp  | 56 +++++++++++++++
 src/layer/loongarch/erf_loongarch.h    | 21 ++++++
 src/layer/loongarch/gelu_loongarch.cpp | 98 ++++++++++++++++++++++++++
 src/layer/loongarch/gelu_loongarch.h   | 21 ++++++
 src/layer/loongarch/lsx_mathfun.h      | 48 +++++++++++++
 src/layer/loongarch/selu_loongarch.cpp | 73 +++++++++++++++++++
 src/layer/loongarch/selu_loongarch.h   | 21 ++++++
 9 files changed, 419 insertions(+)
 create mode 100644 src/layer/loongarch/elu_loongarch.cpp
 create mode 100644 src/layer/loongarch/elu_loongarch.h
 create mode 100644 src/layer/loongarch/erf_loongarch.cpp
 create mode 100644 src/layer/loongarch/erf_loongarch.h
 create mode 100644 src/layer/loongarch/gelu_loongarch.cpp
 create mode 100644 src/layer/loongarch/gelu_loongarch.h
 create mode 100644 src/layer/loongarch/selu_loongarch.cpp
 create mode 100644 src/layer/loongarch/selu_loongarch.h

diff --git a/src/layer/loongarch/elu_loongarch.cpp b/src/layer/loongarch/elu_loongarch.cpp
new file mode 100644
index 000000000000..9fae41006def
--- /dev/null
+++ b/src/layer/loongarch/elu_loongarch.cpp
@@ -0,0 +1,60 @@
+// Copyright 2024 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "elu_loongarch.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#include "loongarch_activation.h"
+#endif // __loongarch_sx
+
+namespace ncnn {
+
+ELU_loongarch::ELU_loongarch()
+{
+#if __loongarch_sx
+    support_packing = true;
+#endif // __loongarch_sx
+}
+
+int ELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int d = bottom_top_blob.d;
+    int channels = bottom_top_blob.c;
+    int elempack = bottom_top_blob.elempack;
+    int size = w * h * d * elempack;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+
+        int i = 0;
+#if __loongarch_sx
+        __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(alpha);
+        for (; i + 3 < size; i += 4)
+        {
+            __builtin_prefetch(ptr + 16);
+            __m128 _p = (__m128)__lsx_vld(ptr, 0);
+            _p = elu_ps(_p, _alpha);
+            __lsx_vst(_p, ptr, 0);
+
+            ptr += 4;
+        }
+#endif // __loongarch_sx
+        for (; i < size; i++)
+        {
+            if (*ptr < 0.f)
+                *ptr = alpha * (expf(*ptr) - 1.f);
+
+            ptr++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/elu_loongarch.h b/src/layer/loongarch/elu_loongarch.h
new file mode 100644
index 000000000000..9e698099501a
--- /dev/null
+++ b/src/layer/loongarch/elu_loongarch.h
@@ -0,0 +1,21 @@
+// Copyright 2024 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef LAYER_ELU_LOONGARCH_H
+#define LAYER_ELU_LOONGARCH_H
+
+#include "elu.h"
+
+namespace ncnn {
+
+class ELU_loongarch : public ELU
+{
+public:
+    ELU_loongarch();
+
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_ELU_LOONGARCH_H
diff --git a/src/layer/loongarch/erf_loongarch.cpp b/src/layer/loongarch/erf_loongarch.cpp
new file mode 100644
index 000000000000..0a07f2b7f1b2
--- /dev/null
+++ b/src/layer/loongarch/erf_loongarch.cpp
@@ -0,0 +1,56 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "erf_loongarch.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#endif // __loongarch_sx
+
+namespace ncnn {
+
+Erf_loongarch::Erf_loongarch()
+{
+#if __loongarch_sx
+    support_packing = true;
+#endif
+}
+
+int Erf_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int d = bottom_top_blob.d;
+    int channels = bottom_top_blob.c;
+    int elempack = bottom_top_blob.elempack;
+    int size = w * h * d * elempack;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+
+        int i = 0;
+#if __loongarch_sx
+        for (; i + 3 < size; i += 4)
+        {
+            __builtin_prefetch(ptr + 16);
+            __m128 _p = (__m128)__lsx_vld(ptr, 0);
+            _p = erf_ps(_p);
+            __lsx_vst(_p, ptr, 0);
+
+            ptr += 4;
+        }
+#endif // __loongarch_sx
+        for (; i < size; i++)
+        {
+            *ptr = erff(*ptr);
+            ptr++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/erf_loongarch.h b/src/layer/loongarch/erf_loongarch.h
new file mode 100644
index 000000000000..fb46a6375764
--- /dev/null
+++ b/src/layer/loongarch/erf_loongarch.h
@@ -0,0 +1,21 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef LAYER_ERF_LOONGARCH_H
+#define LAYER_ERF_LOONGARCH_H
+
+#include "erf.h"
+
+namespace ncnn {
+
+class Erf_loongarch : public Erf
+{
+public:
+    Erf_loongarch();
+
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_ERF_LOONGARCH_H
diff --git a/src/layer/loongarch/gelu_loongarch.cpp b/src/layer/loongarch/gelu_loongarch.cpp
new file mode 100644
index 000000000000..1be8345099ae
--- /dev/null
+++ b/src/layer/loongarch/gelu_loongarch.cpp
@@ -0,0 +1,98 @@
+// Copyright 2024 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "gelu_loongarch.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#endif // __loongarch_sx
+
+namespace ncnn {
+
+GELU_loongarch::GELU_loongarch()
+{
+#if __loongarch_sx
+    support_packing = true;
+#endif // __loongarch_sx
+}
+
+int GELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int d = bottom_top_blob.d;
+    int channels = bottom_top_blob.c;
+    int elempack = bottom_top_blob.elempack;
+    int size = w * h * d * elempack;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+
+        int i = 0;
+#if __loongarch_sx
+        if (fast_gelu)
+        {
+            __m128 _half = (__m128)__lsx_vreplfr2vr_s(0.5f);
+            __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
+            __m128 _fast1c = (__m128)__lsx_vreplfr2vr_s(0.79788452f);
+            __m128 _fast2c = (__m128)__lsx_vreplfr2vr_s(0.044715f);
+            for (; i + 3 < size; i += 4)
+            {
+                __builtin_prefetch(ptr + 16);
+                __m128 _p = (__m128)__lsx_vld(ptr, 0);
+
+                __m128 _cube = __lsx_vfmul_s(_p, _p);
+                _cube = __lsx_vfmul_s(_p, _cube);
+                __m128 _blob = __lsx_vfmul_s(_fast2c, _cube);
+                _blob = __lsx_vfadd_s(_p, _blob);
+                _blob = __lsx_vfmul_s(_fast1c, _blob);
+                _blob = tanh_ps(_blob);
+                _blob = __lsx_vfadd_s(_one, _blob);
+                _blob = __lsx_vfmul_s(_half, __lsx_vfmul_s(_blob, _p));
+                __lsx_vst(_blob, ptr, 0);
+
+                ptr += 4;
+            }
+        }
+        else
+        {
+            __m128 _half = (__m128)__lsx_vreplfr2vr_s(0.5f);
+            __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
+            __m128 _inv_sqrt2 = (__m128)__lsx_vreplfr2vr_s(0.70710678f);
+            for (; i + 3 < size; i += 4)
+            {
+                __builtin_prefetch(ptr + 16);
+                __m128 _p = (__m128)__lsx_vld(ptr, 0);
+
+                __m128 _blob = __lsx_vfmul_s(_inv_sqrt2, _p);
+                _blob = erf_ps(_blob);
+                _blob = __lsx_vfadd_s(_one, _blob);
+                _blob = __lsx_vfmul_s(_half, __lsx_vfmul_s(_blob, _p));
+                __lsx_vst(_blob, ptr, 0);
+
+                ptr += 4;
+            }
+        }
+#endif // __loongarch_sx
+        for (; i < size; i++)
+        {
+            if (fast_gelu)
+            {
+                *ptr = 0.5f * *ptr * (1.0f + tanhf(0.79788452f * (*ptr + 0.044715f * *ptr * *ptr * *ptr)));
+            }
+            else
+            {
+                *ptr = 0.5f * *ptr * (1.0f + erff(0.70710678f * *ptr));
+            }
+
+            ptr++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/gelu_loongarch.h b/src/layer/loongarch/gelu_loongarch.h
new file mode 100644
index 000000000000..46fb873f4ad6
--- /dev/null
+++ b/src/layer/loongarch/gelu_loongarch.h
@@ -0,0 +1,21 @@
+// Copyright 2024 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef LAYER_GELU_LOONGARCH_H
+#define LAYER_GELU_LOONGARCH_H
+
+#include "gelu.h"
+
+namespace ncnn {
+
+class GELU_loongarch : public GELU
+{
+public:
+    GELU_loongarch();
+
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_GELU_LOONGARCH_H
diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h
index aeb7f188ce4b..e8fdfc44958c 100644
--- a/src/layer/loongarch/lsx_mathfun.h
+++ b/src/layer/loongarch/lsx_mathfun.h
@@ -626,6 +626,54 @@ static inline __m128 atan2_ps(__m128 y, __m128 x)
     return final_result;
 }
 
+_LOONGARCH_FLOAT_CONST(c_erf_threshold, 0.927734375f);
+_LOONGARCH_FLOAT_CONST(c_erf_hi_c0, -1.72853470e-5f);
+_LOONGARCH_FLOAT_CONST(c_erf_hi_c1, 3.83197126e-4f);
+_LOONGARCH_FLOAT_CONST(c_erf_hi_c2, -3.88396438e-3f);
+_LOONGARCH_FLOAT_CONST(c_erf_hi_c3, 2.42546219e-2f);
+_LOONGARCH_FLOAT_CONST(c_erf_hi_c4, -1.06777877e-1f);
+_LOONGARCH_FLOAT_CONST(c_erf_hi_c5, -6.34846687e-1f);
+_LOONGARCH_FLOAT_CONST(c_erf_hi_c6, -1.28717512e-1f);
+_LOONGARCH_FLOAT_CONST(c_erf_lo_c0, -5.96761703e-4f);
+_LOONGARCH_FLOAT_CONST(c_erf_lo_c1, 4.99119423e-3f);
+_LOONGARCH_FLOAT_CONST(c_erf_lo_c2, -2.67681349e-2f);
+_LOONGARCH_FLOAT_CONST(c_erf_lo_c3, 1.12819925e-1f);
+_LOONGARCH_FLOAT_CONST(c_erf_lo_c4, -3.76125336e-1f);
+_LOONGARCH_FLOAT_CONST(c_erf_lo_c5, 1.28379166e-1f);
+
+static inline __m128 erf_ps(__m128 a)
+{
+    __m128 t = (__m128)__lsx_vbitclri_w((__m128i)a, 31);
+    __m128 s = __lsx_vfmul_s(a, a);
+
+    __m128i hi_mask = __lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(c_erf_threshold.i), t);
+
+    __m128 r_hi = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erf_hi_c0.i), (__m128)__lsx_vreplgr2vr_w(c_erf_hi_c1.i));
+    __m128 u = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erf_hi_c2.i), (__m128)__lsx_vreplgr2vr_w(c_erf_hi_c3.i));
+    r_hi = __lsx_vfmadd_s(r_hi, s, u);
+    r_hi = __lsx_vfmadd_s(r_hi, t, (__m128)__lsx_vreplgr2vr_w(c_erf_hi_c4.i));
+    r_hi = __lsx_vfmadd_s(r_hi, t, (__m128)__lsx_vreplgr2vr_w(c_erf_hi_c5.i));
+    r_hi = __lsx_vfmadd_s(r_hi, t, (__m128)__lsx_vreplgr2vr_w(c_erf_hi_c6.i));
+    r_hi = __lsx_vfmadd_s(r_hi, t, __lsx_vfsub_s((__m128)__lsx_vreplgr2vr_w(c_0.i), t));
+    r_hi = __lsx_vfsub_s((__m128)__lsx_vreplgr2vr_w(c_1.i), exp_ps(r_hi));
+
+    __m128 r_lo = (__m128)__lsx_vreplgr2vr_w(c_erf_lo_c0.i);
+    r_lo = __lsx_vfmadd_s(r_lo, s, (__m128)__lsx_vreplgr2vr_w(c_erf_lo_c1.i));
+    r_lo = __lsx_vfmadd_s(r_lo, s, (__m128)__lsx_vreplgr2vr_w(c_erf_lo_c2.i));
+    r_lo = __lsx_vfmadd_s(r_lo, s, (__m128)__lsx_vreplgr2vr_w(c_erf_lo_c3.i));
+    r_lo = __lsx_vfmadd_s(r_lo, s, (__m128)__lsx_vreplgr2vr_w(c_erf_lo_c4.i));
+    r_lo = __lsx_vfmadd_s(r_lo, s, (__m128)__lsx_vreplgr2vr_w(c_erf_lo_c5.i));
+    r_lo = __lsx_vfmadd_s(r_lo, a, a);
+
+    __m128 r = (__m128)__lsx_vbitsel_v((__m128i)r_lo, (__m128i)r_hi, hi_mask);
+
+    __m128i sign_mask = __lsx_vreplgr2vr_w(0x80000000);
+    __m128i sign_a = __lsx_vand_v((__m128i)a, sign_mask);
+    r = (__m128)__lsx_vor_v((__m128i)r, sign_a);
+
+    return r;
+}
+
 static inline __m128 fmod_ps(__m128 a, __m128 b)
 {
     // fmod(a,b) = a - trunc(a/b)*b  (trunc toward 0)
diff --git a/src/layer/loongarch/selu_loongarch.cpp b/src/layer/loongarch/selu_loongarch.cpp
new file mode 100644
index 000000000000..7354fcdd6e2b
--- /dev/null
+++ b/src/layer/loongarch/selu_loongarch.cpp
@@ -0,0 +1,73 @@
+// Copyright 2024 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "selu_loongarch.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#endif // __loongarch_sx
+
+namespace ncnn {
+
+SELU_loongarch::SELU_loongarch()
+{
+#if __loongarch_sx
+    support_packing = true;
+#endif // __loongarch_sx
+}
+
+int SELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int d = bottom_top_blob.d;
+    int channels = bottom_top_blob.c;
+    int elempack = bottom_top_blob.elempack;
+    int size = w * h * d * elempack;
+    float alphaxlambda = alpha * lambda;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+
+        int i = 0;
+#if __loongarch_sx
+        __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
+        __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
+        __m128 _alphaxlambda = (__m128)__lsx_vreplfr2vr_s(alphaxlambda);
+        __m128 _lambda = (__m128)__lsx_vreplfr2vr_s(lambda);
+        for (; i + 3 < size; i += 4)
+        {
+            __builtin_prefetch(ptr + 16);
+            __m128 _p = (__m128)__lsx_vld(ptr, 0);
+            __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero);
+
+            __m128 _nps = exp_ps(_p);
+            _nps = __lsx_vfsub_s(_nps, _one);
+            _nps = __lsx_vfmul_s(_nps, _alphaxlambda);
+
+            _p = __lsx_vfmul_s(_p, _lambda);
+
+            _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_nps, (__m128i)_lemask);
+            __lsx_vst(_p, ptr, 0);
+
+            ptr += 4;
+        }
+#endif // __loongarch_sx
+        for (; i < size; i++)
+        {
+            if (*ptr < 0.f)
+                *ptr = (expf(*ptr) - 1.f) * alphaxlambda;
+            else
+                *ptr *= lambda;
+
+            ptr++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/selu_loongarch.h b/src/layer/loongarch/selu_loongarch.h
new file mode 100644
index 000000000000..119f2cdccafa
--- /dev/null
+++ b/src/layer/loongarch/selu_loongarch.h
@@ -0,0 +1,21 @@
+// Copyright 2024 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef LAYER_SELU_LOONGARCH_H
+#define LAYER_SELU_LOONGARCH_H
+
+#include "selu.h"
+
+namespace ncnn {
+
+class SELU_loongarch : public SELU
+{
+public:
+    SELU_loongarch();
+
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SELU_LOONGARCH_H

From 28c568dcdfd5f1d46f8986f21c115af4bc7a8b36 Mon Sep 17 00:00:00 2001
From: futz12 <1391525377@qq.com>
Date: Wed, 18 Mar 2026 12:23:41 +0800
Subject: [PATCH 2/6] add elu

---
 src/layer/loongarch/elu_loongarch.cpp | 21 ++++++++++++++++++---
 src/layer/loongarch/lasx_mathfun.h    | 10 ++++++++++
 src/layer/loongarch/lsx_mathfun.h     | 10 ++++++++++
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/src/layer/loongarch/elu_loongarch.cpp b/src/layer/loongarch/elu_loongarch.cpp
index 9fae41006def..674f10202899 100644
--- a/src/layer/loongarch/elu_loongarch.cpp
+++ b/src/layer/loongarch/elu_loongarch.cpp
@@ -6,7 +6,10 @@
 #if __loongarch_sx
 #include <lsxintrin.h>
 #include "lsx_mathfun.h"
-#include "loongarch_activation.h"
+#if __loongarch_asx
+#include <lasxintrin.h>
+#include "lasx_mathfun.h"
+#endif // __loongarch_asx
 #endif // __loongarch_sx
 
 namespace ncnn {
@@ -34,12 +37,24 @@ int ELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons
 
         int i = 0;
 #if __loongarch_sx
-        __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(alpha);
+#if __loongarch_asx
+        __m256 _alpha_lasx = (__m256)__lasx_xvreplfr2vr_s(alpha);
+        for (; i + 7 < size; i += 8)
+        {
+            __builtin_prefetch(ptr + 32);
+            __m256 _p = (__m256)__lasx_xvld(ptr, 0);
+            _p = elu_ps(_p, _alpha_lasx);
+            __lasx_xvst(_p, ptr, 0);
+
+            ptr += 8;
+        }
+#endif // __loongarch_asx
+        __m128 _alpha_lsx = (__m128)__lsx_vreplfr2vr_s(alpha);
         for (; i + 3 < size; i += 4)
         {
             __builtin_prefetch(ptr + 16);
             __m128 _p = (__m128)__lsx_vld(ptr, 0);
-            _p = elu_ps(_p, _alpha);
+            _p = elu_ps(_p, _alpha_lsx);
             __lsx_vst(_p, ptr, 0);
 
             ptr += 4;
diff --git a/src/layer/loongarch/lasx_mathfun.h b/src/layer/loongarch/lasx_mathfun.h
index 28a9e6fd96a9..56b0a69cf767 100644
--- a/src/layer/loongarch/lasx_mathfun.h
+++ b/src/layer/loongarch/lasx_mathfun.h
@@ -601,4 +601,14 @@ static inline __m256 sigmoid256_ps(__m256 _v)
     return __lasx_xvfdiv_s(_one, _v);
 }
 
+static inline __m256 elu_ps(__m256 _v, __m256 _alpha)
+{
+    __m256 _zero = (__m256)__lasx_xvreplgr2vr_w(0);
+    __m256 _one = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_1.i);
+    __m256 _pos = __lasx_xvfmax_s(_v, _zero);
+    __m256 _neg = __lasx_xvfmin_s(_v, _zero);
+    _neg = __lasx_xvfsub_s(exp256_ps(_neg), _one);
+    return __lasx_xvfadd_s(_pos, __lasx_xvfmul_s(_alpha, _neg));
+}
+
 #endif // LASX_MATHFUN_H
diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h
index e8fdfc44958c..7e0b58314107 100644
--- a/src/layer/loongarch/lsx_mathfun.h
+++ b/src/layer/loongarch/lsx_mathfun.h
@@ -735,4 +735,14 @@ static inline __m128 remainder_ps(__m128 a, __m128 b)
     return __lsx_vfsub_s(a, __lsx_vfmul_s(rq, b));
 }
 
+static inline __m128 elu_ps(__m128 _v, __m128 _alpha)
+{
+    __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
+    __m128 _one = (__m128)__lsx_vreplgr2vr_w(c_1.i);
+    __m128 _pos = __lsx_vfmax_s(_v, _zero);
+    __m128 _neg = __lsx_vfmin_s(_v, _zero);
+    _neg = __lsx_vfsub_s(exp_ps(_neg), _one);
+    return __lsx_vfadd_s(_pos, __lsx_vfmul_s(_alpha, _neg));
+}
+
 #endif // LSX_MATHFUN_H

From b166ecdc0929d88ea5745a4ac21004878950a910 Mon Sep 17 00:00:00 2001
From: futz12 <1391525377@qq.com>
Date: Thu, 19 Mar 2026 20:18:27 +0800
Subject: [PATCH 3/6] 1. use erfc instead of erf + 1 2. avoid exp overflow

---
 src/layer/loongarch/gelu_loongarch.cpp |   8 +-
 src/layer/loongarch/lsx_mathfun.h      | 170 +++++++++++++++++++++++++
 src/layer/loongarch/selu_loongarch.cpp |  16 +--
 3 files changed, 181 insertions(+), 13 deletions(-)

diff --git a/src/layer/loongarch/gelu_loongarch.cpp b/src/layer/loongarch/gelu_loongarch.cpp
index 1be8345099ae..11bca3cebdc2 100644
--- a/src/layer/loongarch/gelu_loongarch.cpp
+++ b/src/layer/loongarch/gelu_loongarch.cpp
@@ -60,16 +60,14 @@ int GELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) con
         else
         {
             __m128 _half = (__m128)__lsx_vreplfr2vr_s(0.5f);
-            __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
-            __m128 _inv_sqrt2 = (__m128)__lsx_vreplfr2vr_s(0.70710678f);
+            __m128 _inv_sqrt2 = (__m128)__lsx_vreplfr2vr_s(-0.70710678f);
             for (; i + 3 < size; i += 4)
             {
                 __builtin_prefetch(ptr + 16);
                 __m128 _p = (__m128)__lsx_vld(ptr, 0);
 
                 __m128 _blob = __lsx_vfmul_s(_inv_sqrt2, _p);
-                _blob = erf_ps(_blob);
-                _blob = __lsx_vfadd_s(_one, _blob);
+                _blob = erfc_ps(_blob);
                 _blob = __lsx_vfmul_s(_half, __lsx_vfmul_s(_blob, _p));
                 __lsx_vst(_blob, ptr, 0);
 
@@ -85,7 +83,7 @@ int GELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) con
             }
             else
             {
-                *ptr = 0.5f * *ptr * (1.0f + erff(0.70710678f * *ptr));
+                *ptr = 0.5f * *ptr * erfcf(-0.70710678f * *ptr);
             }
 
             ptr++;
diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h
index 7e0b58314107..1ffdc703f7a8 100644
--- a/src/layer/loongarch/lsx_mathfun.h
+++ b/src/layer/loongarch/lsx_mathfun.h
@@ -745,4 +745,174 @@ static inline __m128 elu_ps(__m128 _v, __m128 _alpha)
     return __lsx_vfadd_s(_pos, __lsx_vfmul_s(_alpha, _neg));
 }
 
+_LOONGARCH_FLOAT_CONST(c_erfc_erx, 8.4506291151e-1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_pp0, 1.2837916613e-1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_pp1, -3.2504209876e-1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_pp2, -2.8481749818e-2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_pp3, -5.7702702470e-3f);
+_LOONGARCH_FLOAT_CONST(c_erfc_pp4, -2.3763017452e-05f);
+_LOONGARCH_FLOAT_CONST(c_erfc_qq1, 3.9791721106e-1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_qq2, 6.5022252500e-2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_qq3, 5.0813062117e-3f);
+_LOONGARCH_FLOAT_CONST(c_erfc_qq4, 1.3249473704e-4f);
+_LOONGARCH_FLOAT_CONST(c_erfc_qq5, -3.9602282413e-6f);
+_LOONGARCH_FLOAT_CONST(c_erfc_pa0, -2.3621185683e-3f);
+_LOONGARCH_FLOAT_CONST(c_erfc_pa1, 4.1485610604e-1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_pa2, -3.7220788002e-1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_pa3, 3.1834661961e-1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_pa4, -1.1089469492e-1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_pa5, 3.5478305072e-2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_pa6, -2.1663755178e-3f);
+_LOONGARCH_FLOAT_CONST(c_erfc_qa1, 1.0642088205e-1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_qa2, 5.4039794207e-1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_qa3, 7.1828655899e-2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_qa4, 1.2617121637e-1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_qa5, 1.3637083583e-2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_qa6, 1.1984500103e-2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_ra0, -9.8649440333e-3f);
+_LOONGARCH_FLOAT_CONST(c_erfc_ra1, -6.9385856390e-1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_ra2, -1.0558626175e+1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_ra3, -6.2375331879e+1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_ra4, -1.6239666748e+2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_ra5, -1.8460508728e+2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_ra6, -8.1287437439e+1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_ra7, -9.8143291473e+00f);
+_LOONGARCH_FLOAT_CONST(c_erfc_sa1, 1.9651271820e+1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_sa2, 1.3765776062e+2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_sa3, 4.3456588745e+2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_sa4, 6.4538726807e+2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_sa5, 4.2900814819e+2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_sa6, 1.0863500214e+2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_sa7, 6.5702495575e+00f);
+_LOONGARCH_FLOAT_CONST(c_erfc_sa8, -6.0424413532e-02f);
+_LOONGARCH_FLOAT_CONST(c_erfc_rb0, -9.8649431020e-3f);
+_LOONGARCH_FLOAT_CONST(c_erfc_rb1, -7.9928326607e-1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_rb2, -1.7757955551e+1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_rb3, -1.6063638306e+2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_rb4, -6.3756646729e+2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_rb5, -1.0250950928e+3f);
+_LOONGARCH_FLOAT_CONST(c_erfc_rb6, -4.8351919556e+2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_sb1, 3.0338060379e+1f);
+_LOONGARCH_FLOAT_CONST(c_erfc_sb2, 3.2579251099e+2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_sb3, 1.5367296143e+3f);
+_LOONGARCH_FLOAT_CONST(c_erfc_sb4, 3.1998581543e+3f);
+_LOONGARCH_FLOAT_CONST(c_erfc_sb5, 2.5530502930e+3f);
+_LOONGARCH_FLOAT_CONST(c_erfc_sb6, 4.7452853394e+2f);
+_LOONGARCH_FLOAT_CONST(c_erfc_sb7, -2.2440952301e+1f);
+
+static inline __m128 erfc_ps(__m128 x)
+{
+    __m128 one = (__m128)__lsx_vreplgr2vr_w(c_1.i);
+    __m128 two = (__m128)__lsx_vreplgr2vr_w(c_2.i);
+    __m128 zero = (__m128)__lsx_vreplgr2vr_w(c_0.i);
+
+    __m128 absx = (__m128)__lsx_vbitclri_w((__m128i)x, 31);
+    __m128 x2 = __lsx_vfmul_s(x, x);
+    __m128 t = __lsx_vfdiv_s(x2, one);
+    __m128 tt = __lsx_vfsub_s(absx, one);
+
+    __m128i mask_ge_1_25 = __lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(0x3fa00000), absx);
+    t = (__m128)__lsx_vbitsel_v((__m128i)tt, (__m128i)t, mask_ge_1_25);
+
+    __m128i mask_ge_0_84375 = __lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(0x3f580000), absx);
+    t = (__m128)__lsx_vbitsel_v((__m128i)x2, (__m128i)t, mask_ge_0_84375);
+
+    __m128 u = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erfc_rb6.i), (__m128)__lsx_vreplgr2vr_w(c_erfc_rb5.i));
+    u = __lsx_vfmadd_s(t, u, (__m128)__lsx_vreplgr2vr_w(c_erfc_rb4.i));
+    u = __lsx_vfmadd_s(t, u, (__m128)__lsx_vreplgr2vr_w(c_erfc_rb3.i));
+    u = __lsx_vfmadd_s(t, u, (__m128)__lsx_vreplgr2vr_w(c_erfc_rb2.i));
+    u = __lsx_vfmadd_s(t, u, (__m128)__lsx_vreplgr2vr_w(c_erfc_rb1.i));
+    u = __lsx_vfmadd_s(t, u, (__m128)__lsx_vreplgr2vr_w(c_erfc_rb0.i));
+
+    __m128 v = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erfc_sb7.i), (__m128)__lsx_vreplgr2vr_w(c_erfc_sb6.i));
+    v = __lsx_vfmadd_s(t, v, (__m128)__lsx_vreplgr2vr_w(c_erfc_sb5.i));
+    v = __lsx_vfmadd_s(t, v, (__m128)__lsx_vreplgr2vr_w(c_erfc_sb4.i));
+    v = __lsx_vfmadd_s(t, v, (__m128)__lsx_vreplgr2vr_w(c_erfc_sb3.i));
+    v = __lsx_vfmadd_s(t, v, (__m128)__lsx_vreplgr2vr_w(c_erfc_sb2.i));
+    v = __lsx_vfmadd_s(t, v, (__m128)__lsx_vreplgr2vr_w(c_erfc_sb1.i));
+
+    __m128 tu = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erfc_ra7.i), (__m128)__lsx_vreplgr2vr_w(c_erfc_ra6.i));
+    tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_ra5.i));
+    tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_ra4.i));
+    tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_ra3.i));
+    tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_ra2.i));
+    tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_ra1.i));
+    tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_ra0.i));
+
+    __m128 tv = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erfc_sa8.i), (__m128)__lsx_vreplgr2vr_w(c_erfc_sa7.i));
+    tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_sa6.i));
+    tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_sa5.i));
+    tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_sa4.i));
+    tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_sa3.i));
+    tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_sa2.i));
+    tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_sa1.i));
+
+    __m128i mask_ge_2_857143 = __lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(0x4036db6d), absx);
+    u = (__m128)__lsx_vbitsel_v((__m128i)tu, (__m128i)u, mask_ge_2_857143);
+    v = (__m128)__lsx_vbitsel_v((__m128i)tv, (__m128i)v, mask_ge_2_857143);
+
+    tu = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erfc_pa6.i), (__m128)__lsx_vreplgr2vr_w(c_erfc_pa5.i));
+    tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_pa4.i));
+    tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_pa3.i));
+    tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_pa2.i));
+    tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_pa1.i));
+    tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_pa0.i));
+
+    tv = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erfc_qa6.i), (__m128)__lsx_vreplgr2vr_w(c_erfc_qa5.i));
+    tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_qa4.i));
+    tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_qa3.i));
+    tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_qa2.i));
+    tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_qa1.i));
+
+    u = (__m128)__lsx_vbitsel_v((__m128i)tu, (__m128i)u, mask_ge_1_25);
+    v = (__m128)__lsx_vbitsel_v((__m128i)tv, (__m128i)v, mask_ge_1_25);
+
+    tu = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erfc_pp4.i), (__m128)__lsx_vreplgr2vr_w(c_erfc_pp3.i));
+    tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_pp2.i));
+    tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_pp1.i));
+    tu = __lsx_vfmadd_s(t, tu, (__m128)__lsx_vreplgr2vr_w(c_erfc_pp0.i));
+
+    tv = __lsx_vfmadd_s(t, (__m128)__lsx_vreplgr2vr_w(c_erfc_qq5.i), (__m128)__lsx_vreplgr2vr_w(c_erfc_qq4.i));
+    tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_qq3.i));
+    tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_qq2.i));
+    tv = __lsx_vfmadd_s(t, tv, (__m128)__lsx_vreplgr2vr_w(c_erfc_qq1.i));
+
+    u = (__m128)__lsx_vbitsel_v((__m128i)tu, (__m128i)u, mask_ge_0_84375);
+    v = (__m128)__lsx_vbitsel_v((__m128i)tv, (__m128i)v, mask_ge_0_84375);
+
+    v = __lsx_vfmadd_s(t, v, one);
+
+    __m128 q = __lsx_vfdiv_s(u, v);
+    __m128 ret = zero;
+
+    __m128i z_i = __lsx_vand_v((__m128i)absx, __lsx_vreplgr2vr_w(0xfffff000));
+    __m128 z = (__m128)z_i;
+
+    __m128 r = exp_ps(__lsx_vfmadd_s(__lsx_vfneg_s(z), z, (__m128)__lsx_vreplgr2vr_w(0xbf100000)));
+    __m128 tmp = __lsx_vfmadd_s(__lsx_vfsub_s(z, absx), __lsx_vfadd_s(z, absx), q);
+    r = __lsx_vfmul_s(r, exp_ps(tmp));
+    r = __lsx_vfdiv_s(r, absx);
+    t = __lsx_vfsub_s(two, r);
+    __m128i mask_x_ge_0 = __lsx_vfcmp_clt_s(zero, x);
+    r = (__m128)__lsx_vbitsel_v((__m128i)t, (__m128i)r, mask_x_ge_0);
+    __m128i mask_absx_ge_28 = __lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(0x41e00000), absx);
+    ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_absx_ge_28);
+
+    r = __lsx_vfsub_s(q, (__m128)__lsx_vreplgr2vr_w(0x3f58560b));
+    t = __lsx_vfadd_s(q, (__m128)__lsx_vreplgr2vr_w(0x3f98560b));
+    r = (__m128)__lsx_vbitsel_v((__m128i)t, (__m128i)r, mask_x_ge_0);
+    ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_ge_1_25);
+
+    r = __lsx_vfsub_s(__lsx_vfmadd_s(x, q, __lsx_vfsub_s(x, (__m128)__lsx_vreplgr2vr_w(c_0p5.i))), (__m128)__lsx_vreplgr2vr_w(c_0p5.i));
+    ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_ge_0_84375);
+
+    __m128i mask_x_lt_m6 = __lsx_vfcmp_clt_s(x, (__m128)__lsx_vreplgr2vr_w(0xc0c00000));
+    ret = (__m128)__lsx_vbitsel_v((__m128i)ret, two, mask_x_lt_m6);
+
+    __m128i mask_nan = __lsx_vfcmp_cne_s(x, x);
+    ret = (__m128)__lsx_vbitsel_v((__m128i)ret, (__m128i)x, mask_nan);
+
+    return ret;
+}
+
 #endif // LSX_MATHFUN_H
diff --git a/src/layer/loongarch/selu_loongarch.cpp b/src/layer/loongarch/selu_loongarch.cpp
index 7354fcdd6e2b..b240cddfbf4c 100644
--- a/src/layer/loongarch/selu_loongarch.cpp
+++ b/src/layer/loongarch/selu_loongarch.cpp
@@ -36,22 +36,22 @@ int SELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) con
 #if __loongarch_sx
         __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
         __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
-        __m128 _alphaxlambda = (__m128)__lsx_vreplfr2vr_s(alphaxlambda);
+        __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(alpha);
         __m128 _lambda = (__m128)__lsx_vreplfr2vr_s(lambda);
         for (; i + 3 < size; i += 4)
         {
             __builtin_prefetch(ptr + 16);
             __m128 _p = (__m128)__lsx_vld(ptr, 0);
-            __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero);
 
-            __m128 _nps = exp_ps(_p);
-            _nps = __lsx_vfsub_s(_nps, _one);
-            _nps = __lsx_vfmul_s(_nps, _alphaxlambda);
+            __m128 _pos = __lsx_vfmax_s(_p, _zero);
+            __m128 _neg = __lsx_vfmin_s(_p, _zero);
 
-            _p = __lsx_vfmul_s(_p, _lambda);
+            __m128 _blob = exp_ps(_neg);
+            _blob = __lsx_vfsub_s(_blob, _one);
+            _blob = __lsx_vfmul_s(_alpha, _blob);
+            _blob = __lsx_vfmul_s(_lambda, __lsx_vfadd_s(_pos, _blob));
 
-            _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_nps, (__m128i)_lemask);
-            __lsx_vst(_p, ptr, 0);
+            __lsx_vst(_blob, ptr, 0);
 
             ptr += 4;
         }

From e3a2e8d90b755e1d1ba48cc2ae0d08336ec24569 Mon Sep 17 00:00:00 2001
From: futz12 <1391525377@qq.com>
Date: Fri, 20 Mar 2026 11:07:06 +0800
Subject: [PATCH 4/6] fix erfc

---
 src/layer/loongarch/lsx_mathfun.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h
index 1ffdc703f7a8..b396b39dca48 100644
--- a/src/layer/loongarch/lsx_mathfun.h
+++ b/src/layer/loongarch/lsx_mathfun.h
@@ -888,7 +888,8 @@ static inline __m128 erfc_ps(__m128 x)
     __m128i z_i = __lsx_vand_v((__m128i)absx, __lsx_vreplgr2vr_w(0xfffff000));
     __m128 z = (__m128)z_i;
 
-    __m128 r = exp_ps(__lsx_vfmadd_s(__lsx_vfneg_s(z), z, (__m128)__lsx_vreplgr2vr_w(0xbf100000)));
+    __m128 neg_z = (__m128)__lsx_vxor_v((__m128i)z, __lsx_vreplgr2vr_w(0x80000000));
+    __m128 r = exp_ps(__lsx_vfmadd_s(neg_z, z, (__m128)__lsx_vreplgr2vr_w(0xbf100000)));
     __m128 tmp = __lsx_vfmadd_s(__lsx_vfsub_s(z, absx), __lsx_vfadd_s(z, absx), q);
     r = __lsx_vfmul_s(r, exp_ps(tmp));
     r = __lsx_vfdiv_s(r, absx);
@@ -907,7 +908,7 @@ static inline __m128 erfc_ps(__m128 x)
     ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_ge_0_84375);
 
     __m128i mask_x_lt_m6 = __lsx_vfcmp_clt_s(x, (__m128)__lsx_vreplgr2vr_w(0xc0c00000));
-    ret = (__m128)__lsx_vbitsel_v((__m128i)ret, two, mask_x_lt_m6);
+    ret = (__m128)__lsx_vbitsel_v((__m128i)ret, (__m128i)two, mask_x_lt_m6);
 
     __m128i mask_nan = __lsx_vfcmp_cne_s(x, x);
     ret = (__m128)__lsx_vbitsel_v((__m128i)ret, (__m128i)x, mask_nan);

From f6d1889d7376b5087096997ee0c163d7f4c214ca Mon Sep 17 00:00:00 2001
From: futz12 <1391525377@qq.com>
Date: Sat, 21 Mar 2026 10:41:59 +0800
Subject: [PATCH 5/6] fix erfc

---
 src/layer/loongarch/lsx_mathfun.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h
index b396b39dca48..0c3075097c18 100644
--- a/src/layer/loongarch/lsx_mathfun.h
+++ b/src/layer/loongarch/lsx_mathfun.h
@@ -899,8 +899,8 @@ static inline __m128 erfc_ps(__m128 x)
     __m128i mask_absx_ge_28 = __lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(0x41e00000), absx);
     ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_absx_ge_28);
 
-    r = __lsx_vfsub_s(q, (__m128)__lsx_vreplgr2vr_w(0x3f58560b));
-    t = __lsx_vfadd_s(q, (__m128)__lsx_vreplgr2vr_w(0x3f98560b));
+    r = __lsx_vfsub_s(__lsx_vfsub_s(one, (__m128)__lsx_vreplgr2vr_w(c_erfc_erx.i)), q);
+    t = __lsx_vfadd_s(q, __lsx_vfadd_s(one, (__m128)__lsx_vreplgr2vr_w(c_erfc_erx.i)));
     r = (__m128)__lsx_vbitsel_v((__m128i)t, (__m128i)r, mask_x_ge_0);
     ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_ge_1_25);
 

From 7d9918632a023aba6113af769921a12abf1e458b Mon Sep 17 00:00:00 2001
From: futz12 <1391525377@qq.com>
Date: Tue, 14 Apr 2026 00:43:26 +0800
Subject: [PATCH 6/6] flip the eqn sign

---
 src/layer/loongarch/lsx_mathfun.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h
index 0c3075097c18..e8e491d35708 100644
--- a/src/layer/loongarch/lsx_mathfun.h
+++ b/src/layer/loongarch/lsx_mathfun.h
@@ -894,7 +894,7 @@ static inline __m128 erfc_ps(__m128 x)
     r = __lsx_vfmul_s(r, exp_ps(tmp));
     r = __lsx_vfdiv_s(r, absx);
     t = __lsx_vfsub_s(two, r);
-    __m128i mask_x_ge_0 = __lsx_vfcmp_clt_s(zero, x);
+    __m128i mask_x_ge_0 = __lsx_vfcmp_cle_s(zero, x);
     r = (__m128)__lsx_vbitsel_v((__m128i)t, (__m128i)r, mask_x_ge_0);
     __m128i mask_absx_ge_28 = __lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(0x41e00000), absx);
     ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_absx_ge_28);
@@ -904,7 +904,7 @@ static inline __m128 erfc_ps(__m128 x)
     r = (__m128)__lsx_vbitsel_v((__m128i)t, (__m128i)r, mask_x_ge_0);
     ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_ge_1_25);
 
-    r = __lsx_vfsub_s(__lsx_vfmadd_s(x, q, __lsx_vfsub_s(x, (__m128)__lsx_vreplgr2vr_w(c_0p5.i))), (__m128)__lsx_vreplgr2vr_w(c_0p5.i));
+    r = __lsx_vfsub_s((__m128)__lsx_vreplgr2vr_w(c_0p5.i), __lsx_vfmadd_s(x, q, __lsx_vfsub_s(x, (__m128)__lsx_vreplgr2vr_w(c_0p5.i))));
     ret = (__m128)__lsx_vbitsel_v((__m128i)r, (__m128i)ret, mask_ge_0_84375);
 
     __m128i mask_x_lt_m6 = __lsx_vfcmp_clt_s(x, (__m128)__lsx_vreplgr2vr_w(0xc0c00000));