Tencent · futz12 · Mar 17, 2026 · Mar 18, 2026 · Mar 19, 2026 · Mar 20, 2026
diff --git a/src/layer/loongarch/elu_loongarch.cpp b/src/layer/loongarch/elu_loongarch.cpp
@@ -0,0 +1,75 @@
+// Copyright 2024 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "elu_loongarch.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#if __loongarch_asx
+#include <lasxintrin.h>
+#include "lasx_mathfun.h"
+#endif // __loongarch_asx
+#endif // __loongarch_sx
+
+namespace ncnn {
+
+ELU_loongarch::ELU_loongarch()
+{
+#if __loongarch_sx
+    support_packing = true;
+#endif // __loongarch_sx
+}
+
+int ELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int d = bottom_top_blob.d;
+    int channels = bottom_top_blob.c;
+    int elempack = bottom_top_blob.elempack;
+    int size = w * h * d * elempack;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+
+        int i = 0;
+#if __loongarch_sx
+#if __loongarch_asx
+        __m256 _alpha_lasx = (__m256)__lasx_xvreplfr2vr_s(alpha);
+        for (; i + 7 < size; i += 8)
+        {
+            __builtin_prefetch(ptr + 32);
+            __m256 _p = (__m256)__lasx_xvld(ptr, 0);
+            _p = elu_ps(_p, _alpha_lasx);
+            __lasx_xvst(_p, ptr, 0);
+
+            ptr += 8;
+        }
+#endif // __loongarch_asx
+        __m128 _alpha_lsx = (__m128)__lsx_vreplfr2vr_s(alpha);
+        for (; i + 3 < size; i += 4)
+        {
+            __builtin_prefetch(ptr + 16);
+            __m128 _p = (__m128)__lsx_vld(ptr, 0);
+            _p = elu_ps(_p, _alpha_lsx);
+            __lsx_vst(_p, ptr, 0);
+
+            ptr += 4;
+        }
+#endif // __loongarch_sx
+        for (; i < size; i++)
+        {
+            if (*ptr < 0.f)
+                *ptr = alpha * (expf(*ptr) - 1.f);
+
+            ptr++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/elu_loongarch.h b/src/layer/loongarch/elu_loongarch.h
@@ -0,0 +1,21 @@
+// Copyright 2024 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef LAYER_ELU_LOONGARCH_H
+#define LAYER_ELU_LOONGARCH_H
+
+#include "elu.h"
+
+namespace ncnn {
+
+class ELU_loongarch : public ELU
+{
+public:
+    ELU_loongarch();
+
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_ELU_LOONGARCH_H
diff --git a/src/layer/loongarch/erf_loongarch.cpp b/src/layer/loongarch/erf_loongarch.cpp
@@ -0,0 +1,56 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "erf_loongarch.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#endif // __loongarch_sx
+
+namespace ncnn {
+
+Erf_loongarch::Erf_loongarch()
+{
+#if __loongarch_sx
+    support_packing = true;
+#endif
+}
+
+int Erf_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int d = bottom_top_blob.d;
+    int channels = bottom_top_blob.c;
+    int elempack = bottom_top_blob.elempack;
+    int size = w * h * d * elempack;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+
+        int i = 0;
+#if __loongarch_sx
+        for (; i + 3 < size; i += 4)
+        {
+            __builtin_prefetch(ptr + 16);
+            __m128 _p = (__m128)__lsx_vld(ptr, 0);
+            _p = erf_ps(_p);
+            __lsx_vst(_p, ptr, 0);
+
+            ptr += 4;
+        }
+#endif // __loongarch_sx
+        for (; i < size; i++)
+        {
+            *ptr = erff(*ptr);
+            ptr++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/erf_loongarch.h b/src/layer/loongarch/erf_loongarch.h
@@ -0,0 +1,21 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef LAYER_ERF_LOONGARCH_H
+#define LAYER_ERF_LOONGARCH_H
+
+#include "erf.h"
+
+namespace ncnn {
+
+class Erf_loongarch : public Erf
+{
+public:
+    Erf_loongarch();
+
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_ERF_LOONGARCH_H
diff --git a/src/layer/loongarch/gelu_loongarch.cpp b/src/layer/loongarch/gelu_loongarch.cpp
@@ -0,0 +1,96 @@
+// Copyright 2024 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "gelu_loongarch.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#endif // __loongarch_sx
+
+namespace ncnn {
+
+GELU_loongarch::GELU_loongarch()
+{
+#if __loongarch_sx
+    support_packing = true;
+#endif // __loongarch_sx
+}
+
+int GELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int d = bottom_top_blob.d;
+    int channels = bottom_top_blob.c;
+    int elempack = bottom_top_blob.elempack;
+    int size = w * h * d * elempack;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+
+        int i = 0;
+#if __loongarch_sx
+        if (fast_gelu)
+        {
+            __m128 _half = (__m128)__lsx_vreplfr2vr_s(0.5f);
+            __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
+            __m128 _fast1c = (__m128)__lsx_vreplfr2vr_s(0.79788452f);
+            __m128 _fast2c = (__m128)__lsx_vreplfr2vr_s(0.044715f);
+            for (; i + 3 < size; i += 4)
+            {
+                __builtin_prefetch(ptr + 16);
+                __m128 _p = (__m128)__lsx_vld(ptr, 0);
+
+                __m128 _cube = __lsx_vfmul_s(_p, _p);
+                _cube = __lsx_vfmul_s(_p, _cube);
+                __m128 _blob = __lsx_vfmul_s(_fast2c, _cube);
+                _blob = __lsx_vfadd_s(_p, _blob);
+                _blob = __lsx_vfmul_s(_fast1c, _blob);
+                _blob = tanh_ps(_blob);
+                _blob = __lsx_vfadd_s(_one, _blob);
+                _blob = __lsx_vfmul_s(_half, __lsx_vfmul_s(_blob, _p));
+                __lsx_vst(_blob, ptr, 0);
+
+                ptr += 4;
+            }
+        }
+        else
+        {
+            __m128 _half = (__m128)__lsx_vreplfr2vr_s(0.5f);
+            __m128 _inv_sqrt2 = (__m128)__lsx_vreplfr2vr_s(-0.70710678f);
+            for (; i + 3 < size; i += 4)
+            {
+                __builtin_prefetch(ptr + 16);
+                __m128 _p = (__m128)__lsx_vld(ptr, 0);
+
+                __m128 _blob = __lsx_vfmul_s(_inv_sqrt2, _p);
+                _blob = erfc_ps(_blob);
+                _blob = __lsx_vfmul_s(_half, __lsx_vfmul_s(_blob, _p));
+                __lsx_vst(_blob, ptr, 0);
+
+                ptr += 4;
+            }
+        }
+#endif // __loongarch_sx
+        for (; i < size; i++)
+        {
+            if (fast_gelu)
+            {
+                *ptr = 0.5f * *ptr * (1.0f + tanhf(0.79788452f * (*ptr + 0.044715f * *ptr * *ptr * *ptr)));
+            }
+            else
+            {
+                *ptr = 0.5f * *ptr * erfcf(-0.70710678f * *ptr);
+            }
+
+            ptr++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/gelu_loongarch.h b/src/layer/loongarch/gelu_loongarch.h
@@ -0,0 +1,21 @@
+// Copyright 2024 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef LAYER_GELU_LOONGARCH_H
+#define LAYER_GELU_LOONGARCH_H
+
+#include "gelu.h"
+
+namespace ncnn {
+
+class GELU_loongarch : public GELU
+{
+public:
+    GELU_loongarch();
+
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_GELU_LOONGARCH_H
diff --git a/src/layer/loongarch/lasx_mathfun.h b/src/layer/loongarch/lasx_mathfun.h
@@ -601,4 +601,14 @@ static inline __m256 sigmoid256_ps(__m256 _v)
     return __lasx_xvfdiv_s(_one, _v);
 }
 
+static inline __m256 elu_ps(__m256 _v, __m256 _alpha)
+{
+    __m256 _zero = (__m256)__lasx_xvreplgr2vr_w(0);
+    __m256 _one = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_1.i);
+    __m256 _pos = __lasx_xvfmax_s(_v, _zero);
+    __m256 _neg = __lasx_xvfmin_s(_v, _zero);
+    _neg = __lasx_xvfsub_s(exp256_ps(_neg), _one);
+    return __lasx_xvfadd_s(_pos, __lasx_xvfmul_s(_alpha, _neg));
+}
+
 #endif // LASX_MATHFUN_H