Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions src/layer/loongarch/elu_loongarch.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "elu_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#include "lsx_mathfun.h"
#if __loongarch_asx
#include <lasxintrin.h>
#include "lasx_mathfun.h"
#endif // __loongarch_asx
#endif // __loongarch_sx

namespace ncnn {

ELU_loongarch::ELU_loongarch()
{
#if __loongarch_sx
support_packing = true;
#endif // __loongarch_sx
}

int ELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int d = bottom_top_blob.d;
int channels = bottom_top_blob.c;
int elempack = bottom_top_blob.elempack;
int size = w * h * d * elempack;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

int i = 0;
#if __loongarch_sx
#if __loongarch_asx
__m256 _alpha_lasx = (__m256)__lasx_xvreplfr2vr_s(alpha);
for (; i + 7 < size; i += 8)
{
__builtin_prefetch(ptr + 32);
__m256 _p = (__m256)__lasx_xvld(ptr, 0);
_p = elu_ps(_p, _alpha_lasx);
__lasx_xvst(_p, ptr, 0);

ptr += 8;
}
#endif // __loongarch_asx
__m128 _alpha_lsx = (__m128)__lsx_vreplfr2vr_s(alpha);
for (; i + 3 < size; i += 4)
{
__builtin_prefetch(ptr + 16);
__m128 _p = (__m128)__lsx_vld(ptr, 0);
_p = elu_ps(_p, _alpha_lsx);
__lsx_vst(_p, ptr, 0);

ptr += 4;
}
#endif // __loongarch_sx
for (; i < size; i++)
{
if (*ptr < 0.f)
*ptr = alpha * (expf(*ptr) - 1.f);

ptr++;
}
}

return 0;
}

} // namespace ncnn
21 changes: 21 additions & 0 deletions src/layer/loongarch/elu_loongarch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_ELU_LOONGARCH_H
#define LAYER_ELU_LOONGARCH_H

#include "elu.h"

namespace ncnn {

class ELU_loongarch : public ELU
{
public:
ELU_loongarch();

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_ELU_LOONGARCH_H
56 changes: 56 additions & 0 deletions src/layer/loongarch/erf_loongarch.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Copyright 2026 Futz12 <pchar.cn>
// SPDX-License-Identifier: BSD-3-Clause

#include "erf_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#include "lsx_mathfun.h"
#endif // __loongarch_sx

namespace ncnn {

Erf_loongarch::Erf_loongarch()
{
#if __loongarch_sx
support_packing = true;
#endif
}

int Erf_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int d = bottom_top_blob.d;
int channels = bottom_top_blob.c;
int elempack = bottom_top_blob.elempack;
int size = w * h * d * elempack;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

int i = 0;
#if __loongarch_sx
for (; i + 3 < size; i += 4)
{
__builtin_prefetch(ptr + 16);
__m128 _p = (__m128)__lsx_vld(ptr, 0);
_p = erf_ps(_p);
__lsx_vst(_p, ptr, 0);

ptr += 4;
}
#endif // __loongarch_sx
for (; i < size; i++)
{
*ptr = erff(*ptr);
ptr++;
}
}

return 0;
}

} // namespace ncnn
21 changes: 21 additions & 0 deletions src/layer/loongarch/erf_loongarch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Copyright 2026 Futz12 <pchar.cn>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_ERF_LOONGARCH_H
#define LAYER_ERF_LOONGARCH_H

#include "erf.h"

namespace ncnn {

class Erf_loongarch : public Erf
{
public:
Erf_loongarch();

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_ERF_LOONGARCH_H
96 changes: 96 additions & 0 deletions src/layer/loongarch/gelu_loongarch.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "gelu_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#include "lsx_mathfun.h"
#endif // __loongarch_sx

namespace ncnn {

GELU_loongarch::GELU_loongarch()
{
#if __loongarch_sx
support_packing = true;
#endif // __loongarch_sx
}

int GELU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int d = bottom_top_blob.d;
int channels = bottom_top_blob.c;
int elempack = bottom_top_blob.elempack;
int size = w * h * d * elempack;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

int i = 0;
#if __loongarch_sx
if (fast_gelu)
{
__m128 _half = (__m128)__lsx_vreplfr2vr_s(0.5f);
__m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
__m128 _fast1c = (__m128)__lsx_vreplfr2vr_s(0.79788452f);
__m128 _fast2c = (__m128)__lsx_vreplfr2vr_s(0.044715f);
for (; i + 3 < size; i += 4)
{
__builtin_prefetch(ptr + 16);
__m128 _p = (__m128)__lsx_vld(ptr, 0);

__m128 _cube = __lsx_vfmul_s(_p, _p);
_cube = __lsx_vfmul_s(_p, _cube);
__m128 _blob = __lsx_vfmul_s(_fast2c, _cube);
_blob = __lsx_vfadd_s(_p, _blob);
_blob = __lsx_vfmul_s(_fast1c, _blob);
_blob = tanh_ps(_blob);
_blob = __lsx_vfadd_s(_one, _blob);
_blob = __lsx_vfmul_s(_half, __lsx_vfmul_s(_blob, _p));
__lsx_vst(_blob, ptr, 0);

ptr += 4;
}
}
else
{
__m128 _half = (__m128)__lsx_vreplfr2vr_s(0.5f);
__m128 _inv_sqrt2 = (__m128)__lsx_vreplfr2vr_s(-0.70710678f);
for (; i + 3 < size; i += 4)
{
__builtin_prefetch(ptr + 16);
__m128 _p = (__m128)__lsx_vld(ptr, 0);

__m128 _blob = __lsx_vfmul_s(_inv_sqrt2, _p);
_blob = erfc_ps(_blob);
_blob = __lsx_vfmul_s(_half, __lsx_vfmul_s(_blob, _p));
__lsx_vst(_blob, ptr, 0);
Comment thread
futz12 marked this conversation as resolved.

ptr += 4;
}
}
#endif // __loongarch_sx
for (; i < size; i++)
{
if (fast_gelu)
{
*ptr = 0.5f * *ptr * (1.0f + tanhf(0.79788452f * (*ptr + 0.044715f * *ptr * *ptr * *ptr)));
}
else
{
*ptr = 0.5f * *ptr * erfcf(-0.70710678f * *ptr);
}
Comment thread
futz12 marked this conversation as resolved.

ptr++;
}
}

return 0;
}

} // namespace ncnn
21 changes: 21 additions & 0 deletions src/layer/loongarch/gelu_loongarch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_GELU_LOONGARCH_H
#define LAYER_GELU_LOONGARCH_H

#include "gelu.h"

namespace ncnn {

class GELU_loongarch : public GELU
{
public:
GELU_loongarch();

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_GELU_LOONGARCH_H
10 changes: 10 additions & 0 deletions src/layer/loongarch/lasx_mathfun.h
Original file line number Diff line number Diff line change
Expand Up @@ -601,4 +601,14 @@ static inline __m256 sigmoid256_ps(__m256 _v)
return __lasx_xvfdiv_s(_one, _v);
}

static inline __m256 elu_ps(__m256 _v, __m256 _alpha)
{
__m256 _zero = (__m256)__lasx_xvreplgr2vr_w(0);
__m256 _one = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_1.i);
__m256 _pos = __lasx_xvfmax_s(_v, _zero);
__m256 _neg = __lasx_xvfmin_s(_v, _zero);
_neg = __lasx_xvfsub_s(exp256_ps(_neg), _one);
return __lasx_xvfadd_s(_pos, __lasx_xvfmul_s(_alpha, _neg));
}

#endif // LASX_MATHFUN_H
Loading