Skip to content

Commit 0f5c6ef

Browse files
nihuiCopilotclaude
authored
massive mips and loongarch optimization (#6662)
Co-authored-by: nihui <171016+nihui@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 3724d10 commit 0f5c6ef

396 files changed

Lines changed: 165628 additions & 24237 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

cmake/ncnn_add_layer.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,7 @@ macro(ncnn_add_layer class)
401401
if(NCNN_RUNTIME_CPU AND NCNN_MSA)
402402
ncnn_add_arch_opt_layer(${class} msa "-mmsa")
403403
endif()
404-
if(NCNN_MMI)
404+
if(NCNN_RUNTIME_CPU AND NCNN_MMI)
405405
ncnn_add_arch_opt_source(${class} mmi "-mloongson-mmi")
406406
endif()
407407
endif()

src/layer/gemm.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,12 @@ static void gemm_transB_int8(const Mat& A_int8, const Mat& BT_int8, const Mat& A
227227
for (int k = 0; k < K; k++)
228228
{
229229
sum += ptrA[k] * ptrBT[k];
230+
#if __mips_loongson_mmi && !__mips_msa
231+
// GCC may mis-vectorize this int8 dot loop with -mloongson-mmi.
232+
// Keep this loop scalar without disabling tree-vectorize globally.
233+
asm volatile("" ::
234+
: "memory");
235+
#endif
230236
}
231237

232238
float sum_fp32 = sum * descale;

src/layer/loongarch/absval_loongarch.cpp

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,33 @@
55

66
#if __loongarch_sx
77
#include <lsxintrin.h>
8+
#if __loongarch_asx
9+
#include <lasxintrin.h>
10+
#endif // __loongarch_asx
811
#endif // __loongarch_sx
912

13+
#include "loongarch_usability.h"
14+
1015
namespace ncnn {
1116

1217
AbsVal_loongarch::AbsVal_loongarch()
1318
{
1419
#if __loongarch_sx
1520
support_packing = true;
21+
support_any_packing = true;
22+
#endif
23+
#if NCNN_BF16
24+
support_bf16_storage = true;
1625
#endif
1726
}
1827

1928
int AbsVal_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
2029
{
30+
int elembits = bottom_top_blob.elembits();
31+
32+
if (elembits == 16)
33+
return forward_inplace_bf16s_fp16s(bottom_top_blob, opt);
34+
2135
int w = bottom_top_blob.w;
2236
int h = bottom_top_blob.h;
2337
int d = bottom_top_blob.d;
@@ -32,6 +46,17 @@ int AbsVal_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) c
3246

3347
int i = 0;
3448
#if __loongarch_sx
49+
#if __loongarch_asx
50+
for (; i + 7 < size; i += 8)
51+
{
52+
__builtin_prefetch(ptr + 32);
53+
__m256i _p = __lasx_xvld(ptr, 0);
54+
__m256i _outp = __lasx_xvbitclri_w(_p, 31);
55+
__lasx_xvst(_outp, ptr, 0);
56+
57+
ptr += 8;
58+
}
59+
#endif // __loongarch_asx
3560
for (; i + 3 < size; i += 4)
3661
{
3762
__builtin_prefetch(ptr + 16);
@@ -53,4 +78,56 @@ int AbsVal_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) c
5378
return 0;
5479
}
5580

81+
int AbsVal_loongarch::forward_inplace_bf16s_fp16s(Mat& bottom_top_blob, const Option& opt) const
82+
{
83+
int w = bottom_top_blob.w;
84+
int h = bottom_top_blob.h;
85+
int d = bottom_top_blob.d;
86+
int channels = bottom_top_blob.c;
87+
int elempack = bottom_top_blob.elempack;
88+
int size = w * h * d * elempack;
89+
90+
// fp16/bf16 abs: sign bit is bit 15 for both formats.
91+
// Reinterpret pairs of 16-bit values as 32-bit and apply AND with
92+
// 0x7fff7fff to clear both sign bits in one 32-bit operation.
93+
// No fp32 round-trip required.
94+
95+
#pragma omp parallel for num_threads(opt.num_threads)
96+
for (int q = 0; q < channels; q++)
97+
{
98+
unsigned short* ptr = bottom_top_blob.channel(q);
99+
100+
int i = 0;
101+
#if __loongarch_sx
102+
#if __loongarch_asx
103+
__m256i _sign_mask256 = (__m256i)__lasx_xvreplgr2vr_w(0x7fff7fff);
104+
for (; i + 15 < size; i += 16)
105+
{
106+
__m256i _p = __lasx_xvld(ptr, 0);
107+
__m256i _outp = __lasx_xvand_v(_p, _sign_mask256);
108+
__lasx_xvst(_outp, ptr, 0);
109+
110+
ptr += 16;
111+
}
112+
#endif // __loongarch_asx
113+
__m128i _sign_mask = (__m128i)__lsx_vreplgr2vr_w(0x7fff7fff);
114+
for (; i + 7 < size; i += 8)
115+
{
116+
__m128i _p = __lsx_vld(ptr, 0);
117+
__m128i _outp = __lsx_vand_v(_p, _sign_mask);
118+
__lsx_vst(_outp, ptr, 0);
119+
120+
ptr += 8;
121+
}
122+
#endif // __loongarch_sx
123+
for (; i < size; i++)
124+
{
125+
*ptr = *ptr & 0x7fffu;
126+
ptr++;
127+
}
128+
}
129+
130+
return 0;
131+
}
132+
56133
} // namespace ncnn

src/layer/loongarch/absval_loongarch.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@ class AbsVal_loongarch : public AbsVal
1414
AbsVal_loongarch();
1515

1616
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
17+
18+
protected:
19+
int forward_inplace_bf16s_fp16s(Mat& bottom_top_blob, const Option& opt) const;
1720
};
1821

1922
} // namespace ncnn

0 commit comments

Comments
 (0)