Tencent
diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake
Lines changed: 1 addition & 1 deletion
diff --git a/src/layer/gemm.cpp b/src/layer/gemm.cpp
Lines changed: 6 additions & 0 deletions
diff --git a/src/layer/loongarch/absval_loongarch.cpp b/src/layer/loongarch/absval_loongarch.cpp
Lines changed: 77 additions & 0 deletions
diff --git a/src/layer/loongarch/absval_loongarch.h b/src/layer/loongarch/absval_loongarch.h
Lines changed: 3 additions & 0 deletions
@@ -401,7 +401,7 @@ macro(ncnn_add_layer class)
         if(NCNN_RUNTIME_CPU AND NCNN_MSA)
             ncnn_add_arch_opt_layer(${class} msa "-mmsa")
         endif()
-        if(NCNN_MMI)
+        if(NCNN_RUNTIME_CPU AND NCNN_MMI)
             ncnn_add_arch_opt_source(${class} mmi "-mloongson-mmi")
         endif()
     endif()
 
@@ -227,6 +227,12 @@ static void gemm_transB_int8(const Mat& A_int8, const Mat& BT_int8, const Mat& A
             for (int k = 0; k < K; k++)
             {
                 sum += ptrA[k] * ptrBT[k];
+#if __mips_loongson_mmi && !__mips_msa
+                // GCC may mis-vectorize this int8 dot loop with -mloongson-mmi.
+                // Keep this loop scalar without disabling tree-vectorize globally.
+                asm volatile("" ::
+                             : "memory");
+#endif
             }
 
             float sum_fp32 = sum * descale;
 
@@ -5,19 +5,33 @@
 
 #if __loongarch_sx
 #include <lsxintrin.h>
+#if __loongarch_asx
+#include <lasxintrin.h>
+#endif // __loongarch_asx
 #endif // __loongarch_sx
 
+#include "loongarch_usability.h"
+
 namespace ncnn {
 
 AbsVal_loongarch::AbsVal_loongarch()
 {
 #if __loongarch_sx
     support_packing = true; // both packing flags are set only in LSX builds
+    support_any_packing = true;
+#endif // __loongarch_sx
+#if NCNN_BF16
+    support_bf16_storage = true; // gated on NCNN_BF16 only, independent of LSX
 #endif // NCNN_BF16
 }
 
 int AbsVal_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
+    int elembits = bottom_top_blob.elembits();
+
+    if (elembits == 16)
+        return forward_inplace_bf16s_fp16s(bottom_top_blob, opt);
+
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int d = bottom_top_blob.d;
@@ -32,6 +46,17 @@ int AbsVal_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) c
 
         int i = 0;
 #if __loongarch_sx
+#if __loongarch_asx
+        for (; i + 7 < size; i += 8)
+        {
+            __builtin_prefetch(ptr + 32);
+            __m256i _p = __lasx_xvld(ptr, 0);
+            __m256i _outp = __lasx_xvbitclri_w(_p, 31);
+            __lasx_xvst(_outp, ptr, 0);
+
+            ptr += 8;
+        }
+#endif // __loongarch_asx
         for (; i + 3 < size; i += 4)
         {
             __builtin_prefetch(ptr + 16);
@@ -53,4 +78,56 @@ int AbsVal_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) c
     return 0;
 }
 
+int AbsVal_loongarch::forward_inplace_bf16s_fp16s(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int d = bottom_top_blob.d;
+    int channels = bottom_top_blob.c;
+    int elempack = bottom_top_blob.elempack;
+    int size = w * h * d * elempack; // 16-bit elements handled per channel
+
+    // In-place absolute value for 16-bit element storage; always returns 0.
+    // fp16 and bf16 both keep the sign in bit 15, so clearing that bit is the
+    // whole operation and no fp32 round-trip is required. The vector paths AND
+    // each 32-bit lane with 0x7fff7fff, clearing two sign bits per lane.
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        unsigned short* ptr = bottom_top_blob.channel(q); // raw 16-bit view of channel q
+
+        int i = 0;
+#if __loongarch_sx
+#if __loongarch_asx
+        __m256i _sign_mask256 = (__m256i)__lasx_xvreplgr2vr_w(0x7fff7fff); // 0x7fff in every 16-bit half
+        for (; i + 15 < size; i += 16) // 16 elements per __m256i load/store
+        {
+            __m256i _p = __lasx_xvld(ptr, 0);
+            __m256i _outp = __lasx_xvand_v(_p, _sign_mask256);
+            __lasx_xvst(_outp, ptr, 0);
+
+            ptr += 16;
+        }
+#endif // __loongarch_asx
+        __m128i _sign_mask = (__m128i)__lsx_vreplgr2vr_w(0x7fff7fff); // 0x7fff in every 16-bit half
+        for (; i + 7 < size; i += 8) // 8 elements per __m128i load/store
+        {
+            __m128i _p = __lsx_vld(ptr, 0);
+            __m128i _outp = __lsx_vand_v(_p, _sign_mask);
+            __lsx_vst(_outp, ptr, 0);
+
+            ptr += 8;
+        }
+#endif // __loongarch_sx
+        for (; i < size; i++) // scalar tail; the only loop compiled when __loongarch_sx is off
+        {
+            *ptr = *ptr & 0x7fffu;
+            ptr++;
+        }
+    }
+
+    return 0;
+}
+
 } // namespace ncnn
@@ -14,6 +14,9 @@ class AbsVal_loongarch : public AbsVal
     AbsVal_loongarch();
 
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+
+protected:
+    int forward_inplace_bf16s_fp16s(Mat& bottom_top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
Original file line number	Diff line number	Diff line change
`@@ -227,6 +227,12 @@ static void gemm_transB_int8(const Mat& A_int8, const Mat& BT_int8, const Mat& A`
`227`	`227`	`for (int k = 0; k < K; k++)`
`228`	`228`	`{`
`229`	`229`	`sum += ptrA[k] * ptrBT[k];`
	`230`	`+#if __mips_loongson_mmi && !__mips_msa`
	`231`	`+ // GCC may mis-vectorize this int8 dot loop with -mloongson-mmi.`
	`232`	`+ // Keep this loop scalar without disabling tree-vectorize globally.`
	`233`	`+ asm volatile("" ::`
	`234`	`+ : "memory");`
	`235`	`+#endif`
`230`	`236`	`}`
`231`	`237`
`232`	`238`	`float sum_fp32 = sum * descale;`