Tencent
diff --git a/src/layer/arm/arm_usability.h b/src/layer/arm/arm_usability.h
Lines changed: 4 additions & 0 deletions
diff --git a/src/layer/arm/gemm_arm.cpp b/src/layer/arm/gemm_arm.cpp
Lines changed: 46 additions & 157 deletions
diff --git a/src/layer/arm/gemm_arm_asimdhp.cpp b/src/layer/arm/gemm_arm_asimdhp.cpp
Lines changed: 16 additions & 17 deletions
diff --git a/src/layer/arm/gemm_arm_bf16.cpp b/src/layer/arm/gemm_arm_bf16.cpp
Lines changed: 64 additions & 0 deletions
diff --git a/src/layer/arm/gemm_arm_vfpv4.cpp b/src/layer/arm/gemm_arm_vfpv4.cpp
Lines changed: 14 additions & 15 deletions
@@ -17,7 +17,11 @@ static inline signed char float2int8(float v)
 
 static inline uint16x4_t float2bfloat(float32x4_t _v)
 {
+#if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC // BF16 extension available: use the dedicated fp32 -> bf16 conversion intrinsic
+    return vreinterpret_u16_bf16(vcvt_bf16_f32(_v)); // ACLE reinterpret (bit pattern unchanged) instead of a compiler-specific vector cast; NOTE(review): this path presumably rounds while the fallback truncates - confirm the difference is acceptable
+#else // fallback: shift each 32-bit lane right by 16 and narrow, i.e. keep only the upper 16 bits of every float
     return vshrn_n_u32(vreinterpretq_u32_f32(_v), 16);
+#endif
 }
 static inline float32x4_t bfloat2float(uint16x4_t _v)
 {
 
@@ -13,7 +13,6 @@
 
 namespace ncnn {
 
-#include "gemm_bf16s_fp16s.h"
 #include "gemm_fp16s.h"
 
 #if NCNN_INT8
@@ -2378,11 +2377,11 @@ static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_bl
 
         if (transB)
         {
-            pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
+            pack_B_tile_fp16(B, BT_tile, j, max_jj, k, max_kk);
         }
         else
         {
-            transpose_pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
+            transpose_pack_B_tile_fp16(B, BT_tile, j, max_jj, k, max_kk);
         }
     }
 
@@ -2415,7 +2414,7 @@ static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_bl
 
             if (broadcast_type_C == 3)
             {
-                pack_A_tile_bf16_fp16(C, topT_tile, i, max_ii, j, max_jj);
+                pack_A_tile_fp16(C, topT_tile, i, max_ii, j, max_jj);
             }
 
             const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;
@@ -2434,11 +2433,11 @@ static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_bl
                 {
                     if (transA)
                     {
-                        transpose_pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);
+                        transpose_pack_A_tile_fp16(A, AT_tile, i, max_ii, k, max_kk);
                     }
                     else
                     {
-                        pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);
+                        pack_A_tile_fp16(A, AT_tile, i, max_ii, k, max_kk);
                     }
                 }
 
@@ -2449,7 +2448,7 @@ static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_bl
 
             if (output_transpose)
             {
-                transpose_unpack_output_tile_bf16_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);
+                transpose_unpack_output_tile_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);
             }
         }
     }
@@ -2495,11 +2494,11 @@ static int gemm_AT_arm_fp16sa(const Mat& AT, const Mat& B, const Mat& C, Mat& to
 
         if (transB)
         {
-            pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
+            pack_B_tile_fp16(B, BT_tile, j, max_jj, k, max_kk);
         }
         else
         {
-            transpose_pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
+            transpose_pack_B_tile_fp16(B, BT_tile, j, max_jj, k, max_kk);
         }
     }
 
@@ -2528,7 +2527,7 @@ static int gemm_AT_arm_fp16sa(const Mat& AT, const Mat& B, const Mat& C, Mat& to
 
             if (broadcast_type_C == 3)
             {
-                pack_A_tile_bf16_fp16(C, topT_tile, i, max_ii, j, max_jj);
+                pack_A_tile_fp16(C, topT_tile, i, max_ii, j, max_jj);
             }
 
             const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;
@@ -2550,7 +2549,7 @@ static int gemm_AT_arm_fp16sa(const Mat& AT, const Mat& B, const Mat& C, Mat& to
 
             if (output_transpose)
             {
-                transpose_unpack_output_tile_bf16_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);
+                transpose_unpack_output_tile_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);
             }
         }
     }
@@ -2605,7 +2604,7 @@ static int gemm_BT_arm_fp16sa(const Mat& A, const Mat& BT, const Mat& C, Mat& to
 
             if (broadcast_type_C == 3)
             {
-                pack_A_tile_bf16_fp16(C, topT_tile, i, max_ii, j, max_jj);
+                pack_A_tile_fp16(C, topT_tile, i, max_ii, j, max_jj);
             }
 
             const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;
@@ -2624,11 +2623,11 @@ static int gemm_BT_arm_fp16sa(const Mat& A, const Mat& BT, const Mat& C, Mat& to
                 {
                     if (transA)
                     {
-                        transpose_pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);
+                        transpose_pack_A_tile_fp16(A, AT_tile, i, max_ii, k, max_kk);
                     }
                     else
                     {
-                        pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);
+                        pack_A_tile_fp16(A, AT_tile, i, max_ii, k, max_kk);
                     }
                 }
 
@@ -2639,7 +2638,7 @@ static int gemm_BT_arm_fp16sa(const Mat& A, const Mat& BT, const Mat& C, Mat& to
 
             if (output_transpose)
             {
-                transpose_unpack_output_tile_bf16_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);
+                transpose_unpack_output_tile_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);
             }
         }
     }
@@ -2684,7 +2683,7 @@ static int gemm_AT_BT_arm_fp16sa(const Mat& AT, const Mat& BT, const Mat& C, Mat
 
             if (broadcast_type_C == 3)
             {
-                pack_A_tile_bf16_fp16(C, topT_tile, i, max_ii, j, max_jj);
+                pack_A_tile_fp16(C, topT_tile, i, max_ii, j, max_jj);
             }
 
             const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;
@@ -2706,7 +2705,7 @@ static int gemm_AT_BT_arm_fp16sa(const Mat& AT, const Mat& BT, const Mat& C, Mat
 
             if (output_transpose)
             {
-                transpose_unpack_output_tile_bf16_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);
+                transpose_unpack_output_tile_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);
             }
         }
     }
 
@@ -0,0 +1,64 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "cpu.h"
+#include "mat.h"
+#include "arm_usability.h"
+
+namespace ncnn { // externally visible "_bf16"-suffixed entry points; each one only forwards to the un-suffixed bf16 GEMM helper of the same name
+
+#if NCNN_BF16 // the include and all wrappers below are compiled only when NCNN_BF16 is non-zero
+#include "gemm_bf16s.h" // included inside the namespace; presumably defines the helpers called below - TODO confirm
+
+void pack_A_tile_bf16_bf16(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk) // forwards all arguments, in order, to pack_A_tile_bf16()
+{
+    pack_A_tile_bf16(A, AT, i, max_ii, k, max_kk);
+}
+
+void transpose_pack_A_tile_bf16_bf16(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk) // forwards all arguments, in order, to transpose_pack_A_tile_bf16()
+{
+    transpose_pack_A_tile_bf16(A, AT, i, max_ii, k, max_kk);
+}
+
+void pack_B_tile_bf16_bf16(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk) // forwards all arguments, in order, to pack_B_tile_bf16()
+{
+    pack_B_tile_bf16(B, BT, j, max_jj, k, max_kk);
+}
+
+void transpose_pack_B_tile_bf16_bf16(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk) // forwards all arguments, in order, to transpose_pack_B_tile_bf16()
+{
+    transpose_pack_B_tile_bf16(B, BT, j, max_jj, k, max_kk);
+}
+
+void pack_A_tile_fp32_to_bf16_bf16(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk) // forwards all arguments, in order, to pack_A_tile_fp32_to_bf16()
+{
+    pack_A_tile_fp32_to_bf16(A, AT, i, max_ii, k, max_kk);
+}
+
+void transpose_pack_A_tile_fp32_to_bf16_bf16(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk) // forwards all arguments, in order, to transpose_pack_A_tile_fp32_to_bf16()
+{
+    transpose_pack_A_tile_fp32_to_bf16(A, AT, i, max_ii, k, max_kk);
+}
+
+void pack_B_tile_fp32_to_bf16_bf16(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk) // forwards all arguments, in order, to pack_B_tile_fp32_to_bf16()
+{
+    pack_B_tile_fp32_to_bf16(B, BT, j, max_jj, k, max_kk);
+}
+
+void transpose_pack_B_tile_fp32_to_bf16_bf16(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk) // forwards all arguments, in order, to transpose_pack_B_tile_fp32_to_bf16()
+{
+    transpose_pack_B_tile_fp32_to_bf16(B, BT, j, max_jj, k, max_kk);
+}
+
+void unpack_output_tile_fp32_to_bf16_bf16(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, float alpha, float beta, int output_transpose) // forwards all eleven arguments, in order, to unpack_output_tile_fp32_to_bf16()
+{
+    unpack_output_tile_fp32_to_bf16(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, alpha, beta, output_transpose);
+}
+
+void gemm_transB_packed_tile_bf16s_bf16(const Mat& AT_tile, const Mat& BT_tile, Mat& topT_tile, int max_ii, int max_jj, int k, int max_kk) // forwards all arguments, in order, to gemm_transB_packed_tile_bf16s()
+{
+    gemm_transB_packed_tile_bf16s(AT_tile, BT_tile, topT_tile, max_ii, max_jj, k, max_kk);
+}
+#endif // NCNN_BF16
+
+} // namespace ncnn
@@ -13,7 +13,6 @@
 
 namespace ncnn {
 
-#include "gemm_bf16s_fp16s.h"
 #include "gemm_fp16s.h"
 
 #if NCNN_INT8
@@ -31,7 +30,7 @@ static int gemm_arm_fp16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo
     // NCNN_LOGE("M/N/K = %d %d %d", M, N, K);
 
     int TILE_M, TILE_N, TILE_K;
-    get_optimal_tile_mnk_bf16s_fp16s(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);
+    get_optimal_tile_mnk_fp16s(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);
 
     // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K);
 
@@ -65,11 +64,11 @@ static int gemm_arm_fp16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo
 
         if (transB)
         {
-            pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
+            pack_B_tile_fp16(B, BT_tile, j, max_jj, k, max_kk);
         }
         else
         {
-            transpose_pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
+            transpose_pack_B_tile_fp16(B, BT_tile, j, max_jj, k, max_kk);
         }
     }
 
@@ -121,11 +120,11 @@ static int gemm_arm_fp16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo
                 {
                     if (transA)
                     {
-                        transpose_pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);
+                        transpose_pack_A_tile_fp16(A, AT_tile, i, max_ii, k, max_kk);
                     }
                     else
                     {
-                        pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);
+                        pack_A_tile_fp16(A, AT_tile, i, max_ii, k, max_kk);
                     }
                 }
 
@@ -152,7 +151,7 @@ static int gemm_AT_arm_fp16s(const Mat& AT, const Mat& B, const Mat& C, Mat& top
     // NCNN_LOGE("M/N/K = %d %d %d", M, N, K);
 
     int TILE_M, TILE_N, TILE_K;
-    get_optimal_tile_mnk_bf16s_fp16s(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);
+    get_optimal_tile_mnk_fp16s(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);
 
     // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K);
 
@@ -183,11 +182,11 @@ static int gemm_AT_arm_fp16s(const Mat& AT, const Mat& B, const Mat& C, Mat& top
 
         if (transB)
         {
-            pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
+            pack_B_tile_fp16(B, BT_tile, j, max_jj, k, max_kk);
         }
         else
         {
-            transpose_pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
+            transpose_pack_B_tile_fp16(B, BT_tile, j, max_jj, k, max_kk);
         }
     }
 
@@ -254,7 +253,7 @@ static int gemm_BT_arm_fp16s(const Mat& A, const Mat& BT, const Mat& C, Mat& top
     // NCNN_LOGE("M/N/K = %d %d %d", M, N, K);
 
     int TILE_M, TILE_N, TILE_K;
-    get_optimal_tile_mnk_bf16s_fp16s(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);
+    get_optimal_tile_mnk_fp16s(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);
 
     // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K);
 
@@ -313,11 +312,11 @@ static int gemm_BT_arm_fp16s(const Mat& A, const Mat& BT, const Mat& C, Mat& top
                 {
                     if (transA)
                     {
-                        transpose_pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);
+                        transpose_pack_A_tile_fp16(A, AT_tile, i, max_ii, k, max_kk);
                     }
                     else
                     {
-                        pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);
+                        pack_A_tile_fp16(A, AT_tile, i, max_ii, k, max_kk);
                     }
                 }
 
@@ -342,7 +341,7 @@ static int gemm_AT_BT_arm_fp16s(const Mat& AT, const Mat& BT, const Mat& C, Mat&
     // NCNN_LOGE("M/N/K = %d %d %d", M, N, K);
 
     int TILE_M, TILE_N, TILE_K;
-    get_optimal_tile_mnk_bf16s_fp16s(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);
+    get_optimal_tile_mnk_fp16s(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);
 
     // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K);
 
@@ -413,7 +412,7 @@ int Gemm_arm::create_pipeline_fp16s(const Option& opt)
         const int K = constantK;
 
         int TILE_M, TILE_N, TILE_K;
-        get_optimal_tile_mnk_bf16s_fp16s(M, 0, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, opt.num_threads);
+        get_optimal_tile_mnk_fp16s(M, 0, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, opt.num_threads);
 
         const int nn_M = (M + TILE_M - 1) / TILE_M;
 
@@ -454,7 +453,7 @@ int Gemm_arm::create_pipeline_fp16s(const Option& opt)
         const int K = constantK;
 
         int TILE_M, TILE_N, TILE_K;
-        get_optimal_tile_mnk_bf16s_fp16s(0, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, opt.num_threads);
+        get_optimal_tile_mnk_fp16s(0, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, opt.num_threads);
 
         const int nn_N = (N + TILE_N - 1) / TILE_N;
Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,11 @@ static inline signed char float2int8(float v)`
`17`	`17`
`18`	`18`	`static inline uint16x4_t float2bfloat(float32x4_t _v)`
`19`	`19`	`{`
	`20`	`+#if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC`
	`21`	`+ return (uint16x4_t)vcvt_bf16_f32(_v);`
	`22`	`+#else`
`20`	`23`	`return vshrn_n_u32(vreinterpretq_u32_f32(_v), 16);`
	`24`	`+#endif`
`21`	`25`	`}`
`22`	`26`	`static inline float32x4_t bfloat2float(uint16x4_t _v)`
`23`	`27`	`{`
Original file line number	Diff line number	Diff line change
`@@ -13,7 +13,6 @@`
`13`	`13`
`14`	`14`	`namespace ncnn {`
`15`	`15`
`16`		`-#include "gemm_bf16s_fp16s.h"`
`17`	`16`	`#include "gemm_fp16s.h"`
`18`	`17`
`19`	`18`	`#if NCNN_INT8`
`@@ -2378,11 +2377,11 @@ static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_bl`
`2378`	`2377`
`2379`	`2378`	`if (transB)`
`2380`	`2379`	`{`
`2381`		`- pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);`
	`2380`	`+ pack_B_tile_fp16(B, BT_tile, j, max_jj, k, max_kk);`
`2382`	`2381`	`}`
`2383`	`2382`	`else`
`2384`	`2383`	`{`
`2385`		`- transpose_pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);`
	`2384`	`+ transpose_pack_B_tile_fp16(B, BT_tile, j, max_jj, k, max_kk);`
`2386`	`2385`	`}`
`2387`	`2386`	`}`
`2388`	`2387`
`@@ -2415,7 +2414,7 @@ static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_bl`
`2415`	`2414`
`2416`	`2415`	`if (broadcast_type_C == 3)`
`2417`	`2416`	`{`
`2418`		`- pack_A_tile_bf16_fp16(C, topT_tile, i, max_ii, j, max_jj);`
	`2417`	`+ pack_A_tile_fp16(C, topT_tile, i, max_ii, j, max_jj);`
`2419`	`2418`	`}`
`2420`	`2419`
`2421`	`2420`	`const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;`
`@@ -2434,11 +2433,11 @@ static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_bl`
`2434`	`2433`	`{`
`2435`	`2434`	`if (transA)`
`2436`	`2435`	`{`
`2437`		`- transpose_pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);`
	`2436`	`+ transpose_pack_A_tile_fp16(A, AT_tile, i, max_ii, k, max_kk);`
`2438`	`2437`	`}`
`2439`	`2438`	`else`
`2440`	`2439`	`{`
`2441`		`- pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);`
	`2440`	`+ pack_A_tile_fp16(A, AT_tile, i, max_ii, k, max_kk);`
`2442`	`2441`	`}`
`2443`	`2442`	`}`
`2444`	`2443`
`@@ -2449,7 +2448,7 @@ static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_bl`
`2449`	`2448`
`2450`	`2449`	`if (output_transpose)`
`2451`	`2450`	`{`
`2452`		`- transpose_unpack_output_tile_bf16_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);`
	`2451`	`+ transpose_unpack_output_tile_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);`
`2453`	`2452`	`}`
`2454`	`2453`	`}`
`2455`	`2454`	`}`
`@@ -2495,11 +2494,11 @@ static int gemm_AT_arm_fp16sa(const Mat& AT, const Mat& B, const Mat& C, Mat& to`
`2495`	`2494`
`2496`	`2495`	`if (transB)`
`2497`	`2496`	`{`
`2498`		`- pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);`
	`2497`	`+ pack_B_tile_fp16(B, BT_tile, j, max_jj, k, max_kk);`
`2499`	`2498`	`}`
`2500`	`2499`	`else`
`2501`	`2500`	`{`
`2502`		`- transpose_pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);`
	`2501`	`+ transpose_pack_B_tile_fp16(B, BT_tile, j, max_jj, k, max_kk);`
`2503`	`2502`	`}`
`2504`	`2503`	`}`
`2505`	`2504`
`@@ -2528,7 +2527,7 @@ static int gemm_AT_arm_fp16sa(const Mat& AT, const Mat& B, const Mat& C, Mat& to`
`2528`	`2527`
`2529`	`2528`	`if (broadcast_type_C == 3)`
`2530`	`2529`	`{`
`2531`		`- pack_A_tile_bf16_fp16(C, topT_tile, i, max_ii, j, max_jj);`
	`2530`	`+ pack_A_tile_fp16(C, topT_tile, i, max_ii, j, max_jj);`
`2532`	`2531`	`}`
`2533`	`2532`
`2534`	`2533`	`const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;`
`@@ -2550,7 +2549,7 @@ static int gemm_AT_arm_fp16sa(const Mat& AT, const Mat& B, const Mat& C, Mat& to`
`2550`	`2549`
`2551`	`2550`	`if (output_transpose)`
`2552`	`2551`	`{`
`2553`		`- transpose_unpack_output_tile_bf16_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);`
	`2552`	`+ transpose_unpack_output_tile_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);`
`2554`	`2553`	`}`
`2555`	`2554`	`}`
`2556`	`2555`	`}`
`@@ -2605,7 +2604,7 @@ static int gemm_BT_arm_fp16sa(const Mat& A, const Mat& BT, const Mat& C, Mat& to`
`2605`	`2604`
`2606`	`2605`	`if (broadcast_type_C == 3)`
`2607`	`2606`	`{`
`2608`		`- pack_A_tile_bf16_fp16(C, topT_tile, i, max_ii, j, max_jj);`
	`2607`	`+ pack_A_tile_fp16(C, topT_tile, i, max_ii, j, max_jj);`
`2609`	`2608`	`}`
`2610`	`2609`
`2611`	`2610`	`const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;`
`@@ -2624,11 +2623,11 @@ static int gemm_BT_arm_fp16sa(const Mat& A, const Mat& BT, const Mat& C, Mat& to`
`2624`	`2623`	`{`
`2625`	`2624`	`if (transA)`
`2626`	`2625`	`{`
`2627`		`- transpose_pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);`
	`2626`	`+ transpose_pack_A_tile_fp16(A, AT_tile, i, max_ii, k, max_kk);`
`2628`	`2627`	`}`
`2629`	`2628`	`else`
`2630`	`2629`	`{`
`2631`		`- pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);`
	`2630`	`+ pack_A_tile_fp16(A, AT_tile, i, max_ii, k, max_kk);`
`2632`	`2631`	`}`
`2633`	`2632`	`}`
`2634`	`2633`
`@@ -2639,7 +2638,7 @@ static int gemm_BT_arm_fp16sa(const Mat& A, const Mat& BT, const Mat& C, Mat& to`
`2639`	`2638`
`2640`	`2639`	`if (output_transpose)`
`2641`	`2640`	`{`
`2642`		`- transpose_unpack_output_tile_bf16_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);`
	`2641`	`+ transpose_unpack_output_tile_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);`
`2643`	`2642`	`}`
`2644`	`2643`	`}`
`2645`	`2644`	`}`
`@@ -2684,7 +2683,7 @@ static int gemm_AT_BT_arm_fp16sa(const Mat& AT, const Mat& BT, const Mat& C, Mat`
`2684`	`2683`
`2685`	`2684`	`if (broadcast_type_C == 3)`
`2686`	`2685`	`{`
`2687`		`- pack_A_tile_bf16_fp16(C, topT_tile, i, max_ii, j, max_jj);`
	`2686`	`+ pack_A_tile_fp16(C, topT_tile, i, max_ii, j, max_jj);`
`2688`	`2687`	`}`
`2689`	`2688`
`2690`	`2689`	`const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;`
`@@ -2706,7 +2705,7 @@ static int gemm_AT_BT_arm_fp16sa(const Mat& AT, const Mat& BT, const Mat& C, Mat`
`2706`	`2705`
`2707`	`2706`	`if (output_transpose)`
`2708`	`2707`	`{`
`2709`		`- transpose_unpack_output_tile_bf16_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);`
	`2708`	`+ transpose_unpack_output_tile_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);`
`2710`	`2709`	`}`
`2711`	`2710`	`}`
`2712`	`2711`	`}`