Skip to content
This repository was archived by the owner on Dec 18, 2025. It is now read-only.

Commit 5126318

Browse files
authored
CMSIS-NN: Add MVE scalar versions (#1555)
Adds scalar versions for mat_mul_core_4x_s8/mat_mul_core_1x_s8 under flag ARM_MATH_AUTOVECTORIZE, which is required with -O0. Updates README about this.
1 parent e2b3ec2 commit 5126318

3 files changed

Lines changed: 28 additions & 6 deletions

File tree

CMSIS/NN/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ cmake .. -DCMAKE_TOOLCHAIN_FILE=~/ethos-u-core-platform/cmake/toolchain/armclang
9393

9494
### Compiler options
9595
Default optimization level is Ofast. Please change according to project needs. Just bear in mind it will impact performance.
96+
When compiling at optimization level -O0, ARM_MATH_AUTOVECTORIZE must be defined.
9697

9798
The compiler option '-fomit-frame-pointer' is enabled by default at -O and higher. With no optimization level you may need to specify '-fomit-frame-pointer' as a minimum.
9899

CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121
* Title: arm_nn_mat_mul_core_1x_s8.c
2222
* Description: General Matrix-multiplication function
2323
*
24-
* $Date: 7 July 2022
25-
* $Revision: V.3.0.0
24+
* $Date: 22 Aug 2022
25+
* $Revision: V.3.1.0
2626
*
2727
* Target Processor: Cortex-M cores
2828
* -------------------------------------------------------------------- */
@@ -54,7 +54,7 @@ arm_cmsis_nn_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements,
5454
const int32_t *bias,
5555
int8_t *output)
5656
{
57-
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
57+
#if defined(ARM_MATH_MVEI)
5858
const int8_t *col_base = col_base_ref;
5959
int32_t *output_mult = quant_params->multiplier;
6060
int32_t *output_shift = quant_params->shift;
@@ -70,6 +70,14 @@ arm_cmsis_nn_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements,
7070

7171
int32_t sum_tmp = 0;
7272

73+
#if defined(ARM_MATH_AUTOVECTORIZE)
74+
for (int j = 0; j < row_elements; j++)
75+
{
76+
int32_t col = col_base[j];
77+
sum_tmp += col;
78+
acc_n0 += row_base[j] * col;
79+
}
80+
#else
7381
__ASM volatile(" vldrb.8 q0, [%[col]], #16 \n"
7482
" wlstp.8 lr, %[cnt], 1f \n"
7583
"2: \n"
@@ -82,6 +90,7 @@ arm_cmsis_nn_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements,
8290
: [col] "+r"(col_base), [sum] "+Te"(sum_tmp), [row0] "+r"(row_base), [out0] "+Te"(acc_n0)
8391
: [cnt] "r"(row_elements)
8492
: "q0", "q1", "memory", "r14");
93+
#endif
8594

8695
sum_tmp *= conv_params->input_offset;
8796
acc_n0 += sum_tmp;

CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2010-2022 Arm Limited or its affiliates.
2+
* SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
33
*
44
* SPDX-License-Identifier: Apache-2.0
55
*
@@ -21,8 +21,8 @@
2121
* Title: arm_nn_mat_mul_core_4x_s8.c
2222
* Description: General matrix multiplication function for MVE extension
2323
*
24-
* $Date: 19. April 2022
25-
* $Revision: V.3.0.1
24+
* $Date: 22. Aug 2022
25+
* $Revision: V.3.1.0
2626
*
2727
* Target Processor: Cortex-M processors
2828
* -------------------------------------------------------------------- */
@@ -70,6 +70,17 @@ int8_t *arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
7070
const int8_t *col_base = col_base_ref + i * row_elements;
7171
int32_t sum_tmp = 0;
7272

73+
#if defined(ARM_MATH_AUTOVECTORIZE)
74+
for (int j = 0; j < row_elements; j++)
75+
{
76+
int32_t col = col_base[j];
77+
sum_tmp += col;
78+
acc_n0 += ip_row_0[j] * col;
79+
acc_n1 += ip_row_1[j] * col;
80+
acc_n2 += ip_row_2[j] * col;
81+
acc_n3 += ip_row_3[j] * col;
82+
}
83+
#else
7384
__ASM volatile(" vldrb.8 q0, [%[col]], #16 \n"
7485
" wlstp.8 lr, %[cnt], 1f \n"
7586
"2: \n"
@@ -97,6 +108,7 @@ int8_t *arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
97108
[out3] "+Te"(acc_n3)
98109
: [cnt] "r"(row_elements)
99110
: "q0", "q1", "q2", "q3", "q4", "memory", "r14");
111+
#endif
100112

101113
int32x4_t res = {acc_n0, acc_n1, acc_n2, acc_n3};
102114
sum_tmp *= conv_params->input_offset;

0 commit comments

Comments
 (0)