Skip to content
This repository was archived by the owner on Dec 18, 2025. It is now read-only.

Commit e2b3ec2

Browse files
authored
CMSIS-NN: update MVE intrinsics usage and predication (#1554)
1 parent ed78d6b commit e2b3ec2

4 files changed

Lines changed: 22 additions & 21 deletions

File tree

CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2021 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title: arm_nn_mat_mult_s8.c
  * Description: General Matrix-multiplication function
  *
- * $Date: 27. October 2021
- * $Revision: V.2.0.6
+ * $Date: 16 August 2022
+ * $Revision: V.2.0.7
  *
  * Target Processor: Cortex-M cores
  * -------------------------------------------------------------------- */
@@ -73,7 +73,7 @@ q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
         for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
         {
             mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
-            const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
+            const int16x8_t offset = vdupq_x_n_s16(col_offset, p);
             row_len_tmp -= 8;
 
             int16x8_t c0 = vldrbq_s16(ip_c0);
@@ -133,7 +133,7 @@ q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
         for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
         {
             const mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
-            const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
+            const int16x8_t offset = vdupq_x_n_s16(col_offset, p);
             row_len_tmp -= 8;
 
             int16x8_t c0 = vldrbq_s16(ip_c0);

CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2022 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title: arm_nn_vec_mat_mult_t_s8
  * Description: s8 vector by matrix (transposed) multiplication
  *
- * $Date: 2 May 2022
- * $Revision: V.4.0.1
+ * $Date: 16 Aug 2022
+ * $Revision: V.4.0.2
  *
  * Target Processor: Cortex-M
  *
@@ -115,7 +115,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,
         if (bias)
         {
             int32x4_t b = vldrwq_z_s32(bias, p);
-            acc = vaddq_m_s32(vuninitializedq_s32(), acc, b, p);
+            acc = vaddq_x_s32(acc, b, p);
             bias += 3;
         }
         const int32x4_t rhs_sum = {rhs_sum_0, rhs_sum_1, rhs_sum_2, 0};

CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
  * Title: arm_max_pool_s16.c
  * Description: Pooling function implementations
  *
- * $Date: 20 June 2022
- * $Revision: V.2.1.0
+ * $Date: 16 August 2022
+ * $Revision: V.2.1.1
  *
  * Target Processor: Cortex-M CPUs
  *
@@ -94,8 +94,8 @@ static void clamp_output(int16_t *source, int32_t length, const int16_t act_min,
         mve_pred16_t p = vctp16q((uint32_t)length);
         length -= 8;
         const int16x8_t src = vldrhq_z_s16(source, p);
-        int16x8_t res = vmaxq_m_s16(vuninitializedq_s16(), src, min, p);
-        res = vminq_m_s16(vuninitializedq_s16(), res, max, p);
+        int16x8_t res = vmaxq_x_s16(src, min, p);
+        res = vminq_x_s16(res, max, p);
         vstrhq_p_s16(source, res, p);
         source += 8;
     }

CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title: arm_max_pool_s8.c
  * Description: Pooling function implementations
  *
- * $Date: 19 April 2022
- * $Revision: V.3.0.0
+ * $Date: 16 August 2022
+ * $Revision: V.3.0.1
  *
  * Target Processor: Cortex-M CPUs
  *
@@ -40,7 +40,7 @@ static void compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, int
         mve_pred16_t p = vctp8q((uint32_t)length);
         const int8x16_t op_1 = vldrbq_z_s8(base, p);
         const int8x16_t op_2 = vldrbq_z_s8(target, p);
-        const int8x16_t max = vmaxq_m_s8(vuninitializedq_s8(), op_1, op_2, p);
+        const int8x16_t max = vmaxq_x_s8(op_1, op_2, p);
         vstrbq_p_s8(base, max, p);
         base += 16;
         target += 16;
@@ -98,15 +98,16 @@ static void clamp_output(q7_t *source, int32_t length, const int32_t act_min, co
 {
 #if defined(ARM_MATH_MVEI)
     int32_t loop_count = (length + 15) / 16;
+    const int8x16_t vmin = vdupq_n_s8((int8_t)act_min);
+    const int8x16_t vmax = vdupq_n_s8((int8_t)act_max);
+
     for (int i = 0; i < loop_count; i++)
     {
         mve_pred16_t p = vctp8q((uint32_t)length);
         length -= 16;
         const int8x16_t src = vldrbq_z_s8(source, p);
-        const int8x16_t predicated_min = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_min, p);
-        const int8x16_t predicated_max = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_max, p);
-        int8x16_t res = vmaxq_m_s8(vuninitializedq_s8(), src, predicated_min, p);
-        res = vminq_m_s8(vuninitializedq_s8(), res, predicated_max, p);
+        int8x16_t res = vmaxq_x_s8(src, vmin, p);
+        res = vminq_x_s8(res, vmax, p);
         vstrbq_p_s8(source, res, p);
         source += 16;
     }

0 commit comments

Comments
 (0)