CMSIS-NN: update MVE intrinsics usage and predication (#1554)

annietllnd · web-flow · commit e2b3ec292061 · 2022-08-19T15:43:41.000+02:00
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2021 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nn_mat_mult_s8.c
  * Description:  General Matrix-multiplication function
  *
- * $Date:        27. October 2021
- * $Revision:    V.2.0.6
+ * $Date:        16 August 2022
+ * $Revision:    V.2.0.7
  *
  * Target Processor:  Cortex-M cores
  * -------------------------------------------------------------------- */
@@ -73,7 +73,7 @@ q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
             for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
             {
                 mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
-                const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
+                const int16x8_t offset = vdupq_x_n_s16(col_offset, p);
                 row_len_tmp -= 8;
 
                 int16x8_t c0 = vldrbq_s16(ip_c0);
@@ -133,7 +133,7 @@ q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
                 for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
                 {
                     const mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
-                    const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
+                    const int16x8_t offset = vdupq_x_n_s16(col_offset, p);
                     row_len_tmp -= 8;
 
                     int16x8_t c0 = vldrbq_s16(ip_c0);
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2022 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nn_vec_mat_mult_t_s8
  * Description:  s8 vector by matrix (transposed) multiplication
  *
- * $Date:        2 May 2022
- * $Revision:    V.4.0.1
+ * $Date:        16 Aug 2022
+ * $Revision:    V.4.0.2
  *
  * Target Processor:  Cortex-M
  *
@@ -115,7 +115,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,
         if (bias)
         {
             int32x4_t b = vldrwq_z_s32(bias, p);
-            acc = vaddq_m_s32(vuninitializedq_s32(), acc, b, p);
+            acc = vaddq_x_s32(acc, b, p);
             bias += 3;
         }
         const int32x4_t rhs_sum = {rhs_sum_0, rhs_sum_1, rhs_sum_2, 0};
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c
@@ -21,8 +21,8 @@
  * Title:        arm_max_pool_s16.c
  * Description:  Pooling function implementations
  *
- * $Date:        20 June 2022
- * $Revision:    V.2.1.0
+ * $Date:        16 August 2022
+ * $Revision:    V.2.1.1
  *
  * Target Processor:  Cortex-M CPUs
  *
@@ -94,8 +94,8 @@ static void clamp_output(int16_t *source, int32_t length, const int16_t act_min,
         mve_pred16_t p = vctp16q((uint32_t)length);
         length -= 8;
         const int16x8_t src = vldrhq_z_s16(source, p);
-        int16x8_t res = vmaxq_m_s16(vuninitializedq_s16(), src, min, p);
-        res = vminq_m_s16(vuninitializedq_s16(), res, max, p);
+        int16x8_t res = vmaxq_x_s16(src, min, p);
+        res = vminq_x_s16(res, max, p);
         vstrhq_p_s16(source, res, p);
         source += 8;
     }
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_max_pool_s8.c
  * Description:  Pooling function implementations
  *
- * $Date:        19 April 2022
- * $Revision:    V.3.0.0
+ * $Date:        16 August 2022
+ * $Revision:    V.3.0.1
  *
  * Target Processor:  Cortex-M CPUs
  *
@@ -40,7 +40,7 @@ static void compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, int
         mve_pred16_t p = vctp8q((uint32_t)length);
         const int8x16_t op_1 = vldrbq_z_s8(base, p);
         const int8x16_t op_2 = vldrbq_z_s8(target, p);
-        const int8x16_t max = vmaxq_m_s8(vuninitializedq_s8(), op_1, op_2, p);
+        const int8x16_t max = vmaxq_x_s8(op_1, op_2, p);
         vstrbq_p_s8(base, max, p);
         base += 16;
         target += 16;
@@ -98,15 +98,16 @@ static void clamp_output(q7_t *source, int32_t length, const int32_t act_min, co
 {
 #if defined(ARM_MATH_MVEI)
     int32_t loop_count = (length + 15) / 16;
+    const int8x16_t vmin = vdupq_n_s8((int8_t)act_min);
+    const int8x16_t vmax = vdupq_n_s8((int8_t)act_max);
+
     for (int i = 0; i < loop_count; i++)
     {
         mve_pred16_t p = vctp8q((uint32_t)length);
         length -= 16;
         const int8x16_t src = vldrbq_z_s8(source, p);
-        const int8x16_t predicated_min = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_min, p);
-        const int8x16_t predicated_max = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_max, p);
-        int8x16_t res = vmaxq_m_s8(vuninitializedq_s8(), src, predicated_min, p);
-        res = vminq_m_s8(vuninitializedq_s8(), res, predicated_max, p);
+        int8x16_t res = vmaxq_x_s8(src, vmin, p);
+        res = vminq_x_s8(res, vmax, p);
         vstrbq_p_s8(source, res, p);
         source += 16;
     }

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`/*`
`2`		`- * Copyright (C) 2010-2021 Arm Limited or its affiliates.`
	`2`	`+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>`
`3`	`3`	`*`
`4`	`4`	`* SPDX-License-Identifier: Apache-2.0`
`5`	`5`	`*`
`@@ -21,8 +21,8 @@`
`21`	`21`	`* Title: arm_nn_mat_mult_s8.c`
`22`	`22`	`* Description: General Matrix-multiplication function`
`23`	`23`	`*`
`24`		`- * $Date: 27. October 2021`
`25`		`- * $Revision: V.2.0.6`
	`24`	`+ * $Date: 16 August 2022`
	`25`	`+ * $Revision: V.2.0.7`
`26`	`26`	`*`
`27`	`27`	`* Target Processor: Cortex-M cores`
`28`	`28`	`* -------------------------------------------------------------------- */`
`@@ -73,7 +73,7 @@ q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,`
`73`	`73`	`for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)`
`74`	`74`	`{`
`75`	`75`	`mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);`
`76`		`- const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);`
	`76`	`+ const int16x8_t offset = vdupq_x_n_s16(col_offset, p);`
`77`	`77`	`row_len_tmp -= 8;`
`78`	`78`
`79`	`79`	`int16x8_t c0 = vldrbq_s16(ip_c0);`
`@@ -133,7 +133,7 @@ q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,`
`133`	`133`	`for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)`
`134`	`134`	`{`
`135`	`135`	`const mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);`
`136`		`- const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);`
	`136`	`+ const int16x8_t offset = vdupq_x_n_s16(col_offset, p);`
`137`	`137`	`row_len_tmp -= 8;`
`138`	`138`
`139`	`139`	`int16x8_t c0 = vldrbq_s16(ip_c0);`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`/*`
`2`		`- * Copyright (C) 2020-2022 Arm Limited or its affiliates.`
	`2`	`+ * SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>`
`3`	`3`	`*`
`4`	`4`	`* SPDX-License-Identifier: Apache-2.0`
`5`	`5`	`*`
`@@ -21,8 +21,8 @@`
`21`	`21`	`* Title: arm_nn_vec_mat_mult_t_s8`
`22`	`22`	`* Description: s8 vector by matrix (transposed) multiplication`
`23`	`23`	`*`
`24`		`- * $Date: 2 May 2022`
`25`		`- * $Revision: V.4.0.1`
	`24`	`+ * $Date: 16 Aug 2022`
	`25`	`+ * $Revision: V.4.0.2`
`26`	`26`	`*`
`27`	`27`	`* Target Processor: Cortex-M`
`28`	`28`	`*`
`@@ -115,7 +115,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,`
`115`	`115`	`if (bias)`
`116`	`116`	`{`
`117`	`117`	`int32x4_t b = vldrwq_z_s32(bias, p);`
`118`		`- acc = vaddq_m_s32(vuninitializedq_s32(), acc, b, p);`
	`118`	`+ acc = vaddq_x_s32(acc, b, p);`
`119`	`119`	`bias += 3;`
`120`	`120`	`}`
`121`	`121`	`const int32x4_t rhs_sum = {rhs_sum_0, rhs_sum_1, rhs_sum_2, 0};`