Skip to content

Commit 9c9151c

Browse files
committed
feat: Add NETopKV function.
* The Neon(TM) implementation of TopKV reduces execution time from 447.8 ms (CPP) to 11.65 ms for the same workload (F32, C=1000, N=32000, k=3, 6 threads), achieving a speedup of approximately 38×. This gain comes from SIMD vectorization, removal of per-element branches, and a more efficient inner loop.
* Resolves ARMCL-1227

Change-Id: Ifdf161ce4254dc5ecd57aff9ae22410facd31705
Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com>
1 parent b63392e commit 9c9151c

File tree

26 files changed

+1884
-5
lines changed

26 files changed

+1884
-5
lines changed

Android.bp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,7 @@ cc_library_static {
462462
"src/cpu/kernels/CpuScatterKernel.cpp",
463463
"src/cpu/kernels/CpuSoftmaxKernel.cpp",
464464
"src/cpu/kernels/CpuSubKernel.cpp",
465+
"src/cpu/kernels/CpuTopKVKernel.cpp",
465466
"src/cpu/kernels/CpuTransposeKernel.cpp",
466467
"src/cpu/kernels/CpuWeightsReshapeKernel.cpp",
467468
"src/cpu/kernels/CpuWinogradConv2dKernel.cpp",
@@ -611,6 +612,11 @@ cc_library_static {
611612
"src/cpu/kernels/sub/neon/qasymm8.cpp",
612613
"src/cpu/kernels/sub/neon/qasymm8_signed.cpp",
613614
"src/cpu/kernels/sub/neon/qsymm16.cpp",
615+
"src/cpu/kernels/topkv/generic/neon/fp16.cpp",
616+
"src/cpu/kernels/topkv/generic/neon/fp32.cpp",
617+
"src/cpu/kernels/topkv/generic/neon/integer.cpp",
618+
"src/cpu/kernels/topkv/generic/neon/qasymm8.cpp",
619+
"src/cpu/kernels/topkv/generic/neon/qasymm8_signed.cpp",
614620
"src/cpu/operators/CpuActivation.cpp",
615621
"src/cpu/operators/CpuAdd.cpp",
616622
"src/cpu/operators/CpuAddMulAdd.cpp",
@@ -649,6 +655,7 @@ cc_library_static {
649655
"src/cpu/operators/CpuScatter.cpp",
650656
"src/cpu/operators/CpuSoftmax.cpp",
651657
"src/cpu/operators/CpuSub.cpp",
658+
"src/cpu/operators/CpuTopKV.cpp",
652659
"src/cpu/operators/CpuTranspose.cpp",
653660
"src/cpu/operators/CpuWinogradConv2d.cpp",
654661
"src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp",
@@ -978,6 +985,7 @@ cc_library_static {
978985
"src/runtime/NEON/functions/NEStackLayer.cpp",
979986
"src/runtime/NEON/functions/NEStridedSlice.cpp",
980987
"src/runtime/NEON/functions/NETile.cpp",
988+
"src/runtime/NEON/functions/NETopKV.cpp",
981989
"src/runtime/NEON/functions/NETranspose.cpp",
982990
"src/runtime/NEON/functions/NEUnstack.cpp",
983991
"src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp",

arm_compute/runtime/NEON/NEFunctions.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2016-2025 Arm Limited.
2+
* Copyright (c) 2016-2026 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -112,6 +112,7 @@
112112
#include "arm_compute/runtime/NEON/functions/NEStackLayer.h"
113113
#include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
114114
#include "arm_compute/runtime/NEON/functions/NETile.h"
115+
#include "arm_compute/runtime/NEON/functions/NETopKV.h"
115116
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
116117
#include "arm_compute/runtime/NEON/functions/NEUnstack.h"
117118
#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"

arm_compute/runtime/NEON/functions/NETopKV.h

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
/*
2+
* Copyright (c) 2026 Arm Limited.
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to
8+
* deal in the Software without restriction, including without limitation the
9+
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10+
* sell copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in all
14+
* copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
* SOFTWARE.
23+
*/
24+
#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NETOPKV_H
25+
#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NETOPKV_H
26+
27+
/** @file
28+
* @publicapi
29+
*/
30+
31+
#include "arm_compute/core/Error.h"
32+
#include "arm_compute/runtime/IFunction.h"
33+
34+
#include <memory>
35+
36+
namespace arm_compute
37+
{
38+
// Forward declarations
39+
class ITensor;
40+
class ITensorInfo;
41+
42+
/** Basic function to run cpu::kernels::CpuTopKVKernel
43+
*
44+
*/
45+
class NETopKV : public IFunction
46+
{
47+
public:
48+
/** Constructor */
49+
NETopKV();
50+
/** Prevent instances of this class from being copied (As this class contains pointers) */
51+
NETopKV(const NETopKV &) = delete;
52+
/** Default move constructor */
53+
NETopKV(NETopKV &&);
54+
/** Prevent instances of this class from being copied (As this class contains pointers) */
55+
NETopKV &operator=(const NETopKV &) = delete;
56+
/** Default move assignment operator */
57+
NETopKV &operator=(NETopKV &&);
58+
/** Destructor */
59+
~NETopKV();
60+
/** Set the input and output of the kernel.
61+
*
62+
* Valid data layouts:
63+
* - All
64+
*
65+
* Valid data type configurations:
66+
* |src1 |src2 |dst |
67+
* |:--------------|:--------------|:--------------|
68+
* |QASYMM8 |U32 |U8 |
69+
* |QASYMM8_SIGNED |U32 |U8 |
70+
* |S32 |U32 |U8 |
71+
* |F16 |U32 |U8 |
72+
* |F32 |U32 |U8 |
73+
*
74+
* @param[in] predictions A batch_size x classes tensor. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED/S32
75+
* @param[in] targets A batch_size 1D tensor of class ids. Data types supported: U32
76+
* @param[out] output Computed precision at @p k as a bool 1D tensor. Data types supported: U8
77+
* @param[in] k Number of top elements to look at for computing precision.
78+
*/
79+
void configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k);
80+
81+
/** Static function to check if given info will lead to a valid configuration.
82+
*
83+
* Takes the same arguments as @ref NETopKV::configure(), with tensor infos in place of tensors.
84+
*
85+
* @return a status
86+
*/
87+
static Status
88+
validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k);
89+
90+
// Inherited methods overridden
91+
void run() override;
92+
93+
private:
94+
struct Impl;
95+
std::unique_ptr<Impl> _impl;
96+
};
97+
} // namespace arm_compute
98+
#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NETOPKV_H

arm_compute/runtime/OperatorList.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021-2023, 2025 Arm Limited.
2+
* Copyright (c) 2021-2023, 2025-2026 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -1033,6 +1033,16 @@
10331033
*
10341034
*/
10351035

1036+
/** TopKV
1037+
*
1038+
* Description:
1039+
* Function to compute the precision at k of a batch of class predictions against their target class ids
1040+
*
1041+
* Equivalent Android NNAPI Op:
1042+
* ANEURALNETWORKS_TOPK_V2
1043+
*
1044+
*/
1045+
10361046
/** Transpose
10371047
*
10381048
* Description:

docs/user_guide/operator_list.dox

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
///
2-
/// Copyright (c) 2021-2025 Arm Limited.
2+
/// Copyright (c) 2021-2026 Arm Limited.
33
///
44
/// SPDX-License-Identifier: MIT
55
///
@@ -3219,6 +3219,27 @@ where N = batches, C = channels, H = height, W = width, D = depth
32193219
<tr><th>src<th>dst
32203220
<tr><td>All<td>All
32213221
</table>
3222+
<tr>
3223+
<td rowspan="1">TopKV
3224+
<td rowspan="1" style="width:200px;"> Function to compute the precision at k of a batch of class predictions against their target class ids
3225+
<td rowspan="1">
3226+
<ul>
3227+
<li>ANEURALNETWORKS_TOPK_V2
3228+
</ul>
3229+
<td>NETopKV
3230+
<td>
3231+
<ul>
3232+
<li>All
3233+
</ul>
3234+
<td>
3235+
<table>
3236+
<tr><th>src1<th>src2<th>dst
3237+
<tr><td>QASYMM8<td>U32<td>U8
3238+
<tr><td>QASYMM8_SIGNED<td>U32<td>U8
3239+
<tr><td>S32<td>U32<td>U8
3240+
<tr><td>F16<td>U32<td>U8
3241+
<tr><td>F32<td>U32<td>U8
3242+
</table>
32223243
<tr>
32233244
<td rowspan="2">Transpose
32243245
<td rowspan="2" style="width:200px;"> Function to transpose a 2D tensor.

filelist.json

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2453,6 +2453,27 @@
24532453
]
24542454
}
24552455
},
2456+
"TopKV": {
2457+
"files": {
2458+
"common": [
2459+
"src/cpu/kernels/CpuTopKVKernel.cpp",
2460+
"src/cpu/operators/CpuTopKV.cpp",
2461+
"src/runtime/NEON/functions/NETopKV.cpp"
2462+
],
2463+
"neon": {
2464+
"fp16": [ "src/cpu/kernels/topkv/generic/neon/fp16.cpp" ],
2465+
"fp32": [ "src/cpu/kernels/topkv/generic/neon/fp32.cpp" ],
2466+
"integer": [ "src/cpu/kernels/topkv/generic/neon/integer.cpp" ],
2467+
"qasymm8": [
2468+
"src/cpu/kernels/topkv/generic/neon/qasymm8.cpp"
2469+
],
2470+
"qasymm8_signed": [
2471+
"src/cpu/kernels/topkv/generic/neon/qasymm8_signed.cpp"
2472+
]
2473+
}
2474+
2475+
}
2476+
},
24562477
"Transpose": {
24572478
"files": {
24582479
"common": [

src/BUILD.bazel

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -738,6 +738,7 @@ filegroup(
738738
"cpu/kernels/CpuScatterKernel.cpp",
739739
"cpu/kernels/CpuSoftmaxKernel.cpp",
740740
"cpu/kernels/CpuSubKernel.cpp",
741+
"cpu/kernels/CpuTopKVKernel.cpp",
741742
"cpu/kernels/CpuTransposeKernel.cpp",
742743
"cpu/kernels/CpuWeightsReshapeKernel.cpp",
743744
"cpu/kernels/CpuWinogradConv2dKernel.cpp",
@@ -848,6 +849,10 @@ filegroup(
848849
"cpu/kernels/sub/neon/qasymm8.cpp",
849850
"cpu/kernels/sub/neon/qasymm8_signed.cpp",
850851
"cpu/kernels/sub/neon/qsymm16.cpp",
852+
"cpu/kernels/topkv/generic/neon/fp32.cpp",
853+
"cpu/kernels/topkv/generic/neon/integer.cpp",
854+
"cpu/kernels/topkv/generic/neon/qasymm8.cpp",
855+
"cpu/kernels/topkv/generic/neon/qasymm8_signed.cpp",
851856
"cpu/operators/CpuActivation.cpp",
852857
"cpu/operators/CpuAdd.cpp",
853858
"cpu/operators/CpuAddMulAdd.cpp",
@@ -886,6 +891,7 @@ filegroup(
886891
"cpu/operators/CpuScatter.cpp",
887892
"cpu/operators/CpuSoftmax.cpp",
888893
"cpu/operators/CpuSub.cpp",
894+
"cpu/operators/CpuTopKV.cpp",
889895
"cpu/operators/CpuTranspose.cpp",
890896
"cpu/operators/CpuWinogradConv2d.cpp",
891897
"cpu/operators/internal/CpuGemmAssemblyDispatch.cpp",
@@ -994,6 +1000,7 @@ filegroup(
9941000
"runtime/NEON/functions/NEStackLayer.cpp",
9951001
"runtime/NEON/functions/NEStridedSlice.cpp",
9961002
"runtime/NEON/functions/NETile.cpp",
1003+
"runtime/NEON/functions/NETopKV.cpp",
9971004
"runtime/NEON/functions/NETranspose.cpp",
9981005
"runtime/NEON/functions/NEUnstack.cpp",
9991006
"runtime/NEON/functions/NEWinogradConvolutionLayer.cpp",
@@ -1109,7 +1116,8 @@ filegroup(
11091116
"cpu/kernels/scatter/generic/neon/fp16.cpp",
11101117
"cpu/kernels/select/generic/neon/fp16.cpp",
11111118
"cpu/kernels/softmax/generic/neon/fp16.cpp",
1112-
"cpu/kernels/sub/neon/fp16.cpp"] +
1119+
"cpu/kernels/sub/neon/fp16.cpp",
1120+
"cpu/kernels/topkv/generic/neon/fp16.cpp"] +
11131121
glob(["**/*.h",
11141122
"**/*.hpp",
11151123
"**/*.inl"]),

src/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -732,6 +732,7 @@ target_sources(
732732
cpu/kernels/CpuScatterKernel.cpp
733733
cpu/kernels/CpuSoftmaxKernel.cpp
734734
cpu/kernels/CpuSubKernel.cpp
735+
cpu/kernels/CpuTopKVKernel.cpp
735736
cpu/kernels/CpuTransposeKernel.cpp
736737
cpu/kernels/CpuWeightsReshapeKernel.cpp
737738
cpu/kernels/CpuWinogradConv2dKernel.cpp
@@ -842,6 +843,10 @@ target_sources(
842843
cpu/kernels/sub/neon/qasymm8.cpp
843844
cpu/kernels/sub/neon/qasymm8_signed.cpp
844845
cpu/kernels/sub/neon/qsymm16.cpp
846+
cpu/kernels/topkv/generic/neon/fp32.cpp
847+
cpu/kernels/topkv/generic/neon/integer.cpp
848+
cpu/kernels/topkv/generic/neon/qasymm8.cpp
849+
cpu/kernels/topkv/generic/neon/qasymm8_signed.cpp
845850
cpu/operators/CpuActivation.cpp
846851
cpu/operators/CpuAdd.cpp
847852
cpu/operators/CpuAddMulAdd.cpp
@@ -880,6 +885,7 @@ target_sources(
880885
cpu/operators/CpuScatter.cpp
881886
cpu/operators/CpuSoftmax.cpp
882887
cpu/operators/CpuSub.cpp
888+
cpu/operators/CpuTopKV.cpp
883889
cpu/operators/CpuTranspose.cpp
884890
cpu/operators/CpuWinogradConv2d.cpp
885891
cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -988,6 +994,7 @@ target_sources(
988994
runtime/NEON/functions/NEStackLayer.cpp
989995
runtime/NEON/functions/NEStridedSlice.cpp
990996
runtime/NEON/functions/NETile.cpp
997+
runtime/NEON/functions/NETopKV.cpp
991998
runtime/NEON/functions/NETranspose.cpp
992999
runtime/NEON/functions/NEUnstack.cpp
9931000
runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -1109,4 +1116,5 @@ target_sources(
11091116
cpu/kernels/select/generic/neon/fp16.cpp
11101117
cpu/kernels/softmax/generic/neon/fp16.cpp
11111118
cpu/kernels/sub/neon/fp16.cpp
1119+
cpu/kernels/topkv/generic/neon/fp16.cpp
11121120
)

0 commit comments

Comments
 (0)