Skip to content

Commit ed57040

Browse files
authored
NNlib update to latest version for HiFi1 build (pytorch#18112)
Differential Revision: D96245585
Pull Request resolved: pytorch#18112
1 parent af25cd8 commit ed57040

17 files changed

Lines changed: 96 additions & 4025 deletions

backends/cadence/aot/functions_hifi.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -286,12 +286,12 @@
286286
- func: cadence::im2row.out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, Tensor in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
287287
kernels:
288288
- arg_meta: null
289-
kernel_name: impl::HiFi::native::im2row_out
289+
kernel_name: impl::HiFi::im2row_out
290290

291291
- func: cadence::im2row.per_tensor_out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, int in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
292292
kernels:
293293
- arg_meta: null
294-
kernel_name: impl::HiFi::native::im2row_per_tensor_out
294+
kernel_name: impl::HiFi::im2row_per_tensor_out
295295

296296
- func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
297297
variants: function

backends/cadence/hifi/kernels/CMakeLists.txt

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,25 +8,15 @@
88
add_library(
99
cadence_kernels
1010
kernels.cpp
11-
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp
12-
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c
13-
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c
14-
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
1511
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c
1612
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_clamp_f32_broadcast.c
17-
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
1813
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
1914
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c
20-
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c
21-
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c
2215
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c
23-
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c
24-
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
2516
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c
2617
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c
27-
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
18+
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c
2819
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
29-
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
3020
)
3121
# Let files say "include <executorch/path/to/header.h>".
3222
set(_common_include_directories

backends/cadence/hifi/kernels/kernels.h

Lines changed: 0 additions & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -18,24 +18,9 @@ using executorch::runtime::Result;
1818

1919
/* Potential NNLIB function/APIs */
2020

21-
extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(
22-
FLOAT32* __restrict__ p_out,
23-
const WORD32* const p_out_shape,
24-
const FLOAT32* __restrict__ p_inp1,
25-
const WORD32* const p_inp1_shape,
26-
const FLOAT32* __restrict__ p_inp2,
27-
const WORD32* const p_inp2_shape);
28-
2921
extern "C" void
3022
xa_nn_elm_atan2_f32(FLOAT32* z, const FLOAT32* y, const FLOAT32* x, WORD32 N);
3123

32-
extern "C" WORD32 xa_nn_elm_clamp_f32xf32xf32_f32(
33-
FLOAT32* __restrict__ p_out,
34-
const FLOAT32* __restrict__ p_inp,
35-
const FLOAT32* __restrict__ p_min,
36-
const FLOAT32* __restrict__ p_max,
37-
WORD32 num_elm);
38-
3924
extern "C" WORD32 xa_nn_elm_clamp_broadcast_4D_f32Xf32xf32_f32(
4025
FLOAT32* __restrict__ p_out,
4126
const WORD32* const p_out_shape,
@@ -46,14 +31,6 @@ extern "C" WORD32 xa_nn_elm_clamp_broadcast_4D_f32Xf32xf32_f32(
4631
const FLOAT32* __restrict__ p_max,
4732
const WORD32* const p_max_shape);
4833

49-
extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32(
50-
FLOAT32* __restrict__ p_out,
51-
const WORD32* const p_out_shape,
52-
const FLOAT32* __restrict__ p_inp1,
53-
const WORD32* const p_inp1_shape,
54-
const FLOAT32* __restrict__ p_inp2,
55-
const WORD32* const p_inp2_shape);
56-
5734
extern "C" WORD32 xa_nn_elm_div_mode_f32xf32_f32(
5835
FLOAT32* __restrict__ p_out,
5936
const FLOAT32* __restrict__ p_inp1,
@@ -70,22 +47,6 @@ extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32(
7047
const WORD32* const p_inp2_shape,
7148
WORD32 mode);
7249

73-
extern "C" WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(
74-
WORD8* __restrict__ p_out,
75-
const FLOAT32* __restrict__ p_inp1,
76-
const FLOAT32* __restrict__ p_inp2,
77-
WORD32 num_elm,
78-
WORD32 kernel_type);
79-
80-
extern "C" WORD32 xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
81-
WORD8* __restrict__ p_out,
82-
const WORD32* const p_out_shape,
83-
const FLOAT32* __restrict__ p_inp1,
84-
const WORD32* const p_inp1_shape,
85-
const FLOAT32* __restrict__ p_inp2,
86-
const WORD32* const p_inp2_shape,
87-
WORD32 kernel_type);
88-
8950
extern "C" WORD32 xa_nn_elm_fmod_f32xf32_f32(
9051
FLOAT32* __restrict__ p_out,
9152
const FLOAT32* __restrict__ p_inp1,
@@ -106,42 +67,6 @@ extern "C" WORD32 xa_nn_elm_logicalxor_boolxbool_bool(
10667
const WORD8* __restrict__ p_inp2,
10768
WORD32 num_elm);
10869

109-
extern "C" WORD32 xa_nn_elm_maximum_f32xf32_f32(
110-
FLOAT32* __restrict__ p_out,
111-
const FLOAT32* __restrict__ p_inp1,
112-
const FLOAT32* __restrict__ p_inp2,
113-
WORD32 num_elm);
114-
115-
extern "C" WORD32 xa_nn_elm_maximum_broadcast_4D_f32xf32_f32(
116-
FLOAT32* __restrict__ p_out,
117-
const WORD32* const p_out_shape,
118-
const FLOAT32* __restrict__ p_inp1,
119-
const WORD32* const p_inp1_shape,
120-
const FLOAT32* __restrict__ p_inp2,
121-
const WORD32* const p_inp2_shape);
122-
123-
extern "C" WORD32 xa_nn_elm_minimum_f32xf32_f32(
124-
FLOAT32* __restrict__ p_out,
125-
const FLOAT32* __restrict__ p_inp1,
126-
const FLOAT32* __restrict__ p_inp2,
127-
WORD32 num_elm);
128-
129-
extern "C" WORD32 xa_nn_elm_minimum_broadcast_4D_f32xf32_f32(
130-
FLOAT32* __restrict__ p_out,
131-
const WORD32* const p_out_shape,
132-
const FLOAT32* __restrict__ p_inp1,
133-
const WORD32* const p_inp1_shape,
134-
const FLOAT32* __restrict__ p_inp2,
135-
const WORD32* const p_inp2_shape);
136-
137-
extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(
138-
FLOAT32* __restrict__ p_out,
139-
const WORD32* const p_out_shape,
140-
const FLOAT32* __restrict__ p_inp1,
141-
const WORD32* const p_inp1_shape,
142-
const FLOAT32* __restrict__ p_inp2,
143-
const WORD32* const p_inp2_shape);
144-
14570
extern "C" void xa_nn_elm_pow_f32(
14671
FLOAT32* __restrict__ z,
14772
const FLOAT32* __restrict__ x,
@@ -162,23 +87,6 @@ extern "C" WORD32 xa_nn_elm_remainder_broadcast_4D_f32xf32_f32(
16287
const FLOAT32* __restrict__ p_inp2,
16388
const WORD32* const p_inp2_shape);
16489

165-
extern "C" WORD32 xa_nn_elm_where_f32xf32_f32(
166-
FLOAT32* __restrict__ p_out,
167-
const FLOAT32* __restrict__ p_inp1,
168-
const FLOAT32* __restrict__ p_inp2,
169-
const unsigned char* __restrict__ p_condition,
170-
WORD32 num_elm);
171-
172-
extern "C" WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32(
173-
FLOAT32* __restrict__ p_out,
174-
const WORD32* const p_out_shape,
175-
const FLOAT32* __restrict__ p_inp1,
176-
const WORD32* const p_inp1_shape,
177-
const FLOAT32* __restrict__ p_inp2,
178-
const WORD32* const p_inp2_shape,
179-
const unsigned char* __restrict__ p_condition,
180-
const WORD32* const p_condition_shape);
181-
18290
extern "C" WORD32 xa_nn_im2row_quantized(
18391
const WORD8* __restrict__ data_im,
18492
const WORD32 in_zero_point,
@@ -212,60 +120,12 @@ extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32(
212120
WORD32 num_axis_dims,
213121
void* __restrict__ p_scratch_in);
214122

215-
extern "C" WORD32 xa_nn_transpose_32_32(
216-
WORD32* __restrict__ p_out,
217-
const WORD32* const p_out_shape,
218-
const WORD32* __restrict__ p_inp,
219-
const WORD32* const p_inp_shape,
220-
const WORD32* __restrict__ p_permute_vec,
221-
WORD32 num_out_dims,
222-
WORD32 num_inp_dims);
223-
224123
namespace impl {
225124
namespace HiFi {
226125
namespace kernels {
227126

228127
void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size);
229128

230-
void memcpy(void* dst, const void* src, size_t num_bytes);
231-
232-
WORD32 matmul_asym8uxasym8u_asym8u(
233-
UWORD8* __restrict__ p_out, // output uint8 matrix
234-
const UWORD8* __restrict__ p_mat1, // weight uint8 matrix
235-
const UWORD8* __restrict__ p_vec1, // input uint8 matrix
236-
const WORD32* __restrict__ p_bias, // bias int32 vec
237-
WORD32 rows, // rows of p_mat1
238-
WORD32 cols1, // columns of p_mat1
239-
WORD32 row_stride1, // row stride of p_mat1
240-
WORD32 vec_count, // rows of p_mat2
241-
WORD32 vec_offset, // vec_offset of p_mat2.
242-
WORD32 out_offset, // out_offset, i.e., offset of next output element
243-
WORD32 out_stride, // out_stride, i.e., stride to go to next output row
244-
WORD32 mat1_zero_bias, // zero_point of p_mat1
245-
WORD32 vec1_zero_bias, // zero_point of p_vec1
246-
const WORD32* __restrict__ out_multiplier,
247-
const WORD32* __restrict__ out_shift,
248-
WORD32 out_zero_bias,
249-
bool per_channel_quantized = false); // per-channel quantized weight
250-
251-
WORD32 xa_nn_matmul_asym8uxasym8u_asym8u(
252-
UWORD8* __restrict__ p_out,
253-
const UWORD8* __restrict__ p_mat1,
254-
const UWORD8* __restrict__ p_mat2,
255-
const WORD32* __restrict__ p_bias,
256-
WORD32 rows,
257-
WORD32 cols,
258-
WORD32 row_stride,
259-
WORD32 vec_count,
260-
WORD32 vec_offset,
261-
WORD32 out_offset,
262-
WORD32 out_stride,
263-
WORD32 mat1_zero_bias,
264-
WORD32 vec1_zero_bias,
265-
WORD32 out_multiplier,
266-
WORD32 out_shift,
267-
WORD32 out_zero_bias);
268-
269129
template <typename T>
270130
T quantize(const float x, float scale, int32_t zero_point);
271131

backends/cadence/hifi/operators/op_clamp.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -155,13 +155,13 @@ Tensor& clamp_Tensor_out(
155155
inp_shape[i + off_inp] = in.size(i);
156156
}
157157

158-
WORD32 ret_val = xa_nn_elm_minimum_broadcast_4D_f32xf32_f32(
158+
WORD32 ret_val = xa_nn_elm_min_4D_Bcast_f32xf32_f32(
159159
out_data, out_shape, inp_data, inp_shape, max_data, max_shape);
160160

161161
ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
162162

163163
} else {
164-
WORD32 ret_val = xa_nn_elm_minimum_f32xf32_f32(
164+
WORD32 ret_val = xa_nn_elm_min_f32xf32_f32(
165165
out_data, inp_data, max_data, out.numel());
166166

167167
ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
@@ -192,13 +192,13 @@ Tensor& clamp_Tensor_out(
192192
min_shape[i + off_min] = min.size(i);
193193
for (int i = 0; i < inp_dim; i++)
194194
inp_shape[i + off_inp] = in.size(i);
195-
WORD32 ret_val = xa_nn_elm_maximum_broadcast_4D_f32xf32_f32(
195+
WORD32 ret_val = xa_nn_elm_max_4D_Bcast_f32xf32_f32(
196196
out_data, out_shape, inp_data, inp_shape, min_data, min_shape);
197197

198198
ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
199199

200200
} else {
201-
WORD32 ret_val = xa_nn_elm_maximum_f32xf32_f32(
201+
WORD32 ret_val = xa_nn_elm_max_f32xf32_f32(
202202
out_data, inp_data, min_data, out.numel());
203203

204204
ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);

backends/cadence/hifi/operators/op_eq.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,15 +94,21 @@ Tensor& eq_Tensor_out(
9494
for (int i = 0; i < b.dim(); i++)
9595
inp2_shape[i + off_b] = b.size(i);
9696

97-
WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
98-
p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 4);
97+
WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32(
98+
p_out,
99+
out_shape,
100+
p_inp1,
101+
inp1_shape,
102+
p_inp2,
103+
inp2_shape,
104+
COMPARE_EQUAL);
99105

100106
ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
101107
} else {
102108
int num_elm = out.numel();
103109

104-
WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(
105-
p_out, p_inp1, p_inp2, num_elm, 4);
110+
WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32(
111+
p_out, p_inp1, p_inp2, num_elm, COMPARE_EQUAL);
106112

107113
ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
108114
}

backends/cadence/hifi/operators/op_ge.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,15 +94,21 @@ Tensor& ge_Tensor_out(
9494
for (int i = 0; i < b.dim(); i++)
9595
inp2_shape[i + off_b] = b.size(i);
9696

97-
WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
98-
p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 0);
97+
WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32(
98+
p_out,
99+
out_shape,
100+
p_inp1,
101+
inp1_shape,
102+
p_inp2,
103+
inp2_shape,
104+
COMPARE_GREATEREQUAL);
99105

100106
ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
101107
} else {
102108
int num_elm = out.numel();
103109

104-
WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(
105-
p_out, p_inp1, p_inp2, num_elm, 0);
110+
WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32(
111+
p_out, p_inp1, p_inp2, num_elm, COMPARE_GREATEREQUAL);
106112

107113
ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
108114
}

backends/cadence/hifi/operators/op_gt.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,15 +96,21 @@ Tensor& gt_Tensor_out(
9696
for (int i = 0; i < b.dim(); i++)
9797
inp2_shape[i + off_b] = b.size(i);
9898

99-
WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
100-
p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 1);
99+
WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32(
100+
p_out,
101+
out_shape,
102+
p_inp1,
103+
inp1_shape,
104+
p_inp2,
105+
inp2_shape,
106+
COMPARE_GREATER);
101107

102108
ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
103109
} else {
104110
int num_elm = out.numel();
105111

106-
WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(
107-
p_out, p_inp1, p_inp2, num_elm, 1);
112+
WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32(
113+
p_out, p_inp1, p_inp2, num_elm, COMPARE_GREATER);
108114

109115
ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
110116
}

backends/cadence/hifi/operators/op_le.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,15 +95,21 @@ Tensor& le_Tensor_out(
9595
for (int i = 0; i < b.dim(); i++)
9696
inp2_shape[i + off_b] = b.size(i);
9797

98-
WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
99-
p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 2);
98+
WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32(
99+
p_out,
100+
out_shape,
101+
p_inp1,
102+
inp1_shape,
103+
p_inp2,
104+
inp2_shape,
105+
COMPARE_LESSEREQUAL);
100106

101107
ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
102108
} else {
103109
int num_elm = out.numel();
104110

105-
WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(
106-
p_out, p_inp1, p_inp2, num_elm, 2);
111+
WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32(
112+
p_out, p_inp1, p_inp2, num_elm, COMPARE_LESSEREQUAL);
107113

108114
ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
109115
}

backends/cadence/hifi/operators/op_lt.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,15 +93,21 @@ Tensor& lt_Tensor_out(
9393
for (int i = 0; i < b.dim(); i++)
9494
inp2_shape[i + off_b] = b.size(i);
9595

96-
WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
97-
p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 3);
96+
WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32(
97+
p_out,
98+
out_shape,
99+
p_inp1,
100+
inp1_shape,
101+
p_inp2,
102+
inp2_shape,
103+
COMPARE_LESSER);
98104

99105
ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
100106
} else {
101107
int num_elm = out.numel();
102108

103-
WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(
104-
p_out, p_inp1, p_inp2, num_elm, 3);
109+
WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32(
110+
p_out, p_inp1, p_inp2, num_elm, COMPARE_LESSER);
105111

106112
ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
107113
}

0 commit comments

Comments
 (0)