@@ -64,6 +64,12 @@ static int cmp_uint64_t(const void *a, const void *b)
6464 qsort((cyc), NTESTS, sizeof(uint64_t), cmp_uint64_t); \
6565 printf(txt " cycles=%" PRIu64 "\n", (cyc)[NTESTS >> 1] / NITERATIONS);
6666
/* BENCH_NATIVE_OK(txt, call): forwards to BENCH with label `txt`, wrapping
 * `call` in CHECK so that its result is compared for equality against
 * MLK_NATIVE_FUNC_SUCCESS.
 * What CHECK does when the comparison fails is defined elsewhere and is not
 * visible here -- TODO confirm.
 * NOTE(review): as rendered, there is a space between the macro name and
 * '('. In real C that would define an object-like macro; presumably this is
 * a rendering artifact -- confirm against the actual file. */
67+ #define BENCH_NATIVE_OK (txt , call ) \
68+ BENCH(txt, CHECK((call) == MLK_NATIVE_FUNC_SUCCESS))
69+
/* BENCH_NATIVE_NOT_FALLBACK(txt, call): forwards to BENCH with label `txt`,
 * wrapping `call` in CHECK so that its result is only required to differ
 * from MLK_NATIVE_FUNC_FALLBACK (any other return value passes).
 * Presumably intended for native functions whose return value is not a
 * plain success code -- verify against the callee's contract.
 * NOTE(review): as rendered, there is a space between the macro name and
 * '('. In real C that would define an object-like macro; presumably this is
 * a rendering artifact -- confirm against the actual file. */
70+ #define BENCH_NATIVE_NOT_FALLBACK (txt , call ) \
71+ BENCH(txt, CHECK((call) != MLK_NATIVE_FUNC_FALLBACK))
72+
6773static int bench (void )
6874{
6975 MLK_ALIGN uint64_t data0 [1024 ];
@@ -207,46 +213,145 @@ static int bench(void)
207213 BENCH ("mlk_gen_matrix" ,
208214 mlk_gen_matrix ((mlk_polymat * )data0 , (uint8_t * )data1 , 0 ))
209215
216+ /* Native backend components */
217+
218+ #if defined(MLK_USE_NATIVE_NTT )
219+ BENCH_NATIVE_OK ("mlk_ntt_native" , mlk_ntt_native ((int16_t * )data0 ));
220+ #endif
221+
222+ #if defined(MLK_USE_NATIVE_INTT )
223+ BENCH_NATIVE_OK ("mlk_intt_native" , mlk_intt_native ((int16_t * )data0 ));
224+ #endif
225+
226+ #if defined(MLK_USE_NATIVE_POLY_REDUCE )
227+ BENCH_NATIVE_OK ("mlk_poly_reduce_native" ,
228+ mlk_poly_reduce_native ((int16_t * )data0 ));
229+ #endif
230+
231+ #if defined(MLK_USE_NATIVE_POLY_TOMONT )
232+ BENCH_NATIVE_OK ("mlk_poly_tomont_native" ,
233+ mlk_poly_tomont_native ((int16_t * )data0 ));
234+ #endif
235+
236+ #if defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE )
237+ BENCH_NATIVE_OK (
238+ "mlk_poly_mulcache_compute_native" ,
239+ mlk_poly_mulcache_compute_native ((int16_t * )data0 , (int16_t * )data1 ));
240+ #endif
210241
211- #if defined(MLK_ARITH_BACKEND_AARCH64 )
212-
213- printf ("---AArch64 native backend components---\n" );
214-
215- BENCH ("ntt-native" ,
216- CHECK (mlk_ntt_native ((int16_t * )data0 ) == MLK_NATIVE_FUNC_SUCCESS ));
217- BENCH ("intt-native" ,
218- CHECK (mlk_intt_native ((int16_t * )data0 ) == MLK_NATIVE_FUNC_SUCCESS ));
219- BENCH ("mlk_poly-reduce-native" ,
220- CHECK (mlk_poly_reduce_native ((int16_t * )data0 ) ==
221- MLK_NATIVE_FUNC_SUCCESS ));
222- BENCH ("mlk_poly-tomont-native" ,
223- CHECK (mlk_poly_tomont_native ((int16_t * )data0 ) ==
224- MLK_NATIVE_FUNC_SUCCESS ));
225- BENCH ("mlk_poly-tobytes-native" ,
226- CHECK (mlk_poly_tobytes_native ((uint8_t * )data0 , (int16_t * )data1 ) ==
227- MLK_NATIVE_FUNC_SUCCESS ));
228- BENCH ("mlk_poly-mulcache-compute-native" ,
229- CHECK (mlk_poly_mulcache_compute_native ((int16_t * )data0 ,
230- (int16_t * )data1 ) ==
231- MLK_NATIVE_FUNC_SUCCESS ));
242+ #if defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED )
232243#if MLKEM_K == 2
233- BENCH ( "mlk_polyvec-basemul-acc-montgomery-cached-k2-native " ,
234- CHECK ( mlk_polyvec_basemul_acc_montgomery_cached_k2_native (
235- (int16_t * )data0 , (int16_t * )data1 , (int16_t * )data2 ,
236- (int16_t * )data3 ) == MLK_NATIVE_FUNC_SUCCESS ));
244+ BENCH_NATIVE_OK ( "mlk_polyvec_basemul_acc_montgomery_cached_k2_native " ,
245+ mlk_polyvec_basemul_acc_montgomery_cached_k2_native (
246+ (int16_t * )data0 , (int16_t * )data1 , (int16_t * )data2 ,
247+ (int16_t * )data3 ));
237248#elif MLKEM_K == 3
238- BENCH ( "mlk_polyvec-basemul-acc-montgomery-cached-k3-native " ,
239- CHECK ( mlk_polyvec_basemul_acc_montgomery_cached_k3_native (
240- (int16_t * )data0 , (int16_t * )data1 , (int16_t * )data2 ,
241- (int16_t * )data3 ) == MLK_NATIVE_FUNC_SUCCESS ));
249+ BENCH_NATIVE_OK ( "mlk_polyvec_basemul_acc_montgomery_cached_k3_native " ,
250+ mlk_polyvec_basemul_acc_montgomery_cached_k3_native (
251+ (int16_t * )data0 , (int16_t * )data1 , (int16_t * )data2 ,
252+ (int16_t * )data3 ));
242253#elif MLKEM_K == 4
243- BENCH ( "mlk_polyvec-basemul-acc-montgomery-cached-k4-native " ,
244- CHECK ( mlk_polyvec_basemul_acc_montgomery_cached_k4_native (
245- (int16_t * )data0 , (int16_t * )data1 , (int16_t * )data2 ,
246- (int16_t * )data3 ) == MLK_NATIVE_FUNC_SUCCESS ));
254+ BENCH_NATIVE_OK ( "mlk_polyvec_basemul_acc_montgomery_cached_k4_native " ,
255+ mlk_polyvec_basemul_acc_montgomery_cached_k4_native (
256+ (int16_t * )data0 , (int16_t * )data1 , (int16_t * )data2 ,
257+ (int16_t * )data3 ));
247258#endif /* MLKEM_K == 4 */
259+ #endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
260+
261+ #if defined(MLK_USE_NATIVE_POLY_TOBYTES )
262+ BENCH_NATIVE_OK ("mlk_poly_tobytes_native" ,
263+ mlk_poly_tobytes_native ((uint8_t * )data0 , (int16_t * )data1 ));
264+ #endif
265+
266+ #if defined(MLK_USE_NATIVE_POLY_FROMBYTES )
267+ BENCH_NATIVE_OK (
268+ "mlk_poly_frombytes_native" ,
269+ mlk_poly_frombytes_native ((int16_t * )data0 , (uint8_t * )data1 ));
270+ #endif
271+
272+ #if defined(MLK_USE_NATIVE_REJ_UNIFORM )
273+ BENCH_NATIVE_NOT_FALLBACK (
274+ "mlk_rej_uniform_native" ,
275+ mlk_rej_uniform_native ((int16_t * )data0 , MLKEM_N , (uint8_t * )data1 , 768 ));
276+ #endif
277+
278+ #if MLKEM_K == 2 || MLKEM_K == 3
279+ #if defined(MLK_USE_NATIVE_POLY_COMPRESS_D4 )
280+ BENCH_NATIVE_OK (
281+ "mlk_poly_compress_d4_native" ,
282+ mlk_poly_compress_d4_native ((uint8_t * )data0 , (int16_t * )data1 ));
283+ #endif
284+
285+ #if defined(MLK_USE_NATIVE_POLY_COMPRESS_D10 )
286+ BENCH_NATIVE_OK (
287+ "mlk_poly_compress_d10_native" ,
288+ mlk_poly_compress_d10_native ((uint8_t * )data0 , (int16_t * )data1 ));
289+ #endif
290+
291+ #if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4 )
292+ BENCH_NATIVE_OK (
293+ "mlk_poly_decompress_d4_native" ,
294+ mlk_poly_decompress_d4_native ((int16_t * )data0 , (uint8_t * )data1 ));
295+ #endif
296+
297+ #if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10 )
298+ BENCH_NATIVE_OK (
299+ "mlk_poly_decompress_d10_native" ,
300+ mlk_poly_decompress_d10_native ((int16_t * )data0 , (uint8_t * )data1 ));
301+ #endif
302+ #endif /* MLKEM_K == 2 || MLKEM_K == 3 */
303+
304+ #if MLKEM_K == 4
305+ #if defined(MLK_USE_NATIVE_POLY_COMPRESS_D5 )
306+ BENCH_NATIVE_OK (
307+ "mlk_poly_compress_d5_native" ,
308+ mlk_poly_compress_d5_native ((uint8_t * )data0 , (int16_t * )data1 ));
309+ #endif
310+
311+ #if defined(MLK_USE_NATIVE_POLY_COMPRESS_D11 )
312+ BENCH_NATIVE_OK (
313+ "mlk_poly_compress_d11_native" ,
314+ mlk_poly_compress_d11_native ((uint8_t * )data0 , (int16_t * )data1 ));
315+ #endif
316+
317+ #if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5 )
318+ BENCH_NATIVE_OK (
319+ "mlk_poly_decompress_d5_native" ,
320+ mlk_poly_decompress_d5_native ((int16_t * )data0 , (uint8_t * )data1 ));
321+ #endif
322+
323+ #if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11 )
324+ BENCH_NATIVE_OK (
325+ "mlk_poly_decompress_d11_native" ,
326+ mlk_poly_decompress_d11_native ((int16_t * )data0 , (uint8_t * )data1 ));
327+ #endif
328+ #endif /* MLKEM_K == 4 */
329+
330+ #if defined(MLK_USE_FIPS202_X1_NATIVE )
331+ BENCH_NATIVE_OK ("mlk_keccak_f1600_x1_native" ,
332+ mlk_keccak_f1600_x1_native (data0 ));
333+ #endif
334+
335+ #if defined(MLK_USE_FIPS202_X4_NATIVE )
336+ BENCH_NATIVE_OK ("mlk_keccak_f1600_x4_native" ,
337+ mlk_keccak_f1600_x4_native (data0 ));
338+ #endif
248339
249- #endif /* MLK_ARITH_BACKEND_AARCH64 */
340+ #if defined(MLK_USE_FIPS202_X4_XOR_BYTES_NATIVE )
341+ BENCH_NATIVE_OK (
342+ "mlk_keccakf1600_xor_bytes_x4_native" ,
343+ mlk_keccakf1600_xor_bytes_x4_native (
344+ data0 , (uint8_t * )data1 , (uint8_t * )data2 , (uint8_t * )data3 ,
345+ (uint8_t * )data4 , 0 , 25 * sizeof (uint64_t )));
346+ #endif /* MLK_USE_FIPS202_X4_XOR_BYTES_NATIVE */
347+
348+ #if defined(MLK_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE )
349+ BENCH_NATIVE_OK (
350+ "mlk_keccakf1600_extract_bytes_x4_native" ,
351+ mlk_keccakf1600_extract_bytes_x4_native (
352+ data0 , (uint8_t * )data1 , (uint8_t * )data2 , (uint8_t * )data3 ,
353+ (uint8_t * )data4 , 0 , 25 * sizeof (uint64_t )));
354+ #endif /* MLK_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE */
250355
251356 return 0 ;
252357}
0 commit comments