66
77#include "DeeployPULPMath.h"
88#include "pmsis.h"
9+ // #include "perf_utils.h"
910
1011void PULP_Gemm_fp32_fp32_fp32_fp32 (const float32_t * __restrict__ pSrcA ,
1112 const float32_t * __restrict__ pSrcB ,
@@ -17,6 +18,16 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
1718 int8_t core_id = pi_core_id ();
1819 int8_t log2Core = LOG2 (NUM_CORES );
1920
21+ // RW: Performance monitoring is currently disabled; the perf_bench calls below are intentionally left commented out.
22+ // perf_stats_t perf_start, perf_end, perf_total;
23+
24+ // // Initialize and start performance counters (only core 0)
25+ // if (core_id == 0) {
26+ // perf_bench_init();
27+ // perf_bench_start();
28+ // perf_bench_read(&perf_start);
29+ // }
30+
2031 uint32_t M_chunk = (M >> log2Core ) + ((M & (NUM_CORES - 1 )) != 0 );
2132 uint32_t M_start = MIN (core_id * M_chunk , M );
2233 uint32_t M_end = MIN (M_start + M_chunk , M );
@@ -351,4 +362,16 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
351362 }
352363 }
353364 }
365+
366+ // RW: (currently disabled) Stop performance counters and print results (only core 0)
367+ // if (core_id == 0) {
368+ // perf_bench_stop();
369+ // perf_bench_read(&perf_end);
370+ // perf_bench_diff(&perf_total, &perf_end, &perf_start);
371+
372+ // char label[100];
373+ // snprintf(label, sizeof(label), "GEMM M=%u N=%u O=%u transA=%u transB=%u",
374+ // M, N, O, transA, transB);
375+ // perf_bench_print(label, &perf_total);
376+ // }
354377}
0 commit comments