Skip to content

Commit a6231d0

Browse files
committed
Use GVSoC CSRs to get microbenchmarking results around GEMM
1 parent c547dd5 commit a6231d0

2 files changed

Lines changed: 182 additions & 0 deletions

File tree

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
/*
2+
* Performance Counter Utilities for PULP Benchmarking
3+
*/
4+
5+
#ifndef __PERF_UTILS_H__
6+
#define __PERF_UTILS_H__
7+
8+
#include "pmsis.h"
9+
#include "archi/riscv/pcer_v2.h"
10+
11+
// Event identifiers used by the perf_bench_* helpers in this header.
//
// Every PI_PERF_* name below is a plain alias for the CSR_PCER_* event that
// carries the same suffix. The helpers use the value both as a bit position
// when building the enable mask and as the selector passed to pi_perf_read().
// NOTE(review): these are described as "compatible with PMSIS" -- confirm they
// do not clash with PI_PERF_* names that pmsis.h may already declare.

// Cycle and instruction counts
#define PI_PERF_CYCLES       CSR_PCER_CYCLES
#define PI_PERF_INSTR        CSR_PCER_INSTR
#define PI_PERF_RVC          CSR_PCER_RVC

// Stall and miss events
#define PI_PERF_LD_STALL     CSR_PCER_LD_STALL
#define PI_PERF_JMP_STALL    CSR_PCER_JMP_STALL
#define PI_PERF_IMISS        CSR_PCER_IMISS
#define PI_PERF_TCDM_CONT    CSR_PCER_TCDM_CONT

// Load / store events
#define PI_PERF_LD           CSR_PCER_LD
#define PI_PERF_ST           CSR_PCER_ST
#define PI_PERF_LD_EXT       CSR_PCER_LD_EXT
#define PI_PERF_ST_EXT       CSR_PCER_ST_EXT
#define PI_PERF_LD_EXT_CYC   CSR_PCER_LD_EXT_CYC
#define PI_PERF_ST_EXT_CYC   CSR_PCER_ST_EXT_CYC

// Control-flow events
#define PI_PERF_JUMP         CSR_PCER_JUMP
#define PI_PERF_BRANCH       CSR_PCER_BRANCH
#define PI_PERF_TAKEN_BRANCH CSR_PCER_TAKEN_BRANCH
28+
29+
// One snapshot of the performance counters.
//
// Each field holds the value read for the PI_PERF_* event named in its
// trailing comment. The same structure is also used to hold the difference
// between two snapshots.
typedef struct {
  unsigned int cycles;       // PI_PERF_CYCLES
  unsigned int instr;        // PI_PERF_INSTR
  unsigned int ld;           // PI_PERF_LD
  unsigned int st;           // PI_PERF_ST
  unsigned int ld_stall;     // PI_PERF_LD_STALL
  unsigned int jmp_stall;    // PI_PERF_JMP_STALL
  unsigned int imiss;        // PI_PERF_IMISS
  unsigned int branch;       // PI_PERF_BRANCH
  unsigned int taken_branch; // PI_PERF_TAKEN_BRANCH
  unsigned int rvc;          // PI_PERF_RVC
  unsigned int ld_ext;       // PI_PERF_LD_EXT
  unsigned int st_ext;       // PI_PERF_ST_EXT
  unsigned int ld_ext_cyc;   // PI_PERF_LD_EXT_CYC
  unsigned int st_ext_cyc;   // PI_PERF_ST_EXT_CYC
  unsigned int tcdm_cont;    // PI_PERF_TCDM_CONT
} perf_stats_t;
47+
48+
// Initialize performance counters for comprehensive benchmarking
49+
static inline void perf_bench_init() {
50+
// Enable all performance counters
51+
pi_perf_conf(
52+
(1 << PI_PERF_CYCLES) |
53+
(1 << PI_PERF_INSTR) |
54+
(1 << PI_PERF_LD_STALL) |
55+
(1 << PI_PERF_JMP_STALL) |
56+
(1 << PI_PERF_IMISS) |
57+
(1 << PI_PERF_LD) |
58+
(1 << PI_PERF_ST) |
59+
(1 << PI_PERF_JUMP) |
60+
(1 << PI_PERF_BRANCH) |
61+
(1 << PI_PERF_TAKEN_BRANCH) |
62+
(1 << PI_PERF_RVC) |
63+
(1 << PI_PERF_LD_EXT) |
64+
(1 << PI_PERF_ST_EXT) |
65+
(1 << PI_PERF_LD_EXT_CYC) |
66+
(1 << PI_PERF_ST_EXT_CYC) |
67+
(1 << PI_PERF_TCDM_CONT)
68+
);
69+
}
70+
71+
// Begin a measurement: clear the counters, then start them.
//
// Calls pi_perf_reset() followed by pi_perf_start(), in that order, so the
// counters are reset before they begin running.
// Declared with an explicit (void) parameter list so that the compiler
// rejects calls that pass arguments.
static inline void perf_bench_start(void) {
  pi_perf_reset();
  pi_perf_start();
}
76+
77+
// End a measurement by calling pi_perf_stop().
//
// The counter values are not read here; use perf_bench_read() for that.
// Declared with an explicit (void) parameter list so that the compiler
// rejects calls that pass arguments.
static inline void perf_bench_stop(void) {
  pi_perf_stop();
}
81+
82+
// Read all performance counters into structure
83+
static inline void perf_bench_read(perf_stats_t *stats) {
84+
stats->cycles = pi_perf_read(PI_PERF_CYCLES);
85+
stats->instr = pi_perf_read(PI_PERF_INSTR);
86+
stats->ld = pi_perf_read(PI_PERF_LD);
87+
stats->st = pi_perf_read(PI_PERF_ST);
88+
stats->ld_stall = pi_perf_read(PI_PERF_LD_STALL);
89+
stats->jmp_stall = pi_perf_read(PI_PERF_JMP_STALL);
90+
stats->imiss = pi_perf_read(PI_PERF_IMISS);
91+
stats->branch = pi_perf_read(PI_PERF_BRANCH);
92+
stats->taken_branch = pi_perf_read(PI_PERF_TAKEN_BRANCH);
93+
stats->rvc = pi_perf_read(PI_PERF_RVC);
94+
stats->ld_ext = pi_perf_read(PI_PERF_LD_EXT);
95+
stats->st_ext = pi_perf_read(PI_PERF_ST_EXT);
96+
stats->ld_ext_cyc = pi_perf_read(PI_PERF_LD_EXT_CYC);
97+
stats->st_ext_cyc = pi_perf_read(PI_PERF_ST_EXT_CYC);
98+
stats->tcdm_cont = pi_perf_read(PI_PERF_TCDM_CONT);
99+
}
100+
101+
// Print performance statistics (core 0 only to avoid clutter)
102+
static inline void perf_bench_print(const char *label, perf_stats_t *stats) {
103+
if (pi_core_id() == 0) {
104+
printf("\n=== Performance Statistics: %s ===\n", label);
105+
printf("Cycles: %10u\n", stats->cycles);
106+
printf("Instructions: %10u\n", stats->instr);
107+
printf("IPC: %10.3f\n",
108+
stats->cycles > 0 ? (float)stats->instr / stats->cycles : 0.0f);
109+
printf("\n--- Instruction Mix ---\n");
110+
printf("Loads: %10u (%.2f%%)\n", stats->ld,
111+
stats->instr > 0 ? 100.0f * stats->ld / stats->instr : 0.0f);
112+
printf("Stores: %10u (%.2f%%)\n", stats->st,
113+
stats->instr > 0 ? 100.0f * stats->st / stats->instr : 0.0f);
114+
printf("Branches: %10u (%.2f%%)\n", stats->branch,
115+
stats->instr > 0 ? 100.0f * stats->branch / stats->instr : 0.0f);
116+
printf("Taken Branches: %10u (%.2f%%)\n", stats->taken_branch,
117+
stats->branch > 0 ? 100.0f * stats->taken_branch / stats->branch : 0.0f);
118+
printf("Compressed (RVC): %10u (%.2f%%)\n", stats->rvc,
119+
stats->instr > 0 ? 100.0f * stats->rvc / stats->instr : 0.0f);
120+
printf("\n--- Stalls & Hazards ---\n");
121+
printf("Load Stalls: %10u\n", stats->ld_stall);
122+
printf("Jump Stalls: %10u\n", stats->jmp_stall);
123+
printf("I-cache Misses: %10u\n", stats->imiss);
124+
printf("TCDM Contentions: %10u\n", stats->tcdm_cont);
125+
printf("\n--- Memory Hierarchy ---\n");
126+
printf("External Loads: %10u (%.2f%%)\n", stats->ld_ext,
127+
stats->ld > 0 ? 100.0f * stats->ld_ext / stats->ld : 0.0f);
128+
printf("External Stores: %10u (%.2f%%)\n", stats->st_ext,
129+
stats->st > 0 ? 100.0f * stats->st_ext / stats->st : 0.0f);
130+
printf("Ext Load Cycles: %10u (avg: %.2f)\n", stats->ld_ext_cyc,
131+
stats->ld_ext > 0 ? (float)stats->ld_ext_cyc / stats->ld_ext : 0.0f);
132+
printf("Ext Store Cycles: %10u (avg: %.2f)\n", stats->st_ext_cyc,
133+
stats->st_ext > 0 ? (float)stats->st_ext_cyc / stats->st_ext : 0.0f);
134+
printf("========================================\n\n");
135+
}
136+
}
137+
138+
// Compute difference between two stats (for analyzing specific code sections)
139+
static inline void perf_bench_diff(perf_stats_t *result,
140+
perf_stats_t *end,
141+
perf_stats_t *start) {
142+
result->cycles = end->cycles - start->cycles;
143+
result->instr = end->instr - start->instr;
144+
result->ld = end->ld - start->ld;
145+
result->st = end->st - start->st;
146+
result->ld_stall = end->ld_stall - start->ld_stall;
147+
result->jmp_stall = end->jmp_stall - start->jmp_stall;
148+
result->imiss = end->imiss - start->imiss;
149+
result->branch = end->branch - start->branch;
150+
result->taken_branch = end->taken_branch - start->taken_branch;
151+
result->rvc = end->rvc - start->rvc;
152+
result->ld_ext = end->ld_ext - start->ld_ext;
153+
result->st_ext = end->st_ext - start->st_ext;
154+
result->ld_ext_cyc = end->ld_ext_cyc - start->ld_ext_cyc;
155+
result->st_ext_cyc = end->st_ext_cyc - start->st_ext_cyc;
156+
result->tcdm_cont = end->tcdm_cont - start->tcdm_cont;
157+
}
158+
159+
#endif // __PERF_UTILS_H__

TargetLibraries/PULPOpen/src/Gemm.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include "DeeployPULPMath.h"
88
#include "pmsis.h"
9+
#include "perf_utils.h"
910

1011
void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
1112
const float32_t *__restrict__ pSrcB,
@@ -17,6 +18,16 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
1718
int8_t core_id = pi_core_id();
1819
int8_t log2Core = LOG2(NUM_CORES);
1920

21+
// Performance monitoring structures
22+
perf_stats_t perf_start, perf_end, perf_total;
23+
24+
// Initialize and start performance counters (only core 0)
25+
if (core_id == 0) {
26+
perf_bench_init();
27+
perf_bench_start();
28+
perf_bench_read(&perf_start);
29+
}
30+
2031
uint32_t M_chunk = (M >> log2Core) + ((M & (NUM_CORES - 1)) != 0);
2132
uint32_t M_start = MIN(core_id * M_chunk, M);
2233
uint32_t M_end = MIN(M_start + M_chunk, M);
@@ -351,4 +362,16 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
351362
}
352363
}
353364
}
365+
366+
// Stop performance counters and print results (only core 0)
367+
if (core_id == 0) {
368+
perf_bench_stop();
369+
perf_bench_read(&perf_end);
370+
perf_bench_diff(&perf_total, &perf_end, &perf_start);
371+
372+
char label[100];
373+
snprintf(label, sizeof(label), "GEMM M=%u N=%u O=%u transA=%u transB=%u",
374+
M, N, O, transA, transB);
375+
perf_bench_print(label, &perf_total);
376+
}
354377
}

0 commit comments

Comments
 (0)