/*
 * Performance Counter Utilities for PULP Benchmarking
 */

#ifndef __PERF_UTILS_H__
#define __PERF_UTILS_H__

#include "pmsis.h"
#include "archi/riscv/pcer_v2.h"

/*
 * Performance event IDs.
 *
 * Each PI_PERF_* name is an alias for the CSR_PCER_* event index of the same
 * name. perf_bench_init() uses them as bit positions in the mask handed to
 * pi_perf_conf(), and perf_bench_read() passes them to pi_perf_read().
 *
 * NOTE(review): these are described as "compatible with PMSIS". If pmsis.h
 * already provides identifiers with these names, the macros below take
 * precedence from this point on -- confirm the values agree (PI_PERF_CYCLES
 * in particular).
 */
#define PI_PERF_CYCLES        CSR_PCER_CYCLES
#define PI_PERF_INSTR         CSR_PCER_INSTR
#define PI_PERF_LD_STALL      CSR_PCER_LD_STALL
#define PI_PERF_JMP_STALL     CSR_PCER_JMP_STALL
#define PI_PERF_IMISS         CSR_PCER_IMISS
#define PI_PERF_LD            CSR_PCER_LD
#define PI_PERF_ST            CSR_PCER_ST
#define PI_PERF_JUMP          CSR_PCER_JUMP
#define PI_PERF_BRANCH        CSR_PCER_BRANCH
#define PI_PERF_TAKEN_BRANCH  CSR_PCER_TAKEN_BRANCH
#define PI_PERF_RVC           CSR_PCER_RVC
#define PI_PERF_LD_EXT        CSR_PCER_LD_EXT
#define PI_PERF_ST_EXT        CSR_PCER_ST_EXT
#define PI_PERF_LD_EXT_CYC    CSR_PCER_LD_EXT_CYC
#define PI_PERF_ST_EXT_CYC    CSR_PCER_ST_EXT_CYC
#define PI_PERF_TCDM_CONT     CSR_PCER_TCDM_CONT

/*
 * Snapshot of the performance counters, as filled in by perf_bench_read().
 *
 * Every field holds the raw value returned by pi_perf_read() for the event
 * named in its comment. The fields stay `unsigned int` because
 * perf_bench_print() formats them with "%u", and perf_bench_diff() relies on
 * unsigned subtraction.
 */
typedef struct {
    unsigned int cycles;        /* PI_PERF_CYCLES */
    unsigned int instr;         /* PI_PERF_INSTR */
    unsigned int ld;            /* PI_PERF_LD */
    unsigned int st;            /* PI_PERF_ST */
    unsigned int ld_stall;      /* PI_PERF_LD_STALL */
    unsigned int jmp_stall;     /* PI_PERF_JMP_STALL */
    unsigned int imiss;         /* PI_PERF_IMISS (reported as "I-cache Misses") */
    unsigned int branch;        /* PI_PERF_BRANCH */
    unsigned int taken_branch;  /* PI_PERF_TAKEN_BRANCH */
    unsigned int rvc;           /* PI_PERF_RVC (reported as "Compressed (RVC)") */
    unsigned int ld_ext;        /* PI_PERF_LD_EXT (reported as "External Loads") */
    unsigned int st_ext;        /* PI_PERF_ST_EXT (reported as "External Stores") */
    unsigned int ld_ext_cyc;    /* PI_PERF_LD_EXT_CYC */
    unsigned int st_ext_cyc;    /* PI_PERF_ST_EXT_CYC */
    unsigned int tcdm_cont;     /* PI_PERF_TCDM_CONT (reported as "TCDM Contentions") */
} perf_stats_t;

| 48 | +// Initialize performance counters for comprehensive benchmarking |
| 49 | +static inline void perf_bench_init() { |
| 50 | + // Enable all performance counters |
| 51 | + pi_perf_conf( |
| 52 | + (1 << PI_PERF_CYCLES) | |
| 53 | + (1 << PI_PERF_INSTR) | |
| 54 | + (1 << PI_PERF_LD_STALL) | |
| 55 | + (1 << PI_PERF_JMP_STALL) | |
| 56 | + (1 << PI_PERF_IMISS) | |
| 57 | + (1 << PI_PERF_LD) | |
| 58 | + (1 << PI_PERF_ST) | |
| 59 | + (1 << PI_PERF_JUMP) | |
| 60 | + (1 << PI_PERF_BRANCH) | |
| 61 | + (1 << PI_PERF_TAKEN_BRANCH) | |
| 62 | + (1 << PI_PERF_RVC) | |
| 63 | + (1 << PI_PERF_LD_EXT) | |
| 64 | + (1 << PI_PERF_ST_EXT) | |
| 65 | + (1 << PI_PERF_LD_EXT_CYC) | |
| 66 | + (1 << PI_PERF_ST_EXT_CYC) | |
| 67 | + (1 << PI_PERF_TCDM_CONT) |
| 68 | + ); |
| 69 | +} |
| 70 | + |
/*
 * Begin a measurement: reset the counters with pi_perf_reset(), then start
 * them with pi_perf_start(), so the values read afterwards cover only the
 * region that follows this call.
 */
static inline void perf_bench_start(void) {
    pi_perf_reset();
    pi_perf_start();
}

/*
 * End a measurement by calling pi_perf_stop(). The counters are not reset
 * here, so perf_bench_read() can be called afterwards.
 */
static inline void perf_bench_stop(void) {
    pi_perf_stop();
}

| 82 | +// Read all performance counters into structure |
| 83 | +static inline void perf_bench_read(perf_stats_t *stats) { |
| 84 | + stats->cycles = pi_perf_read(PI_PERF_CYCLES); |
| 85 | + stats->instr = pi_perf_read(PI_PERF_INSTR); |
| 86 | + stats->ld = pi_perf_read(PI_PERF_LD); |
| 87 | + stats->st = pi_perf_read(PI_PERF_ST); |
| 88 | + stats->ld_stall = pi_perf_read(PI_PERF_LD_STALL); |
| 89 | + stats->jmp_stall = pi_perf_read(PI_PERF_JMP_STALL); |
| 90 | + stats->imiss = pi_perf_read(PI_PERF_IMISS); |
| 91 | + stats->branch = pi_perf_read(PI_PERF_BRANCH); |
| 92 | + stats->taken_branch = pi_perf_read(PI_PERF_TAKEN_BRANCH); |
| 93 | + stats->rvc = pi_perf_read(PI_PERF_RVC); |
| 94 | + stats->ld_ext = pi_perf_read(PI_PERF_LD_EXT); |
| 95 | + stats->st_ext = pi_perf_read(PI_PERF_ST_EXT); |
| 96 | + stats->ld_ext_cyc = pi_perf_read(PI_PERF_LD_EXT_CYC); |
| 97 | + stats->st_ext_cyc = pi_perf_read(PI_PERF_ST_EXT_CYC); |
| 98 | + stats->tcdm_cont = pi_perf_read(PI_PERF_TCDM_CONT); |
| 99 | +} |
| 100 | + |
| 101 | +// Print performance statistics (core 0 only to avoid clutter) |
| 102 | +static inline void perf_bench_print(const char *label, perf_stats_t *stats) { |
| 103 | + if (pi_core_id() == 0) { |
| 104 | + printf("\n=== Performance Statistics: %s ===\n", label); |
| 105 | + printf("Cycles: %10u\n", stats->cycles); |
| 106 | + printf("Instructions: %10u\n", stats->instr); |
| 107 | + printf("IPC: %10.3f\n", |
| 108 | + stats->cycles > 0 ? (float)stats->instr / stats->cycles : 0.0f); |
| 109 | + printf("\n--- Instruction Mix ---\n"); |
| 110 | + printf("Loads: %10u (%.2f%%)\n", stats->ld, |
| 111 | + stats->instr > 0 ? 100.0f * stats->ld / stats->instr : 0.0f); |
| 112 | + printf("Stores: %10u (%.2f%%)\n", stats->st, |
| 113 | + stats->instr > 0 ? 100.0f * stats->st / stats->instr : 0.0f); |
| 114 | + printf("Branches: %10u (%.2f%%)\n", stats->branch, |
| 115 | + stats->instr > 0 ? 100.0f * stats->branch / stats->instr : 0.0f); |
| 116 | + printf("Taken Branches: %10u (%.2f%%)\n", stats->taken_branch, |
| 117 | + stats->branch > 0 ? 100.0f * stats->taken_branch / stats->branch : 0.0f); |
| 118 | + printf("Compressed (RVC): %10u (%.2f%%)\n", stats->rvc, |
| 119 | + stats->instr > 0 ? 100.0f * stats->rvc / stats->instr : 0.0f); |
| 120 | + printf("\n--- Stalls & Hazards ---\n"); |
| 121 | + printf("Load Stalls: %10u\n", stats->ld_stall); |
| 122 | + printf("Jump Stalls: %10u\n", stats->jmp_stall); |
| 123 | + printf("I-cache Misses: %10u\n", stats->imiss); |
| 124 | + printf("TCDM Contentions: %10u\n", stats->tcdm_cont); |
| 125 | + printf("\n--- Memory Hierarchy ---\n"); |
| 126 | + printf("External Loads: %10u (%.2f%%)\n", stats->ld_ext, |
| 127 | + stats->ld > 0 ? 100.0f * stats->ld_ext / stats->ld : 0.0f); |
| 128 | + printf("External Stores: %10u (%.2f%%)\n", stats->st_ext, |
| 129 | + stats->st > 0 ? 100.0f * stats->st_ext / stats->st : 0.0f); |
| 130 | + printf("Ext Load Cycles: %10u (avg: %.2f)\n", stats->ld_ext_cyc, |
| 131 | + stats->ld_ext > 0 ? (float)stats->ld_ext_cyc / stats->ld_ext : 0.0f); |
| 132 | + printf("Ext Store Cycles: %10u (avg: %.2f)\n", stats->st_ext_cyc, |
| 133 | + stats->st_ext > 0 ? (float)stats->st_ext_cyc / stats->st_ext : 0.0f); |
| 134 | + printf("========================================\n\n"); |
| 135 | + } |
| 136 | +} |
| 137 | + |
| 138 | +// Compute difference between two stats (for analyzing specific code sections) |
| 139 | +static inline void perf_bench_diff(perf_stats_t *result, |
| 140 | + perf_stats_t *end, |
| 141 | + perf_stats_t *start) { |
| 142 | + result->cycles = end->cycles - start->cycles; |
| 143 | + result->instr = end->instr - start->instr; |
| 144 | + result->ld = end->ld - start->ld; |
| 145 | + result->st = end->st - start->st; |
| 146 | + result->ld_stall = end->ld_stall - start->ld_stall; |
| 147 | + result->jmp_stall = end->jmp_stall - start->jmp_stall; |
| 148 | + result->imiss = end->imiss - start->imiss; |
| 149 | + result->branch = end->branch - start->branch; |
| 150 | + result->taken_branch = end->taken_branch - start->taken_branch; |
| 151 | + result->rvc = end->rvc - start->rvc; |
| 152 | + result->ld_ext = end->ld_ext - start->ld_ext; |
| 153 | + result->st_ext = end->st_ext - start->st_ext; |
| 154 | + result->ld_ext_cyc = end->ld_ext_cyc - start->ld_ext_cyc; |
| 155 | + result->st_ext_cyc = end->st_ext_cyc - start->st_ext_cyc; |
| 156 | + result->tcdm_cont = end->tcdm_cont - start->tcdm_cont; |
| 157 | +} |
| 158 | + |
#endif // __PERF_UTILS_H__