1+ use linux_perf_event_reader:: constants:: {
2+ PERF_COUNT_HW_CPU_CYCLES , PERF_COUNT_HW_INSTRUCTIONS , PERF_TYPE_HARDWARE , PERF_TYPE_RAW ,
3+ } ;
4+
/// Subset of perf events that CodSpeed supports.
///
/// Each variant is a semantic slot of the cache/execution model, named by
/// [`Self::to_perf_string`] and backed by a concrete PMU event resolved for
/// the current CPU (see [`Self::to_samply_spec`]).
///
/// The enum is a field-less `Copy` type, so it also derives equality and
/// hashing: events can be compared directly and used as set or map keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PerfEvent {
    /// CPU cycle count (perf's `cpu-cycles`).
    CpuCycles,
    /// L1 data cache accesses.
    L1DCache,
    /// Accesses one level below L1: what L1 misses spill into. Hits in L1 are
    /// derived as `L1DCache - L2DCache`.
    L2DCache,
    /// Misses out of the last profiled cache level (i.e. trips to memory).
    /// Hits below L1 are derived as `L2DCache - CacheMisses`.
    CacheMisses,
    /// Instruction count (perf's `instructions`).
    Instructions,
}
1023
1124impl PerfEvent {
25+ /// The event name backing this slot.
1226 pub fn to_perf_string ( & self ) -> & ' static str {
1327 match self {
1428 PerfEvent :: CpuCycles => "cpu-cycles" ,
@@ -28,10 +42,147 @@ impl PerfEvent {
2842 PerfEvent :: Instructions ,
2943 ]
3044 }
45+
46+ /// The `<name>:<type>:<config>` spec for samply's `--perf-events`,
47+ /// resolving this slot to a concrete PMU event of the CPU we are running
48+ /// on.
49+ ///
50+ /// `None` when the slot has no suitable backing event on this CPU.
51+ /// The column is labelled with [`Self::to_perf_string`] so samply profiles
52+ /// carry the same event names as perf ones and parse through one path.
53+ pub fn to_samply_spec ( & self ) -> Option < String > {
54+ let ( event_type, config) = self . perf_event_attr ( ) ?;
55+ Some ( format ! (
56+ "{}:{}:{:#x}" ,
57+ self . to_perf_string( ) ,
58+ event_type,
59+ config
60+ ) )
61+ }
62+
63+ /// The `perf_event_attr` `(type, config)` encoding backing this slot on
64+ /// the current CPU.
65+ fn perf_event_attr ( & self ) -> Option < ( u32 , u64 ) > {
66+ match self {
67+ // Generalized hardware events, portable across architectures.
68+ PerfEvent :: CpuCycles => Some ( ( PERF_TYPE_HARDWARE , PERF_COUNT_HW_CPU_CYCLES . into ( ) ) ) ,
69+ PerfEvent :: Instructions => {
70+ Some ( ( PERF_TYPE_HARDWARE , PERF_COUNT_HW_INSTRUCTIONS . into ( ) ) )
71+ }
72+ _ => Some ( ( PERF_TYPE_RAW , self . raw_cache_config ( ) ?) ) ,
73+ }
74+ }
75+
76+ /// Raw PMU encoding of this cache slot on x86_64: `umask << 8 | event`,
77+ /// the layout the kernel expects in `perf_event_attr.config` for
78+ /// `PERF_TYPE_RAW`.
79+ ///
80+ /// Only Intel has a vetted selection; other vendors get no cache events.
81+ /// EventCode/UMask come from Intel's perfmon tables, listed per mnemonic in
82+ /// the Skylake-X core event file
83+ /// (<https://github.com/intel/perfmon/blob/main/SKX/events/skylakex_core.json>),
84+ /// stable since Skylake.
85+ #[ cfg( target_arch = "x86_64" ) ]
86+ fn raw_cache_config ( & self ) -> Option < u64 > {
87+ if !is_genuine_intel ( ) {
88+ // Not tested on AMD or other x86_64 vendors yet
89+ return None ;
90+ }
91+ // Retired load instructions, by the cache level that served them
92+ // (demand loads only; stores and prefetches don't count).
93+ match self {
94+ // MEM_INST_RETIRED.ALL_LOADS: 0xD0 | 0x81 << 8
95+ PerfEvent :: L1DCache => Some ( 0x81d0 ) ,
96+ // MEM_LOAD_RETIRED.L1_MISS: 0xD1 | 0x08 << 8
97+ PerfEvent :: L2DCache => Some ( 0x08d1 ) ,
98+ // MEM_LOAD_RETIRED.L3_MISS: 0xD1 | 0x20 << 8
99+ PerfEvent :: CacheMisses => Some ( 0x20d1 ) ,
100+ _ => None ,
101+ }
102+ }
103+
/// arm64 raw PMU encoding for this cache slot: the architected PMU event
/// number, passed as-is in `perf_event_attr.config` for `PERF_TYPE_RAW`.
///
/// These are common (architected) event numbers, listed per mnemonic in
/// Arm's PMU event table for the Cortex-A72 fleet
/// (<https://github.com/ARM-software/data/blob/master/pmu/cortex-a72.json>).
///
/// Returns `None` for slots that are not cache events.
#[cfg(target_arch = "aarch64")]
fn raw_cache_config(&self) -> Option<u64> {
    // L1 data cache accesses, loads and stores.
    const L1D_CACHE: u64 = 0x04;
    // L1D line fills. Defined against the same access population as
    // L1D_CACHE — unlike L2D_CACHE, which also counts L1 write-backs,
    // instruction-side refills and table walks, and counts lines where
    // L1D_CACHE counts operations — so the `L1DCache - L2DCache` hit
    // derivation stays sound.
    const L1D_CACHE_REFILL: u64 = 0x03;
    // Refills of L2 or L1 from outside those caches. On the Cortex-A72
    // macro-runner fleet (a1.metal) there is no L3, so these are trips to
    // DRAM. Includes instruction-side refills, so it can exceed
    // L1D_CACHE_REFILL in icache-missing code; the derived hit counts
    // saturate against that.
    const L2D_CACHE_REFILL: u64 = 0x17;

    match self {
        PerfEvent::L1DCache => Some(L1D_CACHE),
        PerfEvent::L2DCache => Some(L1D_CACHE_REFILL),
        PerfEvent::CacheMisses => Some(L2D_CACHE_REFILL),
        // Not cache slots: they have no raw cache encoding.
        PerfEvent::CpuCycles | PerfEvent::Instructions => None,
    }
}
130+
/// Fallback for architectures other than x86_64 and aarch64: no raw cache
/// encoding is defined there, so every slot resolves to `None`.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn raw_cache_config(&self) -> Option<u64> { None }
135+ }
136+
/// Reports whether the CPU identifies itself as `GenuineIntel`.
///
/// CPUID leaf 0 returns the 12-byte vendor string split across three
/// registers, in the order EBX, EDX, ECX, each holding four bytes in
/// little-endian order.
#[cfg(target_arch = "x86_64")]
fn is_genuine_intel() -> bool {
    use std::arch::x86_64::__cpuid;

    // SAFETY: the CPUID instruction is part of the x86_64 baseline and leaf 0
    // is always a valid query.
    let regs = unsafe { __cpuid(0) };

    // Stream the vendor bytes in register order and compare them against the
    // expected string without building an intermediate buffer.
    [regs.ebx, regs.edx, regs.ecx]
        .iter()
        .flat_map(|reg| reg.to_le_bytes())
        .eq(b"GenuineIntel".iter().copied())
}
32148
33149impl std:: fmt:: Display for PerfEvent {
34150 fn fmt ( & self , f : & mut std:: fmt:: Formatter < ' _ > ) -> std:: fmt:: Result {
35151 write ! ( f, "{}" , self . to_perf_string( ) )
36152 }
37153}
154+
#[cfg(test)]
mod tests {
    use super::*;

    /// The generalized hardware slots resolve on every CPU, so their samply
    /// specs are fixed strings.
    #[test]
    fn portable_slots_have_samply_specs() {
        let expected = [
            (PerfEvent::CpuCycles, "cpu-cycles:0:0x0"),
            (PerfEvent::Instructions, "instructions:0:0x1"),
        ];
        for (event, spec) in expected {
            assert_eq!(event.to_samply_spec().as_deref(), Some(spec));
        }
    }

    /// No two events may share a perf name.
    #[test]
    fn event_names_are_unique() {
        use std::collections::HashSet;

        let events = PerfEvent::all_events();
        let distinct: HashSet<&'static str> =
            events.iter().map(|event| event.to_perf_string()).collect();
        assert_eq!(distinct.len(), events.len());
    }

    /// Prints the spec each event resolves to on the host running the tests;
    /// it asserts nothing and is only useful with `--nocapture`.
    #[test]
    fn print_specs_for_this_host() {
        PerfEvent::all_events()
            .iter()
            .for_each(|event| println!("{event:?} -> {:?}", event.to_samply_spec()));
    }
}
0 commit comments