/// Subset of perf events that CodSpeed supports.
///
/// Each variant is a semantic slot of the cache/execution model rather than a
/// fixed hardware event: the concrete perf event chosen for a slot depends on
/// the architecture (see [`Self::to_perf_string`]).
///
/// The type is a plain value: it can be copied, compared and used as a key in
/// hash-based collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PerfEvent {
    /// CPU cycles.
    CpuCycles,
    /// L1 data cache accesses.
    L1DCache,
    /// Accesses one level below L1: what L1 misses spill into. Hits in L1 are
    /// derived as `L1DCache - L2DCache`.
    L2DCache,
    /// Misses out of the last profiled cache level (i.e. trips to memory).
    /// Hits below L1 are derived as `L2DCache - CacheMisses`.
    CacheMisses,
    /// Instruction count.
    Instructions,
}
1019
1120impl PerfEvent {
21+ /// Every perf event name that can back this slot, across all supported
22+ /// architectures. For parsers, which must handle profiles recorded on any
23+ /// architecture regardless of where they run.
24+ pub fn perf_strings ( & self ) -> & ' static [ & ' static str ] {
25+ match self {
26+ PerfEvent :: CpuCycles => & [ "cpu-cycles" ] ,
27+ PerfEvent :: L1DCache => & [ "l1d_cache" , "L1-dcache-loads" ] ,
28+ PerfEvent :: L2DCache => & [ "l2d_cache" , "L1-dcache-load-misses" ] ,
29+ PerfEvent :: CacheMisses => & [ "l2d_cache_refill" , "cache-misses" ] ,
30+ PerfEvent :: Instructions => & [ "instructions" ] ,
31+ }
32+ }
33+
34+ /// The perf event name backing this slot on the current architecture.
35+ ///
36+ /// On arm64 these are the architected PMU events (resolved through sysfs):
37+ /// `l2d_cache` counts all L2 accesses and `l2d_cache_refill` its misses.
38+ /// On x86_64 there is no generalized combined L2 event, so the slots are
39+ /// backed by the generalized cache events: L1 read misses stand in for
40+ /// "accesses below L1", and `cache-misses` (last-level misses) for trips
41+ /// to memory — lumping L2 and L3 hits together in the derived
42+ /// `L2DCache - CacheMisses`.
1243 pub fn to_perf_string ( & self ) -> & ' static str {
44+ #[ cfg( target_arch = "x86_64" ) ]
45+ match self {
46+ PerfEvent :: CpuCycles => "cpu-cycles" ,
47+ PerfEvent :: L1DCache => "L1-dcache-loads" ,
48+ PerfEvent :: L2DCache => "L1-dcache-load-misses" ,
49+ PerfEvent :: CacheMisses => "cache-misses" ,
50+ PerfEvent :: Instructions => "instructions" ,
51+ }
52+ #[ cfg( not( target_arch = "x86_64" ) ) ]
1353 match self {
1454 PerfEvent :: CpuCycles => "cpu-cycles" ,
1555 PerfEvent :: L1DCache => "l1d_cache" ,
@@ -28,10 +68,153 @@ impl PerfEvent {
2868 PerfEvent :: Instructions ,
2969 ]
3070 }
71+
72+ /// Architecture-independent name for this slot in samply profiles.
73+ ///
74+ /// samply labels each extra-event column with the name we pass it, so
75+ /// every architecture shares one name per slot and parsers match on it
76+ /// directly — unlike the perf integration, where columns carry the
77+ /// arch-specific event names of [`Self::perf_strings`].
78+ pub fn samply_name ( & self ) -> & ' static str {
79+ match self {
80+ PerfEvent :: CpuCycles => "cpu-cycles" ,
81+ PerfEvent :: L1DCache => "l1d-cache" ,
82+ PerfEvent :: L2DCache => "l2d-cache" ,
83+ PerfEvent :: CacheMisses => "cache-misses" ,
84+ PerfEvent :: Instructions => "instructions" ,
85+ }
86+ }
87+
88+ /// The `<name>:<type>:<config>` spec for samply's `--perf-events`,
89+ /// resolving this slot to a concrete PMU event of the CPU we are running
90+ /// on. `None` when the slot has no suitable backing event on this CPU.
91+ pub fn to_samply_spec ( & self ) -> Option < String > {
92+ let ( event_type, config) = self . perf_event_attr ( ) ?;
93+ Some ( format ! (
94+ "{}:{}:{:#x}" ,
95+ self . samply_name( ) ,
96+ event_type,
97+ config
98+ ) )
99+ }
100+
101+ /// The `perf_event_attr` `(type, config)` encoding backing this slot on
102+ /// the current CPU.
103+ fn perf_event_attr ( & self ) -> Option < ( u32 , u64 ) > {
104+ // perf_event_attr type values from <linux/perf_event.h>.
105+ const PERF_TYPE_HARDWARE : u32 = 0 ;
106+ const PERF_TYPE_RAW : u32 = 4 ;
107+ match self {
108+ // Generalized hardware events, portable across architectures.
109+ PerfEvent :: CpuCycles => Some ( ( PERF_TYPE_HARDWARE , 0 ) ) ,
110+ PerfEvent :: Instructions => Some ( ( PERF_TYPE_HARDWARE , 1 ) ) ,
111+ _ => Some ( ( PERF_TYPE_RAW , self . raw_cache_config ( ) ?) ) ,
112+ }
113+ }
114+
115+ /// Raw PMU encoding of this cache slot on x86_64: `umask << 8 | event`.
116+ ///
117+ /// Only Intel has a vetted selection; other vendors get no cache events.
118+ /// The events are picked so that each slot counts demand traffic of one
119+ /// consistent population, keeping the derived hit counts
120+ /// (`L1DCache - L2DCache`, `L2DCache - CacheMisses`) from underflowing
121+ /// the way mixed-population events (e.g. loads vs. all-cause line fills)
122+ /// can in store- or prefetch-heavy code.
123+ #[ cfg( target_arch = "x86_64" ) ]
124+ fn raw_cache_config ( & self ) -> Option < u64 > {
125+ if !is_genuine_intel ( ) {
126+ return None ;
127+ }
128+ // Retired load instructions, by the cache level that served them:
129+ // MEM_INST_RETIRED.ALL_LOADS, MEM_LOAD_RETIRED.L1_MISS and
130+ // MEM_LOAD_RETIRED.L3_MISS. Demand loads only (stores and prefetches
131+ // don't count), encodings stable since Skylake.
132+ match self {
133+ PerfEvent :: L1DCache => Some ( 0x81d0 ) ,
134+ PerfEvent :: L2DCache => Some ( 0x08d1 ) ,
135+ PerfEvent :: CacheMisses => Some ( 0x20d1 ) ,
136+ _ => None ,
137+ }
138+ }
139+
/// Resolves this cache slot to its raw arm64 PMU encoding: the architected
/// PMU event number (Arm ARM D8.11). Non-cache slots yield `None`.
#[cfg(target_arch = "aarch64")]
fn raw_cache_config(&self) -> Option<u64> {
    // L1 data cache accesses, loads and stores.
    const L1D_CACHE: u64 = 0x04;
    // L1D line fills. Defined against the same access population as
    // L1D_CACHE — unlike L2D_CACHE, which also counts L1 write-backs,
    // instruction-side refills and table walks, and counts lines where
    // L1D_CACHE counts operations — so the `L1DCache - L2DCache` hit
    // derivation stays sound.
    const L1D_CACHE_REFILL: u64 = 0x03;
    // Refills of L2 or L1 from outside those caches. On the Cortex-A72
    // macro-runner fleet (a1.metal) there is no L3, so these are trips to
    // DRAM. Includes instruction-side refills, so it can exceed
    // L1D_CACHE_REFILL in icache-missing code; the derived hit counts
    // saturate against that.
    const L2D_CACHE_REFILL: u64 = 0x17;

    match *self {
        Self::L1DCache => Some(L1D_CACHE),
        Self::L2DCache => Some(L1D_CACHE_REFILL),
        Self::CacheMisses => Some(L2D_CACHE_REFILL),
        Self::CpuCycles | Self::Instructions => None,
    }
}
162+
/// Fallback for architectures other than x86_64 and aarch64: no raw cache
/// event is defined there, so every slot resolves to `None`.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn raw_cache_config(&self) -> Option<u64> {
    None
}
167+ }
168+
/// Reports whether the CPU identifies itself with the `GenuineIntel` vendor
/// string.
///
/// CPUID leaf 0 returns the 12-byte vendor string split across EBX, EDX and
/// ECX, in that order, four little-endian bytes per register.
#[cfg(target_arch = "x86_64")]
fn is_genuine_intel() -> bool {
    use std::arch::x86_64::__cpuid;

    // "GenuineIntel" cut into the three register-sized words CPUID reports.
    const VENDOR_EBX: u32 = u32::from_le_bytes(*b"Genu");
    const VENDOR_EDX: u32 = u32::from_le_bytes(*b"ineI");
    const VENDOR_ECX: u32 = u32::from_le_bytes(*b"ntel");

    // SAFETY: the CPUID instruction and its leaf 0 are available on every
    // x86_64 processor.
    let leaf0 = unsafe { __cpuid(0) };
    (leaf0.ebx, leaf0.edx, leaf0.ecx) == (VENDOR_EBX, VENDOR_EDX, VENDOR_ECX)
}
32180
33181impl std:: fmt:: Display for PerfEvent {
34182 fn fmt ( & self , f : & mut std:: fmt:: Formatter < ' _ > ) -> std:: fmt:: Result {
35183 write ! ( f, "{}" , self . to_perf_string( ) )
36184 }
37185}
186+
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;

    /// The generalized hardware events resolve to a spec without depending on
    /// any per-architecture raw encoding.
    #[test]
    fn portable_slots_have_samply_specs() {
        let expected = [
            (PerfEvent::CpuCycles, "cpu-cycles:0:0x0"),
            (PerfEvent::Instructions, "instructions:0:0x1"),
        ];
        for (event, spec) in expected {
            assert_eq!(event.to_samply_spec().as_deref(), Some(spec));
        }
    }

    /// Parsers match columns on the samply name, so no two slots may share one.
    #[test]
    fn samply_names_are_unique() {
        let distinct: HashSet<&'static str> = PerfEvent::all_events()
            .iter()
            .map(|event| event.samply_name())
            .collect();
        assert_eq!(distinct.len(), PerfEvent::all_events().len());
    }

    /// Diagnostic only: dumps the spec each slot resolves to on the machine
    /// running the tests (visible with `--nocapture`). Asserts nothing.
    #[test]
    fn print_specs_for_this_host() {
        PerfEvent::all_events()
            .iter()
            .for_each(|event| println!("{event:?} -> {:?}", event.to_samply_spec()));
    }
}
0 commit comments