Skip to content

Commit 5c3fd1c

Browse files
feat(runner-shared): update PerfEvent to also support samply format
1 parent 1325537 commit 5c3fd1c

4 files changed

Lines changed: 155 additions & 1 deletion

File tree

Cargo.lock

Lines changed: 2 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ serde_json = "1.0"
103103
serde = { version = "1.0.228", features = ["derive"] }
104104
ipc-channel = "0.20"
105105
itertools = "0.14.0"
106+
linux-perf-event-reader = "0.10.2" # matches the version linux-perf-data resolves to
106107
env_logger = "0.11.10"
107108
tempfile = "3.27.0"
108109
object = { version = "0.39", default-features = false, features = ["read_core", "elf"] }

crates/runner-shared/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ serde_json = { workspace = true }
1111
# Pinned to 1.x: 2.0 changes the wire format and serde integration
1212
bincode = "1.3"
1313
itertools = { workspace = true }
14+
linux-perf-event-reader = { workspace = true }
1415
log = { workspace = true }
1516
rmp = "0.8.15"
1617
rmp-serde = "1.3.1"

crates/runner-shared/src/perf_event.rs

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,28 @@
1+
use linux_perf_event_reader::constants::{
2+
PERF_COUNT_HW_CPU_CYCLES, PERF_COUNT_HW_INSTRUCTIONS, PERF_TYPE_HARDWARE, PERF_TYPE_RAW,
3+
};
4+
15
/// Subset of perf events that CodSpeed supports.
///
/// Each variant is a semantic slot of the cache/execution model, named by
/// [`Self::to_perf_string`] and backed by a concrete PMU event resolved for
/// the current CPU (see [`Self::to_samply_spec`]).
///
/// The enum is fieldless, so besides being `Copy` it is comparable and
/// hashable: variants can be checked with `==` / `assert_eq!` and used as
/// keys of a `HashMap` or members of a `HashSet`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PerfEvent {
    /// CPU cycles (the generalized hardware `cpu-cycles` event).
    CpuCycles,
    /// L1 data cache accesses.
    L1DCache,
    /// Accesses one level below L1: what L1 misses spill into. Hits in L1 are
    /// derived as `L1DCache - L2DCache`.
    L2DCache,
    /// Misses out of the last profiled cache level (i.e. trips to memory).
    /// Hits below L1 are derived as `L2DCache - CacheMisses`.
    CacheMisses,
    /// Instruction count (the generalized hardware `instructions` event).
    Instructions,
}
1023

1124
impl PerfEvent {
25+
/// The event name backing this slot.
1226
pub fn to_perf_string(&self) -> &'static str {
1327
match self {
1428
PerfEvent::CpuCycles => "cpu-cycles",
@@ -28,10 +42,147 @@ impl PerfEvent {
2842
PerfEvent::Instructions,
2943
]
3044
}
45+
46+
/// The `<name>:<type>:<config>` spec for samply's `--perf-events`,
47+
/// resolving this slot to a concrete PMU event of the CPU we are running
48+
/// on.
49+
///
50+
/// `None` when the slot has no suitable backing event on this CPU.
51+
/// The column is labelled with [`Self::to_perf_string`] so samply profiles
52+
/// carry the same event names as perf ones and parse through one path.
53+
pub fn to_samply_spec(&self) -> Option<String> {
54+
let (event_type, config) = self.perf_event_attr()?;
55+
Some(format!(
56+
"{}:{}:{:#x}",
57+
self.to_perf_string(),
58+
event_type,
59+
config
60+
))
61+
}
62+
63+
/// The `perf_event_attr` `(type, config)` encoding backing this slot on
64+
/// the current CPU.
65+
fn perf_event_attr(&self) -> Option<(u32, u64)> {
66+
match self {
67+
// Generalized hardware events, portable across architectures.
68+
PerfEvent::CpuCycles => Some((PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES.into())),
69+
PerfEvent::Instructions => {
70+
Some((PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS.into()))
71+
}
72+
_ => Some((PERF_TYPE_RAW, self.raw_cache_config()?)),
73+
}
74+
}
75+
76+
/// Raw PMU encoding of this cache slot on x86_64: `umask << 8 | event`,
77+
/// the layout the kernel expects in `perf_event_attr.config` for
78+
/// `PERF_TYPE_RAW`.
79+
///
80+
/// Only Intel has a vetted selection; other vendors get no cache events.
81+
/// EventCode/UMask come from Intel's perfmon tables, listed per mnemonic in
82+
/// the Skylake-X core event file
83+
/// (<https://github.com/intel/perfmon/blob/main/SKX/events/skylakex_core.json>),
84+
/// stable since Skylake.
85+
#[cfg(target_arch = "x86_64")]
86+
fn raw_cache_config(&self) -> Option<u64> {
87+
if !is_genuine_intel() {
88+
// Not tested on AMD or other x86_64 vendors yet
89+
return None;
90+
}
91+
// Retired load instructions, by the cache level that served them
92+
// (demand loads only; stores and prefetches don't count).
93+
match self {
94+
// MEM_INST_RETIRED.ALL_LOADS: 0xD0 | 0x81 << 8
95+
PerfEvent::L1DCache => Some(0x81d0),
96+
// MEM_LOAD_RETIRED.L1_MISS: 0xD1 | 0x08 << 8
97+
PerfEvent::L2DCache => Some(0x08d1),
98+
// MEM_LOAD_RETIRED.L3_MISS: 0xD1 | 0x20 << 8
99+
PerfEvent::CacheMisses => Some(0x20d1),
100+
_ => None,
101+
}
102+
}
103+
104+
/// Raw PMU encoding of this cache slot on arm64: the architected PMU event
/// number, passed unchanged as `perf_event_attr.config` for `PERF_TYPE_RAW`.
///
/// These are common (architected) event numbers, listed per mnemonic in
/// Arm's PMU event table for the Cortex-A72 fleet
/// (<https://github.com/ARM-software/data/blob/master/pmu/cortex-a72.json>).
///
/// Returns `None` for slots that are not cache slots.
#[cfg(target_arch = "aarch64")]
fn raw_cache_config(&self) -> Option<u64> {
    // L1D_CACHE: L1 data cache accesses, loads and stores.
    const L1D_CACHE: u64 = 0x04;
    // L1D_CACHE_REFILL: L1D line fills. Defined against the same access
    // population as L1D_CACHE — unlike L2D_CACHE, which also counts L1
    // write-backs, instruction-side refills and table walks, and counts
    // lines where L1D_CACHE counts operations — so the
    // `L1DCache - L2DCache` hit derivation stays sound.
    const L1D_CACHE_REFILL: u64 = 0x03;
    // L2D_CACHE_REFILL: refills of L2 or L1 from outside those caches. On
    // the Cortex-A72 macro-runner fleet (a1.metal) there is no L3, so these
    // are trips to DRAM. Includes instruction-side refills, so it can exceed
    // L1D_CACHE_REFILL in icache-missing code; the derived hit counts
    // saturate against that.
    const L2D_CACHE_REFILL: u64 = 0x17;

    let event_number = match self {
        PerfEvent::L1DCache => L1D_CACHE,
        PerfEvent::L2DCache => L1D_CACHE_REFILL,
        PerfEvent::CacheMisses => L2D_CACHE_REFILL,
        _ => return None,
    };
    Some(event_number)
}
130+
131+
/// Fallback for targets other than x86_64 and aarch64: no raw PMU encoding
/// is defined there, so every slot resolves to `None`.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn raw_cache_config(&self) -> Option<u64> {
    // No event selection exists for this architecture.
    None
}
135+
}
136+
137+
/// Reports whether the CPU identifies itself with the `GenuineIntel` vendor
/// string.
///
/// The vendor string is read from CPUID leaf 0 on the first call and the
/// answer is cached for the rest of the process, so repeated lookups (one
/// per cache slot) do not re-execute CPUID.
#[cfg(target_arch = "x86_64")]
fn is_genuine_intel() -> bool {
    use std::arch::x86_64::__cpuid;
    use std::sync::OnceLock;

    static IS_GENUINE_INTEL: OnceLock<bool> = OnceLock::new();

    *IS_GENUINE_INTEL.get_or_init(|| {
        // CPUID leaf 0: vendor string in EBX,EDX,ECX.
        // SAFETY: the CPUID instruction is part of the x86_64 baseline, so
        // it is available on every CPU this function is compiled for, and
        // querying leaf 0 has no side effects.
        let leaf0 = unsafe { __cpuid(0) };
        let mut vendor = [0u8; 12];
        vendor[0..4].copy_from_slice(&leaf0.ebx.to_le_bytes());
        vendor[4..8].copy_from_slice(&leaf0.edx.to_le_bytes());
        vendor[8..12].copy_from_slice(&leaf0.ecx.to_le_bytes());
        &vendor == b"GenuineIntel"
    })
}
32148

33149
impl std::fmt::Display for PerfEvent {
34150
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
35151
write!(f, "{}", self.to_perf_string())
36152
}
37153
}
154+
155+
#[cfg(test)]
mod tests {
    use super::*;

    /// Cycles and instructions are backed by generalized hardware events, so
    /// their specs are the same on every host.
    #[test]
    fn portable_slots_have_samply_specs() {
        assert_eq!(
            PerfEvent::CpuCycles.to_samply_spec().as_deref(),
            Some("cpu-cycles:0:0x0")
        );
        assert_eq!(
            PerfEvent::Instructions.to_samply_spec().as_deref(),
            Some("instructions:0:0x1")
        );
    }

    /// Event names label the profile columns, so two slots must never share
    /// one.
    #[test]
    fn event_names_are_unique() {
        let mut names: Vec<_> = PerfEvent::all_events()
            .iter()
            .map(|event| event.to_perf_string())
            .collect();
        names.sort_unstable();
        names.dedup();
        assert_eq!(names.len(), PerfEvent::all_events().len());
    }

    /// Prints the spec each slot resolves to on this host (visible with
    /// `--nocapture`) and checks the shape of every spec that did resolve:
    /// it starts with the slot's event name and ends with a hexadecimal
    /// config.
    #[test]
    fn print_specs_for_this_host() {
        for event in PerfEvent::all_events() {
            let spec = event.to_samply_spec();
            println!("{event:?} -> {spec:?}");

            // Slots without a backing event on this CPU have nothing to check.
            if let Some(spec) = spec {
                let name_prefix = format!("{}:", event.to_perf_string());
                assert!(
                    spec.starts_with(&name_prefix),
                    "spec {spec:?} does not start with {name_prefix:?}"
                );

                // The config is the last `:`-separated field, formatted with `{:#x}`.
                let hex_digits = spec
                    .rsplit(':')
                    .next()
                    .and_then(|config| config.strip_prefix("0x"));
                assert!(
                    matches!(
                        hex_digits.map(|digits| u64::from_str_radix(digits, 16)),
                        Some(Ok(_))
                    ),
                    "spec {spec:?} does not end with a hexadecimal config"
                );
            }
        }
    }
}

0 commit comments

Comments
 (0)