Skip to content

Commit ca886e6

Browse files
feat(walltime): capture per-arch hardware events with the samply profiler
Set SAMPLY_PERF_EVENTS when wrapping the benchmark command with samply, so the profile carries per-sample hardware event deltas alongside the stack samples, mirroring the event capture the perf profiler does. Make PerfEvent's perf names architecture-aware: each variant is a semantic slot of the cache model, backed by the architected PMU events on arm64 (l1d_cache, l2d_cache, l2d_cache_refill, as before) and by the generalized cache events on x86_64 (L1-dcache-loads, L1-dcache-load-misses, cache-misses), where no combined L2 event is exposed. This also gives the perf profiler a working event set on x86_64, which previously disabled event sampling entirely. samply degrades gracefully to cycles-only sampling when the PMU can't deliver the events, so the env var is set unconditionally on Linux. Refs COD-2810 Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 7002d2f commit ca886e6

2 files changed

Lines changed: 198 additions & 0 deletions

File tree

crates/runner-shared/src/perf_event.rs

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,55 @@
11
/// Subset of perf events that CodSpeed supports.
///
/// Each variant is a semantic slot of the cache/execution model rather than a
/// fixed hardware event; the concrete perf event chosen for it depends on the
/// architecture (see [`Self::to_perf_string`]).
///
/// The enum is fieldless, so besides being `Copy` it can be compared and
/// hashed: slots are usable in assertions and as map or set keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PerfEvent {
    /// CPU cycle count (the `cpu-cycles` perf event).
    CpuCycles,
    /// L1 data cache accesses.
    L1DCache,
    /// Accesses one level below L1: what L1 misses spill into. Hits in L1 are
    /// derived as `L1DCache - L2DCache`.
    L2DCache,
    /// Misses out of the last profiled cache level (i.e. trips to memory).
    /// Hits below L1 are derived as `L2DCache - CacheMisses`.
    CacheMisses,
    /// Instruction count (the `instructions` perf event).
    Instructions,
}
1019

1120
impl PerfEvent {
21+
/// Every perf event name that can back this slot, across all supported
22+
/// architectures. For parsers, which must handle profiles recorded on any
23+
/// architecture regardless of where they run.
24+
pub fn perf_strings(&self) -> &'static [&'static str] {
25+
match self {
26+
PerfEvent::CpuCycles => &["cpu-cycles"],
27+
PerfEvent::L1DCache => &["l1d_cache", "L1-dcache-loads"],
28+
PerfEvent::L2DCache => &["l2d_cache", "L1-dcache-load-misses"],
29+
PerfEvent::CacheMisses => &["l2d_cache_refill", "cache-misses"],
30+
PerfEvent::Instructions => &["instructions"],
31+
}
32+
}
33+
34+
/// The perf event name backing this slot on the current architecture.
35+
///
36+
/// On arm64 these are the architected PMU events (resolved through sysfs):
37+
/// `l2d_cache` counts all L2 accesses and `l2d_cache_refill` its misses.
38+
/// On x86_64 there is no generalized combined L2 event, so the slots are
39+
/// backed by the generalized cache events: L1 read misses stand in for
40+
/// "accesses below L1", and `cache-misses` (last-level misses) for trips
41+
/// to memory — lumping L2 and L3 hits together in the derived
42+
/// `L2DCache - CacheMisses`.
1243
pub fn to_perf_string(&self) -> &'static str {
44+
#[cfg(target_arch = "x86_64")]
45+
match self {
46+
PerfEvent::CpuCycles => "cpu-cycles",
47+
PerfEvent::L1DCache => "L1-dcache-loads",
48+
PerfEvent::L2DCache => "L1-dcache-load-misses",
49+
PerfEvent::CacheMisses => "cache-misses",
50+
PerfEvent::Instructions => "instructions",
51+
}
52+
#[cfg(not(target_arch = "x86_64"))]
1353
match self {
1454
PerfEvent::CpuCycles => "cpu-cycles",
1555
PerfEvent::L1DCache => "l1d_cache",
@@ -28,10 +68,153 @@ impl PerfEvent {
2868
PerfEvent::Instructions,
2969
]
3070
}
71+
72+
/// Architecture-independent name for this slot in samply profiles.
73+
///
74+
/// samply labels each extra-event column with the name we pass it, so
75+
/// every architecture shares one name per slot and parsers match on it
76+
/// directly — unlike the perf integration, where columns carry the
77+
/// arch-specific event names of [`Self::perf_strings`].
78+
pub fn samply_name(&self) -> &'static str {
79+
match self {
80+
PerfEvent::CpuCycles => "cpu-cycles",
81+
PerfEvent::L1DCache => "l1d-cache",
82+
PerfEvent::L2DCache => "l2d-cache",
83+
PerfEvent::CacheMisses => "cache-misses",
84+
PerfEvent::Instructions => "instructions",
85+
}
86+
}
87+
88+
/// The `<name>:<type>:<config>` spec for samply's `--perf-events`,
89+
/// resolving this slot to a concrete PMU event of the CPU we are running
90+
/// on. `None` when the slot has no suitable backing event on this CPU.
91+
pub fn to_samply_spec(&self) -> Option<String> {
92+
let (event_type, config) = self.perf_event_attr()?;
93+
Some(format!(
94+
"{}:{}:{:#x}",
95+
self.samply_name(),
96+
event_type,
97+
config
98+
))
99+
}
100+
101+
/// The `perf_event_attr` `(type, config)` encoding backing this slot on
102+
/// the current CPU.
103+
fn perf_event_attr(&self) -> Option<(u32, u64)> {
104+
// perf_event_attr type values from <linux/perf_event.h>.
105+
const PERF_TYPE_HARDWARE: u32 = 0;
106+
const PERF_TYPE_RAW: u32 = 4;
107+
match self {
108+
// Generalized hardware events, portable across architectures.
109+
PerfEvent::CpuCycles => Some((PERF_TYPE_HARDWARE, 0)),
110+
PerfEvent::Instructions => Some((PERF_TYPE_HARDWARE, 1)),
111+
_ => Some((PERF_TYPE_RAW, self.raw_cache_config()?)),
112+
}
113+
}
114+
115+
/// Raw PMU encoding of this cache slot on x86_64: `umask << 8 | event`.
116+
///
117+
/// Only Intel has a vetted selection; other vendors get no cache events.
118+
/// The events are picked so that each slot counts demand traffic of one
119+
/// consistent population, keeping the derived hit counts
120+
/// (`L1DCache - L2DCache`, `L2DCache - CacheMisses`) from underflowing
121+
/// the way mixed-population events (e.g. loads vs. all-cause line fills)
122+
/// can in store- or prefetch-heavy code.
123+
#[cfg(target_arch = "x86_64")]
124+
fn raw_cache_config(&self) -> Option<u64> {
125+
if !is_genuine_intel() {
126+
return None;
127+
}
128+
// Retired load instructions, by the cache level that served them:
129+
// MEM_INST_RETIRED.ALL_LOADS, MEM_LOAD_RETIRED.L1_MISS and
130+
// MEM_LOAD_RETIRED.L3_MISS. Demand loads only (stores and prefetches
131+
// don't count), encodings stable since Skylake.
132+
match self {
133+
PerfEvent::L1DCache => Some(0x81d0),
134+
PerfEvent::L2DCache => Some(0x08d1),
135+
PerfEvent::CacheMisses => Some(0x20d1),
136+
_ => None,
137+
}
138+
}
139+
140+
/// Raw PMU encoding of this cache slot on arm64: the architected PMU event
/// number (Arm ARM D8.11).
///
/// Returns `None` for the slots that are not cache slots.
#[cfg(target_arch = "aarch64")]
fn raw_cache_config(&self) -> Option<u64> {
    // L1D_CACHE: L1 data cache accesses, loads and stores.
    const L1D_CACHE: u64 = 0x04;
    // L1D_CACHE_REFILL: L1D line fills. It is defined against the same
    // access population as L1D_CACHE, which keeps the `L1DCache - L2DCache`
    // hit derivation sound. L2D_CACHE would not: it also counts L1
    // write-backs, instruction-side refills and table walks, and it counts
    // lines where L1D_CACHE counts operations.
    const L1D_CACHE_REFILL: u64 = 0x03;
    // L2D_CACHE_REFILL: refills of L2 or L1 from outside those caches. On
    // the Cortex-A72 macro-runner fleet (a1.metal) there is no L3, so these
    // are trips to DRAM. It includes instruction-side refills, so it can
    // exceed L1D_CACHE_REFILL in icache-missing code; the derived hit counts
    // saturate against that.
    const L2D_CACHE_REFILL: u64 = 0x17;

    match *self {
        Self::L1DCache => Some(L1D_CACHE),
        Self::L2DCache => Some(L1D_CACHE_REFILL),
        Self::CacheMisses => Some(L2D_CACHE_REFILL),
        // Not cache slots: these are served by generalized events.
        Self::CpuCycles | Self::Instructions => None,
    }
}
162+
163+
/// Fallback for architectures other than x86_64 and aarch64: no raw cache
/// encodings are defined for them, so every slot resolves to `None`.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn raw_cache_config(&self) -> Option<u64> {
    None
}
167+
}
168+
169+
/// Reports whether the CPU identifies itself with the `GenuineIntel` vendor
/// string.
///
/// CPUID leaf 0 returns the 12-byte vendor string split across three
/// registers, in the order EBX, EDX, ECX, each holding four bytes in
/// little-endian order.
#[cfg(target_arch = "x86_64")]
fn is_genuine_intel() -> bool {
    use std::arch::x86_64::__cpuid;

    // SAFETY: the CPUID instruction is always available on x86_64, and leaf 0
    // is the basic leaf every implementation provides.
    let leaf0 = unsafe { __cpuid(0) };

    [leaf0.ebx, leaf0.edx, leaf0.ecx]
        .iter()
        .flat_map(|register| register.to_le_bytes())
        .eq(b"GenuineIntel".iter().copied())
}
32180

33181
impl std::fmt::Display for PerfEvent {
34182
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
35183
write!(f, "{}", self.to_perf_string())
36184
}
37185
}
186+
187+
#[cfg(test)]
mod tests {
    use super::*;

    /// The generalized hardware slots must resolve on every host, with the
    /// exact `<name>:<type>:<config>` rendering.
    #[test]
    fn portable_slots_have_samply_specs() {
        assert_eq!(
            PerfEvent::CpuCycles.to_samply_spec().as_deref(),
            Some("cpu-cycles:0:0x0")
        );
        assert_eq!(
            PerfEvent::Instructions.to_samply_spec().as_deref(),
            Some("instructions:0:0x1")
        );
    }

    /// Parsers match samply columns by name, so no two slots may share one.
    #[test]
    fn samply_names_are_unique() {
        let mut names: Vec<_> = PerfEvent::all_events()
            .iter()
            .map(|event| event.samply_name())
            .collect();
        let total = names.len();
        names.sort_unstable();
        names.dedup();
        assert_eq!(names.len(), total);
    }

    /// Prints the specs resolved for this host (visible with `--nocapture`)
    /// and checks that each one is well-formed. Which cache slots resolve
    /// depends on the CPU, so only the shape of present specs is asserted.
    #[test]
    fn print_specs_for_this_host() {
        for event in PerfEvent::all_events().iter() {
            let spec = event.to_samply_spec();
            println!("{event:?} -> {spec:?}");

            if let Some(spec) = spec {
                let fields: Vec<&str> = spec.split(':').collect();
                assert_eq!(fields.len(), 3, "malformed spec {spec:?}");
                assert_eq!(fields[0], event.samply_name());
                assert!(
                    fields[1].parse::<u32>().is_ok(),
                    "event type is not a decimal u32 in {spec:?}"
                );
                let config_digits = fields[2].strip_prefix("0x");
                assert!(
                    config_digits.map_or(false, |hex| u64::from_str_radix(hex, 16).is_ok()),
                    "config is not 0x-prefixed hexadecimal in {spec:?}"
                );
            }
        }
    }
}

src/executor/wall_time/profiler/samply/mod.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,21 @@ impl Profiler for SamplyProfiler {
117117
),
118118
]);
119119

120+
// Extra hardware events to capture alongside the sampling event,
121+
// stored by samply as per-sample delta columns in the profile, as
122+
// `<name>:<type>:<config>` specs resolved for the CPU we run on.
123+
// samply degrades gracefully to cycles-only sampling when the PMU
124+
// can't deliver them, so this is safe to request unconditionally.
125+
// Linux only: the events go through perf_event_open.
126+
#[cfg(target_os = "linux")]
127+
cmd_builder.env(
128+
"SAMPLY_PERF_EVENTS",
129+
runner_shared::perf_event::PerfEvent::all_events()
130+
.iter()
131+
.filter_map(|event| event.to_samply_spec())
132+
.join(","),
133+
);
134+
120135
// If `setup` decided the bash on PATH is Apple-signed, prepend brew's
121136
// bin so samply's spawned shell resolves to the ad-hoc-signed brew bash
122137
// instead. Only the samply child's PATH is touched.

0 commit comments

Comments
 (0)