Skip to content

Commit 96cc7ba

Browse files
committed
fix(benchmark): improve measurement integrity and CI diagnostics
1 parent 352c3ff commit 96cc7ba

6 files changed

Lines changed: 129 additions & 25 deletions

File tree

.github/workflows/benchmark.yml

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -145,16 +145,39 @@ jobs:
145145
print(f' ANOVA p-value: {sec.get(\"anova_p_value\")}')
146146
print(f' ANOVA F-stat: {sec.get(\"anova_f_stat\")}')
147147
print(f' Leakage bits: {sec.get(\"leakage_bits\")}\n')
148+
149+
diag = r.get('diagnostics', {})
150+
print(' Top opcode deltas:')
151+
for row in diag.get('top_delta_opcodes', [])[:5]:
152+
print(
153+
f' {row.get(\"name\", row.get(\"opcode\", \"?\"))}: '
154+
f'ns/DU={row.get(\"ns_per_du\")!r} '
155+
f'delta={row.get(\"delta_ns\"):+.17g} '
156+
f'stddev={row.get(\"stddev_ns\")!r} '
157+
f'n={row.get(\"iterations\")}'
158+
)
159+
print(' Category spans:')
160+
for row in diag.get('category_summary', []):
161+
print(
162+
f' cat{row.get(\"category\")}: '
163+
f'mean={row.get(\"mean_ns_per_du\")!r} '
164+
f'span={row.get(\"span_ns_per_du\")!r} '
165+
f'min={row.get(\"min_ns_per_du\")!r} '
166+
f'max={row.get(\"max_ns_per_du\")!r}'
167+
)
168+
print()
148169
except Exception as e:
149170
print(f'Error reading metrics: {e}')
150171
"
151172
152-
# ── Upload clean result ──
153-
- name: Upload clean JSON
173+
# ── Upload raw + clean result ──
174+
- name: Upload benchmark JSON
154175
uses: actions/upload-artifact@v4
155176
with:
156177
name: bench-${{ matrix.platform }}
157-
path: bench_clean.json
178+
path: |
179+
bench_raw.json
180+
bench_clean.json
158181
159182
# ── Store in gh-pages (push to main/dev only — becomes baseline) ──
160183
- name: Convert for chart

runtime/bench/bench_output.hpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,15 @@ inline void emit_json(const char* commit,
2828
std::printf(" \"policy\": \"%s\",\n", policy);
2929
std::printf(" \"oram\": \"%s\"\n", oram);
3030
std::printf(" },\n");
31-
std::printf(" \"baseline_ns_per_du\": %.2f,\n", baseline_ns_per_insn);
31+
std::printf(" \"baseline_ns_per_du\": %.17g,\n", baseline_ns_per_insn);
3232
std::printf(" \"results\": [\n");
3333

3434
for (size_t i = 0; i < results.size(); ++i) {
3535
const auto& r = results[i];
3636
std::printf(" {\n");
37+
std::printf(" \"name\": \"%s\",\n", r.name);
3738
std::printf(" \"opcode\": \"%s\",\n", to_string(r.opcode));
39+
std::printf(" \"is_baseline\": %s,\n", r.is_baseline ? "true" : "false");
3840
std::printf(" \"category\": %u,\n", r.category);
3941
std::printf(" \"du_count\": %u,\n", r.insn_count);
4042
std::printf(" \"iterations\": %u,\n", r.iterations);
@@ -44,11 +46,11 @@ inline void emit_json(const char* commit,
4446
static_cast<unsigned long long>(r.median_ns));
4547
std::printf(" \"p95_ns\": %llu,\n",
4648
static_cast<unsigned long long>(r.p95_ns));
47-
std::printf(" \"mean_ns\": %.2f,\n", r.mean_ns);
48-
std::printf(" \"stddev_ns\": %.2f,\n", r.stddev_ns);
49-
std::printf(" \"ns_per_du\": %.2f,\n", r.ns_per_insn);
50-
std::printf(" \"delta_ns\": %.2f,\n", r.handler_ns);
51-
std::printf(" \"du_per_sec\": %.0f,\n", r.ips);
49+
std::printf(" \"mean_ns\": %.17g,\n", r.mean_ns);
50+
std::printf(" \"stddev_ns\": %.17g,\n", r.stddev_ns);
51+
std::printf(" \"ns_per_du\": %.17g,\n", r.ns_per_insn);
52+
std::printf(" \"delta_ns\": %.17g,\n", r.handler_ns);
53+
std::printf(" \"du_per_sec\": %.17g,\n", r.ips);
5254
std::printf(" \"samples\": [");
5355
for (size_t j = 0; j < r.samples.size(); ++j) {
5456
std::printf("%llu%s", static_cast<unsigned long long>(r.samples[j]),

runtime/bench/bench_stats.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ struct BenchResult {
1616
const char* name = "";
1717
VmOpcode opcode = VmOpcode::NOP;
1818
uint8_t category = 0;
19+
bool is_baseline = false;
1920
uint32_t insn_count = 0;
2021
uint32_t iterations = 0;
2122
uint64_t min_ns = 0;
@@ -36,6 +37,7 @@ inline BenchResult compute_stats(const char* name, VmOpcode opcode,
3637
r.name = name;
3738
r.opcode = opcode;
3839
r.category = Common::VM::vm_opcode_category(opcode);
40+
r.is_baseline = (name != nullptr && std::string(name) == "NOP_BASELINE");
3941
r.insn_count = insn_count;
4042
r.iterations = static_cast<uint32_t>(samples.size());
4143

runtime/bench/program_factory.cpp

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,15 @@ static TestBB make_nop_bb(uint32_t bb_id, uint8_t epoch_base, uint32_t N) {
7878
return bb;
7979
}
8080

81+
/// Build one tail BB that halts immediately.
82+
static TestBB make_halt_bb(uint32_t bb_id, uint8_t epoch_base, uint32_t N) {
83+
auto bb = make_bb(bb_id, epoch_base);
84+
bb.instructions.push_back({VmOpcode::HALT, f_none(), 0, 0, 0});
85+
for (uint32_t i = 1; i < N; ++i)
86+
bb.instructions.push_back({VmOpcode::NOP, f_none(), 0, 0, 0});
87+
return bb;
88+
}
89+
8190
/// Build setup BBs of exactly N insns each.
8291
///
8392
/// For N≥2: one BB with setup_insns + NOP padding + JMP (fits in N insns).
@@ -267,6 +276,8 @@ DUBenchProgram build_du_program(const OpcodeBenchSpec& spec,
267276
}
268277
}
269278

279+
const uint32_t halt_bb_id = next_bb_id + K;
280+
270281
// ── Build K measured BBs ────────────────────────────────────────
271282
for (uint32_t i = 0; i < K; ++i) {
272283
uint32_t bb_id = next_bb_id++;
@@ -275,7 +286,7 @@ DUBenchProgram build_du_program(const OpcodeBenchSpec& spec,
275286
if (is_branch) {
276287
// Branch at end of DU, target = next BB
277288
TestInstruction br = real_insn;
278-
br.aux = (i + 1 < K) ? (bb_id + 1) : bb_id; // last BB: self (will HALT)
289+
br.aux = (i + 1 < K) ? (bb_id + 1) : halt_bb_id;
279290
bbs.push_back(make_measured_bb(bb_id, epoch_base, br, N, true));
280291
} else if (spec.setup == Setup::Pool) {
281292
TestInstruction lc = real_insn;
@@ -290,11 +301,8 @@ DUBenchProgram build_du_program(const OpcodeBenchSpec& spec,
290301
}
291302
}
292303

293-
// Replace last measured BB's last instruction with HALT
294-
if (!bbs.empty()) {
295-
auto& last_bb = bbs.back();
296-
last_bb.instructions.back() = {VmOpcode::HALT, f_none(), 0, 0, 0};
297-
}
304+
// Tail halt BB is never part of the measured K dispatch units.
305+
bbs.push_back(make_halt_bb(halt_bb_id, 0xC0, N));
298306

299307
// ── Build native call transition entries ─────────────────────────
300308
if (spec.setup == Setup::NativeCall) {
@@ -324,9 +332,8 @@ DUBenchProgram build_du_baseline(uint32_t K, uint32_t N) {
324332
bbs.push_back(make_nop_bb(bb_id, epoch_base, N));
325333
}
326334

327-
// Last BB: replace last NOP with HALT
328-
if (!bbs.empty())
329-
bbs.back().instructions.back() = {VmOpcode::HALT, f_none(), 0, 0, 0};
335+
// Tail halt BB is never part of the measured K dispatch units.
336+
bbs.push_back(make_halt_bb(K + 1, 0xC0, N));
330337

331338
prog.blob = Test::build_test_blob(prog.seed, bbs, {}, false, {});
332339
return prog;

runtime/bench/runner.hpp

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,26 @@ std::vector<BenchResult> run_all(const RunConfig& cfg) {
6262
if (!sr || *sr == VmResult::Halted) return 0;
6363
}
6464

65-
// Timed: execute remaining (K measured DUs)
65+
// Timed: execute exactly K measured dispatch units.
66+
//
67+
// This avoids pulling the tail HALT BB into the timing window and
68+
// ensures branch benchmarks measure K real branch DUs, not K-1 plus
69+
// one HALT-contaminated DU.
6670
auto t0 = Clock::now();
67-
auto r = engine->execute();
71+
bool ok = true;
72+
for (uint32_t i = 0; i < K; ++i) {
73+
auto r = engine->dispatch_unit();
74+
if (!r) {
75+
ok = false;
76+
break;
77+
}
78+
if (*r == VmResult::Halted) {
79+
ok = false;
80+
break;
81+
}
82+
}
6883
auto t1 = Clock::now();
69-
return r ? Clock::elapsed_ns(t0, t1) : 0;
84+
return ok ? Clock::elapsed_ns(t0, t1) : 0;
7085
};
7186

7287
// Pre-build all programs
@@ -129,7 +144,7 @@ std::vector<BenchResult> run_all(const RunConfig& cfg) {
129144
result.handler_ns = result.ns_per_insn - baseline;
130145
results.push_back(result);
131146

132-
std::fprintf(stderr, " bench %-20s %8.1f ns/DU (Δ: %+.1f ns)\n",
147+
std::fprintf(stderr, " bench %-20s %.17g ns/DU (Δ: %+.17g ns)\n",
133148
names[idx], result.ns_per_insn, result.handler_ns);
134149
}
135150

scripts/bench_analyzer.py

Lines changed: 58 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import sys
1818
import math
1919
import argparse
20+
from collections import defaultdict
2021

2122
def mad_filter(samples, z_thresh=3.5):
2223
"""
@@ -77,9 +78,62 @@ def calculate_stats(samples, insn_count):
7778
"ips": ips
7879
}
7980

81+
def is_baseline_result(result):
    """Return True when ``result`` is the NOP baseline measurement.

    A result counts as the baseline if it carries a truthy ``is_baseline``
    flag, or if either its ``name`` or its ``opcode`` equals
    ``"NOP_BASELINE"``.
    """
    return (
        bool(result.get("is_baseline"))
        or result.get("name") == "NOP_BASELINE"
        or result.get("opcode") == "NOP_BASELINE"
    )


def build_diagnostics(clean_results, baseline_ns_per_du):
    """Summarise per-opcode and per-category timing for CI diagnostics.

    Args:
        clean_results: iterable of result dicts. The keys read here are
            ``name``, ``opcode``, ``category``, ``iterations``,
            ``ns_per_du``, ``delta_ns``, ``stddev_ns`` and ``is_baseline``;
            any of them may be absent or ``None``.
        baseline_ns_per_du: baseline value; only its truthiness is used,
            to fill in ``baseline_source``.

    Returns:
        A dict with:
          * ``baseline_source`` - ``"result"`` when ``baseline_ns_per_du``
            is truthy, otherwise ``"metadata_or_missing"``.
          * ``top_delta_opcodes`` - up to 8 non-baseline rows ordered by
            descending ``|delta_ns|`` (a missing delta counts as 0.0).
          * ``category_summary`` - count/mean/min/max/span of ``ns_per_du``
            per category, baseline rows excluded, ordered by category.
          * ``opcode_summary`` - every row ordered by
            (category, opcode, name); rows without a category come last.
    """
    opcode_rows = []
    category_groups = defaultdict(list)

    for r in clean_results:
        row = {
            "name": r.get("name", r.get("opcode", "?")),
            "opcode": r.get("opcode", "?"),
            "category": r.get("category"),
            "iterations": r.get("iterations", 0),
            "ns_per_du": r.get("ns_per_du"),
            "delta_ns": r.get("delta_ns"),
            "stddev_ns": r.get("stddev_ns"),
            # Use the shared predicate so a baseline that is only
            # identified by name/opcode (no ``is_baseline`` flag) is not
            # mistaken for a measured opcode.
            "is_baseline": is_baseline_result(r),
        }
        opcode_rows.append(row)

        category = row["category"]
        if category is not None and not row["is_baseline"]:
            category_groups[category].append(row)

    def _abs_delta(row):
        # The key is always present in ``row`` but may hold None, so a
        # ``dict.get`` default would never kick in; map None to 0.0 here.
        delta = row["delta_ns"]
        return abs(delta) if delta is not None else 0.0

    sorted_by_delta = sorted(
        [r for r in opcode_rows if not r["is_baseline"]],
        key=_abs_delta,
        reverse=True,
    )

    category_summary = []
    for category, rows in sorted(category_groups.items()):
        ns_values = [r["ns_per_du"] for r in rows if r.get("ns_per_du") is not None]
        if not ns_values:
            continue
        lowest = min(ns_values)
        highest = max(ns_values)
        category_summary.append({
            "category": category,
            "count": len(rows),
            "mean_ns_per_du": sum(ns_values) / len(ns_values),
            "min_ns_per_du": lowest,
            "max_ns_per_du": highest,
            "span_ns_per_du": highest - lowest,
        })

    def _summary_key(row):
        # None cannot be ordered against ints/strings, so rows without a
        # category are pushed to the end and null opcode/name values are
        # compared through their string form.
        category = row["category"]
        return (
            category is None,
            category if category is not None else 0,
            str(row["opcode"]),
            str(row["name"]),
        )

    return {
        # NOTE(review): this only tests truthiness, so a non-zero baseline
        # taken from file metadata is also labelled "result" - confirm
        # whether that is the intended meaning.
        "baseline_source": "result" if baseline_ns_per_du else "metadata_or_missing",
        "top_delta_opcodes": sorted_by_delta[:8],
        "category_summary": category_summary,
        "opcode_summary": sorted(opcode_rows, key=_summary_key),
    }
133+
80134
def analyze(raw_data):
81135
# 1. Filter and re-calculate
82-
baseline_ns_per_du = 0
136+
baseline_ns_per_du = raw_data.get("baseline_ns_per_du", 0)
83137
clean_results = []
84138

85139
# First pass: find NOP_BASELINE to set baseline_ns_per_du
@@ -90,7 +144,7 @@ def analyze(raw_data):
90144
# update r with new stats
91145
r.update(stats)
92146

93-
if r["opcode"] == "NOP_BASELINE":
147+
if is_baseline_result(r):
94148
baseline_ns_per_du = r["ns_per_insn"]
95149

96150
clean_results.append(r)
@@ -105,7 +159,7 @@ def analyze(raw_data):
105159

106160
groups = []
107161
for r in raw_data.get("results", []):
108-
if r["opcode"] == "NOP_BASELINE":
162+
if is_baseline_result(r):
109163
continue
110164
# Exclude category 7 (VM Internal) opcodes like CHECK_INTEGRITY,
111165
# as they do not follow standard DU padding and are inherently distinguishable.
@@ -177,6 +231,7 @@ def analyze(raw_data):
177231
"indistinguishable": bool(p_value > 0.05) if p_value != -1.0 else None,
178232
"leakage_bits": float(mi_bits) if mi_bits != -1.0 else None
179233
},
234+
"diagnostics": build_diagnostics(clean_results, baseline_ns_per_du),
180235
"results": clean_results
181236
}
182237

0 commit comments

Comments
 (0)