fix(benchmark): improve measurement integrity and CI diagnostics

scc-tw · scc-tw · commit 96cc7ba2d3ce · 2026-04-05T17:00:16.000+08:00
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -145,16 +145,39 @@ jobs:
                   print(f'  ANOVA p-value: {sec.get(\"anova_p_value\")}')
                   print(f'  ANOVA F-stat:  {sec.get(\"anova_f_stat\")}')
                   print(f'  Leakage bits:  {sec.get(\"leakage_bits\")}\n')
+
+                  diag = r.get('diagnostics', {})
+                  print('  Top opcode deltas:')
+                  for row in diag.get('top_delta_opcodes', [])[:5]:
+                      print(
+                          f'    {row.get(\"name\", row.get(\"opcode\", \"?\"))}: '
+                          f'ns/DU={row.get(\"ns_per_du\")!r} '
+                          f'delta={row.get(\"delta_ns\"):+.17g} '
+                          f'stddev={row.get(\"stddev_ns\")!r} '
+                          f'n={row.get(\"iterations\")}'
+                      )
+                  print('  Category spans:')
+                  for row in diag.get('category_summary', []):
+                      print(
+                          f'    cat{row.get(\"category\")}: '
+                          f'mean={row.get(\"mean_ns_per_du\")!r} '
+                          f'span={row.get(\"span_ns_per_du\")!r} '
+                          f'min={row.get(\"min_ns_per_du\")!r} '
+                          f'max={row.get(\"max_ns_per_du\")!r}'
+                      )
+                  print()
           except Exception as e:
               print(f'Error reading metrics: {e}')
           "
 
-      # ── Upload clean result ──
-      - name: Upload clean JSON
+      # ── Upload raw + clean result ──
+      - name: Upload benchmark JSON
         uses: actions/upload-artifact@v4
         with:
           name: bench-${{ matrix.platform }}
-          path: bench_clean.json
+          path: |
+            bench_raw.json
+            bench_clean.json
 
       # ── Store in gh-pages (push to main/dev only — becomes baseline) ──
       - name: Convert for chart
diff --git a/runtime/bench/bench_output.hpp b/runtime/bench/bench_output.hpp
@@ -28,13 +28,15 @@ inline void emit_json(const char* commit,
     std::printf("    \"policy\": \"%s\",\n", policy);
     std::printf("    \"oram\": \"%s\"\n", oram);
     std::printf("  },\n");
-    std::printf("  \"baseline_ns_per_du\": %.2f,\n", baseline_ns_per_insn);
+    std::printf("  \"baseline_ns_per_du\": %.17g,\n", baseline_ns_per_insn);
     std::printf("  \"results\": [\n");
 
     for (size_t i = 0; i < results.size(); ++i) {
         const auto& r = results[i];
         std::printf("    {\n");
+        std::printf("      \"name\": \"%s\",\n", r.name);
         std::printf("      \"opcode\": \"%s\",\n", to_string(r.opcode));
+        std::printf("      \"is_baseline\": %s,\n", r.is_baseline ? "true" : "false");
         std::printf("      \"category\": %u,\n", r.category);
         std::printf("      \"du_count\": %u,\n", r.insn_count);
         std::printf("      \"iterations\": %u,\n", r.iterations);
@@ -44,11 +46,11 @@ inline void emit_json(const char* commit,
                     static_cast<unsigned long long>(r.median_ns));
         std::printf("      \"p95_ns\": %llu,\n",
                     static_cast<unsigned long long>(r.p95_ns));
-        std::printf("      \"mean_ns\": %.2f,\n", r.mean_ns);
-        std::printf("      \"stddev_ns\": %.2f,\n", r.stddev_ns);
-        std::printf("      \"ns_per_du\": %.2f,\n", r.ns_per_insn);
-        std::printf("      \"delta_ns\": %.2f,\n", r.handler_ns);
-        std::printf("      \"du_per_sec\": %.0f,\n", r.ips);
+        std::printf("      \"mean_ns\": %.17g,\n", r.mean_ns);
+        std::printf("      \"stddev_ns\": %.17g,\n", r.stddev_ns);
+        std::printf("      \"ns_per_du\": %.17g,\n", r.ns_per_insn);
+        std::printf("      \"delta_ns\": %.17g,\n", r.handler_ns);
+        std::printf("      \"du_per_sec\": %.17g,\n", r.ips);
         std::printf("      \"samples\": [");
         for (size_t j = 0; j < r.samples.size(); ++j) {
             std::printf("%llu%s", static_cast<unsigned long long>(r.samples[j]),
diff --git a/runtime/bench/bench_stats.hpp b/runtime/bench/bench_stats.hpp
@@ -16,6 +16,7 @@ struct BenchResult {
     const char* name       = "";
     VmOpcode    opcode     = VmOpcode::NOP;
     uint8_t     category   = 0;
+    bool        is_baseline = false;
     uint32_t    insn_count = 0;
     uint32_t    iterations = 0;
     uint64_t    min_ns     = 0;
@@ -36,6 +37,7 @@ inline BenchResult compute_stats(const char* name, VmOpcode opcode,
     r.name       = name;
     r.opcode     = opcode;
     r.category   = Common::VM::vm_opcode_category(opcode);
+    r.is_baseline = (name != nullptr && std::string(name) == "NOP_BASELINE");
     r.insn_count = insn_count;
     r.iterations = static_cast<uint32_t>(samples.size());
 
diff --git a/runtime/bench/program_factory.cpp b/runtime/bench/program_factory.cpp
@@ -78,6 +78,15 @@ static TestBB make_nop_bb(uint32_t bb_id, uint8_t epoch_base, uint32_t N) {
     return bb;
 }
 
+/// Build the tail BB: a HALT up front, followed by NOP filler up to N insns.
+static TestBB make_halt_bb(uint32_t bb_id, uint8_t epoch_base, uint32_t N) {
+    auto tail = make_bb(bb_id, epoch_base);
+    tail.instructions.push_back({VmOpcode::HALT, f_none(), 0, 0, 0});
+    for (uint32_t pad = N; pad > 1; --pad)
+        tail.instructions.push_back({VmOpcode::NOP, f_none(), 0, 0, 0});
+    return tail;
+}
+
 /// Build setup BBs of exactly N insns each.
 ///
 /// For N≥2: one BB with setup_insns + NOP padding + JMP (fits in N insns).
@@ -267,6 +276,8 @@ DUBenchProgram build_du_program(const OpcodeBenchSpec& spec,
         }
     }
 
+    const uint32_t halt_bb_id = next_bb_id + K;
+
     // ── Build K measured BBs ────────────────────────────────────────
     for (uint32_t i = 0; i < K; ++i) {
         uint32_t bb_id = next_bb_id++;
@@ -275,7 +286,7 @@ DUBenchProgram build_du_program(const OpcodeBenchSpec& spec,
         if (is_branch) {
             // Branch at end of DU, target = next BB
             TestInstruction br = real_insn;
-            br.aux = (i + 1 < K) ? (bb_id + 1) : bb_id;  // last BB: self (will HALT)
+            br.aux = (i + 1 < K) ? (bb_id + 1) : halt_bb_id;
             bbs.push_back(make_measured_bb(bb_id, epoch_base, br, N, true));
         } else if (spec.setup == Setup::Pool) {
             TestInstruction lc = real_insn;
@@ -290,11 +301,8 @@ DUBenchProgram build_du_program(const OpcodeBenchSpec& spec,
         }
     }
 
-    // Replace last measured BB's last instruction with HALT
-    if (!bbs.empty()) {
-        auto& last_bb = bbs.back();
-        last_bb.instructions.back() = {VmOpcode::HALT, f_none(), 0, 0, 0};
-    }
+    // Tail halt BB is never part of the measured K dispatch units.
+    bbs.push_back(make_halt_bb(halt_bb_id, 0xC0, N));
 
     // ── Build native call transition entries ─────────────────────────
     if (spec.setup == Setup::NativeCall) {
@@ -324,9 +332,8 @@ DUBenchProgram build_du_baseline(uint32_t K, uint32_t N) {
         bbs.push_back(make_nop_bb(bb_id, epoch_base, N));
     }
 
-    // Last BB: replace last NOP with HALT
-    if (!bbs.empty())
-        bbs.back().instructions.back() = {VmOpcode::HALT, f_none(), 0, 0, 0};
+    // Tail halt BB is never part of the measured K dispatch units.
+    bbs.push_back(make_halt_bb(K + 1, 0xC0, N));
 
     prog.blob = Test::build_test_blob(prog.seed, bbs, {}, false, {});
     return prog;
diff --git a/runtime/bench/runner.hpp b/runtime/bench/runner.hpp
@@ -62,11 +62,26 @@ std::vector<BenchResult> run_all(const RunConfig& cfg) {
             if (!sr || *sr == VmResult::Halted) return 0;
         }
 
-        // Timed: execute remaining (K measured DUs)
+        // Timed: execute exactly K measured dispatch units.
+        //
+        // This avoids pulling the tail HALT BB into the timing window and
+        // ensures branch benchmarks measure K real branch DUs, not K-1 plus
+        // one HALT-contaminated DU.
         auto t0 = Clock::now();
-        auto r  = engine->execute();
+        bool ok = true;
+        for (uint32_t i = 0; i < K; ++i) {
+            auto r = engine->dispatch_unit();
+            if (!r) {
+                ok = false;
+                break;
+            }
+            if (*r == VmResult::Halted) {
+                ok = false;
+                break;
+            }
+        }
         auto t1 = Clock::now();
-        return r ? Clock::elapsed_ns(t0, t1) : 0;
+        return ok ? Clock::elapsed_ns(t0, t1) : 0;
     };
 
     // Pre-build all programs
@@ -129,7 +144,7 @@ std::vector<BenchResult> run_all(const RunConfig& cfg) {
         result.handler_ns = result.ns_per_insn - baseline;
         results.push_back(result);
         
-        std::fprintf(stderr, "  bench %-20s  %8.1f ns/DU  (Δ: %+.1f ns)\n",
+        std::fprintf(stderr, "  bench %-20s  %.17g ns/DU  (Δ: %+.17g ns)\n",
                      names[idx], result.ns_per_insn, result.handler_ns);
     }
 
diff --git a/scripts/bench_analyzer.py b/scripts/bench_analyzer.py
@@ -17,6 +17,7 @@
 import sys
 import math
 import argparse
+from collections import defaultdict
 
 def mad_filter(samples, z_thresh=3.5):
     """
@@ -77,9 +78,62 @@ def calculate_stats(samples, insn_count):
         "ips": ips
     }
 
+def is_baseline_result(result):  # baseline row: is_baseline flag, or name/opcode == "NOP_BASELINE"
+    return bool(result.get("is_baseline")) or "NOP_BASELINE" in (result.get("name"), result.get("opcode"))
+
+def build_diagnostics(clean_results, baseline_ns_per_du):
+    """Summarize per-opcode and per-category timings for CI diagnostics.
+
+    Returns a dict: baseline_source ("result" only if a baseline row is present),
+    top_delta_opcodes (<= 8, largest |delta_ns| first), category_summary, opcode_summary.
+    """
+    opcode_rows = []
+    category_groups = defaultdict(list)
+
+    for r in clean_results:
+        # NOTE(review): timing fields are copied from r as-is; confirm the caller refreshed them.
+        row = {
+            "name": r.get("name", r.get("opcode", "?")),
+            "opcode": r.get("opcode", "?"),
+            "category": r.get("category"),
+            "iterations": r.get("iterations", 0),
+            "ns_per_du": r.get("ns_per_du"),
+            "delta_ns": r.get("delta_ns"),
+            "stddev_ns": r.get("stddev_ns"),
+            # Same predicate analyze() uses, so rows without the flag still match.
+            "is_baseline": is_baseline_result(r),
+        }
+        opcode_rows.append(row)
+        if row["category"] is not None and not row["is_baseline"]:
+            category_groups[row["category"]].append(row)
+
+    measured = [r for r in opcode_rows if not r["is_baseline"]]
+    # Rows always carry the "delta_ns" key, so a .get() default would never apply.
+    sorted_by_delta = sorted(measured, key=lambda r: abs(r["delta_ns"] or 0.0), reverse=True)
+
+    category_summary = []
+    for category, rows in sorted(category_groups.items()):
+        ns_values = [r["ns_per_du"] for r in rows if r["ns_per_du"] is not None]
+        if not ns_values:
+            continue
+        lo, hi = min(ns_values), max(ns_values)
+        category_summary.append({
+            "category": category, "count": len(rows),
+            "mean_ns_per_du": sum(ns_values) / len(ns_values),
+            "min_ns_per_du": lo, "max_ns_per_du": hi, "span_ns_per_du": hi - lo,
+        })
+
+    # None categories sort last instead of raising on a None-vs-int comparison.
+    return {
+        "baseline_source": "result" if baseline_ns_per_du and len(measured) != len(opcode_rows) else "metadata_or_missing",
+        "top_delta_opcodes": sorted_by_delta[:8],
+        "category_summary": category_summary,
+        "opcode_summary": sorted(opcode_rows, key=lambda r: (r["category"] is None, r["category"] or 0, r["opcode"], r["name"])),
+    }
+
 def analyze(raw_data):
     # 1. Filter and re-calculate
-    baseline_ns_per_du = 0
+    baseline_ns_per_du = raw_data.get("baseline_ns_per_du", 0)
     clean_results = []
     
     # First pass: find NOP_BASELINE to set baseline_ns_per_du
@@ -90,7 +144,7 @@ def analyze(raw_data):
         # update r with new stats
         r.update(stats)
         
-        if r["opcode"] == "NOP_BASELINE":
+        if is_baseline_result(r):
             baseline_ns_per_du = r["ns_per_insn"]
             
         clean_results.append(r)
@@ -105,7 +159,7 @@ def analyze(raw_data):
     
     groups = []
     for r in raw_data.get("results", []):
-        if r["opcode"] == "NOP_BASELINE":
+        if is_baseline_result(r):
             continue
         # Exclude category 7 (VM Internal) opcodes like CHECK_INTEGRITY,
         # as they do not follow standard DU padding and are inherently distinguishable.
@@ -177,6 +231,7 @@ def analyze(raw_data):
             "indistinguishable": bool(p_value > 0.05) if p_value != -1.0 else None,
             "leakage_bits": float(mi_bits) if mi_bits != -1.0 else None
         },
+        "diagnostics": build_diagnostics(clean_results, baseline_ns_per_du),
         "results": clean_results
     }