
Commit 02e2a23

chore: add radar benchmark suite (#267)
This PR adds to cslib the benchmark suite that radar has already been using for a while. It can also be run standalone by following the instructions in `scripts/bench/README.md`. Radar will automatically switch to this suite once it is merged. There have been some small changes to bring the benchmark suite in line with the mathlib benchmark suite, but nothing that should affect the results.
1 parent 465167a commit 02e2a23

11 files changed

Lines changed: 411 additions & 0 deletions


scripts/bench/README.md

Lines changed: 25 additions & 0 deletions
# Cslib benchmark suite

This directory contains the cslib benchmark suite.
It is built around [radar](https://github.com/leanprover/radar),
and benchmark results can be viewed
on the [Lean FRO radar instance](https://radar.lean-lang.org/repos/cslib).

To execute the entire suite, run `scripts/bench/run` in the repo root.
To execute an individual benchmark, run `scripts/bench/<benchmark>/run` in the repo root.
All scripts output their measurements into the file `measurements.jsonl`.

Radar sums any duplicated measurements with matching metric names.
To post-process the `measurements.jsonl` file this way in-place,
run `scripts/bench/combine.py` in the repo root after executing the benchmark suite.

The `*.py` symlinks exist only so the Python files are a bit nicer to edit
in text editors that rely on the file extension.

## Adding a benchmark

To add a benchmark to the suite, follow these steps:

1. Create a new folder containing a `run` script (see the sketch below) and a `README.md` file describing the benchmark,
   as well as any other files required for the benchmark.
2. Edit `scripts/bench/run` to call the `run` script of your new benchmark.
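For illustration, a new benchmark's `run` script only needs to append measurement lines to `measurements.jsonl`. Here is a minimal hypothetical sketch (the folder name `demo` and the metric `demo//answer` are made up for this example):

```python
#!/usr/bin/env python3
# Hypothetical scripts/bench/demo/run: append one measurement line in the
# same JSON-lines format the other benchmarks use ("metric", "value", and
# optionally "unit").
import json
from pathlib import Path

with open(Path() / "measurements.jsonl", "a+") as f:
    f.write(json.dumps({"metric": "demo//answer", "value": 42.0}) + "\n")
```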

scripts/bench/build/README.md

Lines changed: 24 additions & 0 deletions
# The `build` benchmark

This benchmark executes a complete build of cslib and collects global and per-module metrics.

The following metrics are collected by a wrapper around the entire build process:

- `build//instructions`
- `build//maxrss`
- `build//task-clock`
- `build//wall-clock`

The following metrics are collected from `lean --profile` and summed across all modules:

- `build/profile/<name>//wall-clock`

The following metrics are collected from `lakeprof report`:

- `build/lakeprof/longest build path//wall-clock`
- `build/lakeprof/longest rebuild path//wall-clock`

The following metrics are collected individually for each module:

- `build/module/<name>//lines`
- `build/module/<name>//instructions`
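As a concrete illustration, a hypothetical module `Cslib.Foo` would contribute lines like these to `measurements.jsonl` (all values made up):

```jsonl
{"metric": "build/module/Cslib.Foo//lines", "value": 312}
{"metric": "build/module/Cslib.Foo//instructions", "value": 1234567890.0}
{"metric": "build/profile/elaboration//wall-clock", "value": 0.42, "unit": "s"}
```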
Lines changed: 94 additions & 0 deletions
#!/usr/bin/env python3
# Fake `lean` executable: scripts/bench/build/run points lake at this script
# via LAKE_OVERRIDE_LEAN/LEAN. For each module it counts source lines and
# re-invokes the real `lean` (with profiling enabled) under measure.py.

import argparse
import json
import re
import subprocess
import sys
from pathlib import Path

NAME = "build"
REPO = Path()
BENCH = REPO / "scripts" / "bench"
OUTFILE = REPO / "measurements.jsonl"


def save_result(metric: str, value: float, unit: str | None = None) -> None:
    data = {"metric": metric, "value": value}
    if unit is not None:
        data["unit"] = unit
    with open(OUTFILE, "a+") as f:
        f.write(f"{json.dumps(data)}\n")


def run(*command: str) -> None:
    result = subprocess.run(command)
    if result.returncode != 0:
        sys.exit(result.returncode)


def run_stderr(*command: str) -> str:
    result = subprocess.run(command, capture_output=True, encoding="utf-8")
    if result.returncode != 0:
        print(result.stdout, end="", file=sys.stdout)
        print(result.stderr, end="", file=sys.stderr)
        sys.exit(result.returncode)
    return result.stderr


def get_module(setup: Path) -> str:
    with open(setup) as f:
        return json.load(f)["name"]


def count_lines(module: str, path: Path) -> None:
    with open(path) as f:
        lines = sum(1 for _ in f)
    save_result(f"{NAME}/module/{module}//lines", lines)


def run_lean(module: str) -> None:
    stderr = run_stderr(
        f"{BENCH}/measure.py",
        *("-t", f"{NAME}/module/{module}"),
        *("-m", "instructions"),
        "--",
        *("lean", "--profile", "-Dprofiler.threshold=9999999"),
        *sys.argv[1:],
    )

    for line in stderr.splitlines():
        # Output of `lean --profile`
        # See timeit.cpp for the time format
        if match := re.fullmatch(r"\t(.*) ([\d.]+)(m?s)", line):
            name = match.group(1)
            seconds = float(match.group(2))
            if match.group(3) == "ms":
                seconds = seconds / 1000
            save_result(f"{NAME}/profile/{name}//wall-clock", seconds, "s")


def main() -> None:
    if sys.argv[1:] == ["--print-prefix"]:
        print(Path(__file__).resolve().parent.parent)
        return

    if sys.argv[1:] == ["--githash"]:
        run("lean", "--githash")
        return

    parser = argparse.ArgumentParser()
    parser.add_argument("lean", type=Path)
    parser.add_argument("--setup", type=Path)
    args, _ = parser.parse_known_args()

    lean: Path = args.lean
    setup: Path = args.setup

    module = get_module(setup)
    count_lines(module, lean)
    run_lean(module)


if __name__ == "__main__":
    main()
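To make the profile parsing in `run_lean` concrete, here is a small self-contained sketch that applies the same regex to a made-up `lean --profile` stderr line (the phase name and timing are invented):

```python
import re

# A made-up line in the format `lean --profile` prints on stderr:
# a tab, a phase name, and a time with an "ms" or "s" suffix.
line = "\telaboration 123.456ms"

if match := re.fullmatch(r"\t(.*) ([\d.]+)(m?s)", line):
    seconds = float(match.group(2))
    if match.group(3) == "ms":
        seconds /= 1000
    print(match.group(1), seconds)  # elaboration 0.123456
```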
Lines changed: 1 addition & 0 deletions

lean

scripts/bench/build/run

Lines changed: 17 additions & 0 deletions
#!/usr/bin/env bash
set -euxo pipefail

BENCH="scripts/bench"

# Prepare build
lake exe cache get

# Run build
LAKE_OVERRIDE_LEAN=true LEAN=$(realpath "$BENCH/build/fake-root/bin/lean") \
    "$BENCH/measure.py" -t build \
    -m instructions -m maxrss -m task-clock -m wall-clock -- \
    lakeprof record lake build --no-cache

# Analyze lakeprof data
lakeprof report -pj | jq -c '{metric: "build/lakeprof/longest build path//wall-clock", value: .[-1][2], unit: "s"}' >> measurements.jsonl
lakeprof report -rj | jq -c '{metric: "build/lakeprof/longest rebuild path//wall-clock", value: .[-1][2], unit: "s"}' >> measurements.jsonl
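Since `measure.py` (shown further down in this commit) is a generic wrapper, it can also be invoked on its own. A hypothetical sketch, assuming a Linux machine where `perf` is installed and usable (the topic `demo` and the toy command `sleep 1` are made up):

```python
# Hypothetical standalone invocation of scripts/bench/measure.py, mirroring
# how build/run drives it above.
import subprocess

subprocess.run(
    [
        "scripts/bench/measure.py",
        "-t", "demo",                        # made-up topic prefix
        "-m", "wall-clock", "-m", "maxrss",  # metrics defined in measure.py
        "--",
        "sleep", "1",                        # toy command to measure
    ],
    check=True,
)
# measurements.jsonl then gains lines such as:
# {"metric": "demo//wall-clock", "value": 1.0, "unit": "s"}
# {"metric": "demo//maxrss", "value": 2048000, "unit": "B"}
```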

scripts/bench/combine.py

Lines changed: 31 additions & 0 deletions
#!/usr/bin/env python3

import argparse
import json
from pathlib import Path

OUTFILE = Path() / "measurements.jsonl"

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=f"Combine duplicated measurements in {OUTFILE.name} the way radar does, by summing their values."
    )
    args = parser.parse_args()

    values: dict[str, float] = {}
    units: dict[str, str | None] = {}

    with open(OUTFILE, "r") as f:
        for line in f:
            data = json.loads(line)
            metric = data["metric"]
            values[metric] = values.get(metric, 0) + data["value"]
            units[metric] = data.get("unit")

    with open(OUTFILE, "w") as f:
        for metric, value in values.items():
            unit = units.get(metric)
            data = {"metric": metric, "value": value}
            if unit is not None:
                data["unit"] = unit
            f.write(f"{json.dumps(data)}\n")

scripts/bench/measure.py

Lines changed: 160 additions & 0 deletions
#!/usr/bin/env python3

import argparse
import json
import os
import resource
import subprocess
import sys
import tempfile
from dataclasses import dataclass
from pathlib import Path

OUTFILE = Path() / "measurements.jsonl"


@dataclass
class PerfMetric:
    event: str
    factor: float = 1
    unit: str | None = None


@dataclass
class RusageMetric:
    name: str
    factor: float = 1
    unit: str | None = None


PERF_METRICS = {
    "task-clock": PerfMetric("task-clock", factor=1e-9, unit="s"),
    "wall-clock": PerfMetric("duration_time", factor=1e-9, unit="s"),
    "instructions": PerfMetric("instructions"),
}

PERF_UNITS = {
    "msec": 1e-3,
    "ns": 1e-9,
}

RUSAGE_METRICS = {
    "maxrss": RusageMetric("ru_maxrss", factor=1000, unit="B"),  # KiB on linux
}

ALL_METRICS = {**PERF_METRICS, **RUSAGE_METRICS}


def measure_perf(cmd: list[str], events: list[str]) -> dict[str, tuple[float, str]]:
    with tempfile.NamedTemporaryFile() as tmp:
        cmd = [
            *["perf", "stat", "-j", "-o", tmp.name],
            *[arg for event in events for arg in ["-e", event]],
            *["--", *cmd],
        ]

        # Execute command
        env = os.environ.copy()
        env["LC_ALL"] = "C"  # or else perf may output syntactically invalid json
        result = subprocess.run(cmd, env=env)
        if result.returncode != 0:
            sys.exit(result.returncode)

        # Collect results
        perf = {}
        for line in tmp:
            data = json.loads(line)
            if "event" in data and "counter-value" in data:
                perf[data["event"]] = float(data["counter-value"]), data["unit"]

        return perf


@dataclass
class Result:
    category: str
    value: float
    unit: str | None

    def fmt(self, topic: str) -> str:
        metric = f"{topic}//{self.category}"
        if self.unit is None:
            return json.dumps({"metric": metric, "value": self.value})
        return json.dumps({"metric": metric, "value": self.value, "unit": self.unit})


def measure(cmd: list[str], metrics: list[str]) -> list[Result]:
    # Check args
    unknown_metrics = []
    for metric in metrics:
        if metric not in RUSAGE_METRICS and metric not in PERF_METRICS:
            unknown_metrics.append(metric)
    if unknown_metrics:
        raise Exception(f"unknown metrics: {', '.join(unknown_metrics)}")

    # Prepare perf events
    events: list[str] = []
    for metric in metrics:
        if info := PERF_METRICS.get(metric):
            events.append(info.event)

    # Measure
    perf = measure_perf(cmd, events)
    rusage = resource.getrusage(resource.RUSAGE_CHILDREN)

    # Extract results
    results = []
    for metric in metrics:
        if info := PERF_METRICS.get(metric):
            if info.event in perf:
                value, unit = perf[info.event]
            else:
                # Without the corresponding permissions,
                # we only get access to the userspace versions of the counters.
                value, unit = perf[f"{info.event}:u"]

            value *= PERF_UNITS.get(unit, info.factor)
            results.append(Result(metric, value, info.unit))

        if info := RUSAGE_METRICS.get(metric):
            value = getattr(rusage, info.name) * info.factor
            results.append(Result(metric, value, info.unit))

    return results


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=f"Measure resource usage of a command using perf and rusage. The results are appended to {OUTFILE.name}.",
    )
    parser.add_argument(
        "-t",
        "--topic",
        action="append",
        default=[],
        help="topic prefix for the metrics",
    )
    parser.add_argument(
        "-m",
        "--metric",
        action="append",
        default=[],
        help=f"metrics to measure. Can be specified multiple times. Available metrics: {', '.join(sorted(ALL_METRICS))}",
    )
    parser.add_argument(
        "cmd",
        nargs="*",
        help="command to measure the resource usage of",
    )
    args = parser.parse_args()

    topics: list[str] = args.topic
    metrics: list[str] = args.metric
    cmd: list[str] = args.cmd

    results = measure(cmd, metrics)

    with open(OUTFILE, "a+") as f:
        for result in results:
            for topic in topics:
                f.write(f"{result.fmt(topic)}\n")

scripts/bench/run

Lines changed: 10 additions & 0 deletions
#!/usr/bin/env bash
set -euo pipefail

BENCH="scripts/bench"

echo "Running benchmark: build"
"$BENCH/build/run"

echo "Running benchmark: size"
"$BENCH/size/run"

scripts/bench/size/README.md

Lines changed: 8 additions & 0 deletions
# The `size` benchmark

This benchmark measures a few deterministic values:

- `size/.lean//files`
- `size/.lean//lines`
- `size/.olean//files`
- `size/.olean//bytes`
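The `size/run` script itself is not included in the excerpt above. Below is a minimal hypothetical sketch of how such deterministic values could be counted, assuming `.lean` sources live in the repo root and `.olean` artifacts under `.lake` (both assumptions for this example):

```python
# Hypothetical sketch of the size metrics (not the actual size/run script).
from pathlib import Path

lean_files = sorted(Path(".").rglob("*.lean"))
olean_files = sorted(Path(".lake").rglob("*.olean"))  # assumed Lake build dir

lean_lines = 0
for path in lean_files:
    with open(path) as f:
        lean_lines += sum(1 for _ in f)

print(len(lean_files), lean_lines)  # size/.lean//files, size/.lean//lines
print(len(olean_files), sum(p.stat().st_size for p in olean_files))
# size/.olean//files, size/.olean//bytes
```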
