"""Demo: LLM optimization loop using pytest-codspeed.
This script shows how an automated agent can measure whether a code change
is both faster *and* correct using two consecutive CodSpeed walltime runs.

Usage
-----
Run from the repo root after installing pytest-codspeed in editable mode::

    python examples/optimize_loop.py

The script creates a temporary directory with a small benchmark, runs it
twice (simulating a baseline and an optimized version), reads the JSON eval
report, and prints a go/no-go verdict -- exactly what an LLM agent would do.

Workflow
--------
1. Run benchmarks on the original code --> baseline results_{ts1}.json
2. Apply the "optimization" (patch the benchmark file in place)
3. Run benchmarks again with --codspeed-eval-report --> results_{ts2}.json
   --> eval.json
4. Read eval.json and make a binary accept/reject decision
"""
from __future__ import annotations

import json
import subprocess
import sys
import tempfile
import textwrap
from pathlib import Path

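# For reference, the eval report consumed in step 4 is expected to expose the
# fields _evaluate() reads below (aggregate_score, is_acceptable, and a
# benchmarks list with name/score/output_changed). The shape sketched here is
# illustrative only; the exact schema and values come from pytest-codspeed:
#
#   {
#       "aggregate_score": 1.02,
#       "is_acceptable": true,
#       "benchmarks": [
#           {"name": "test_sort", "score": 1.02, "output_changed": false}
#       ]
#   }
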
# ---------------------------------------------------------------------------
# Benchmark source code (three variants)
# ---------------------------------------------------------------------------
_ORIGINAL = textwrap.dedent("""\
    def test_sort(benchmark):
        result = benchmark(sorted, [3, 1, 2])
        assert result == [1, 2, 3]
    """)

# The "optimized" variant. For this demo it is identical to the baseline, so
# it stands in for a patch that keeps the correct output and does not regress
# performance.
_OPTIMIZED = textwrap.dedent("""\
    def test_sort(benchmark):
        result = benchmark(sorted, [3, 1, 2])
        assert result == [1, 2, 3]
    """)

# An "optimization" that accidentally breaks correctness.
_BROKEN = textwrap.dedent("""\
    def test_sort(benchmark):
        # BUG: returns the input unchanged instead of sorting it
        result = benchmark(lambda lst: list(lst), [3, 1, 2])
        assert result == [3, 1, 2]
    """)

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
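# Minimal walltime configuration so the demo finishes quickly: no warmup and
# at most two measurement rounds. --codspeed-capture-output captures benchmark
# outputs, which the eval report needs in order to populate the output_changed
# field checked in _evaluate() below.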
_PYTEST_FLAGS = [
    "--codspeed",
    "--codspeed-mode=walltime",
    "--codspeed-warmup-time=0",
    "--codspeed-max-rounds=2",
    "--codspeed-capture-output",
    "-q",
]

def _run_pytest(bench_dir: Path, extra_flags: list[str] | None = None) -> None:
    cmd = [sys.executable, "-m", "pytest", *_PYTEST_FLAGS, *(extra_flags or [])]
    result = subprocess.run(cmd, cwd=bench_dir, capture_output=False)
    # Exit code 1 (test failures) is tolerated: a broken patch should still be
    # evaluated and rejected rather than abort the loop.
    if result.returncode not in (0, 1):
        raise RuntimeError(f"pytest exited with code {result.returncode}")

def _evaluate(bench_dir: Path, variant_src: str, label: str) -> None:
    print(f"\n{'=' * 60}")
    print(f" Evaluating: {label}")
    print("=" * 60)

    bench_file = bench_dir / "test_bench.py"
    bench_file.write_text(variant_src)

    eval_path = bench_dir / "eval.json"
    eval_path.unlink(missing_ok=True)
    _run_pytest(bench_dir, [f"--codspeed-eval-report={eval_path}"])

    if not eval_path.exists():
        print(" [skip] No eval report -- this was the baseline run.")
        return

    data = json.loads(eval_path.read_text())
    print(f"\n aggregate_score : {data['aggregate_score']}")
    print(f" is_acceptable : {data['is_acceptable']}")

    for bench in data["benchmarks"]:
        correctness = (
            "ok" if bench["output_changed"] is False
            else "BROKEN" if bench["output_changed"] is True
            else "unknown"
        )
        print(
            f" {bench['name']:<40}"
            f" score={bench['score']} correctness={correctness}"
        )

    verdict = "ACCEPT" if data["is_acceptable"] else "REJECT"
    print(f"\n --> {verdict}")

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
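# Expected end state of a full run (illustrative -- exact scores and the
# acceptance threshold come from pytest-codspeed on the machine at hand):
#   run 2 (correct optimization): output unchanged, is_acceptable True --> ACCEPT
#   run 3 (broken optimization):  output_changed on test_sort          --> REJECT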
def main() -> None:
    with tempfile.TemporaryDirectory(prefix="codspeed_demo_") as tmp:
        bench_dir = Path(tmp)

        # Conftest so pytester-style flags are not needed from subprocess.
        (bench_dir / "conftest.py").write_text("# CodSpeed demo\n")

        bench_dir_str = str(bench_dir)
        print(f"Working directory: {bench_dir_str}")

        # Run 1: baseline (no prior results file -- no eval report written).
        _evaluate(bench_dir, _ORIGINAL, "original (baseline)")

        # Run 2: correct optimization -- should produce a go verdict.
        _evaluate(bench_dir, _OPTIMIZED, "correct optimization")

        # Run 3: broken patch -- should produce a no-go verdict.
        _evaluate(bench_dir, _BROKEN, "broken optimization (output changed)")

if __name__ == "__main__":
    main()