Skip to content

Commit 9de4e16

Browse files
committed
ci: add on-demand benchmark workflow with 16-core paid runners
1 parent 384cd77 commit 9de4e16

3 files changed

Lines changed: 145 additions & 0 deletions

File tree

.github/workflows/benchmark.yml

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
# On-demand benchmark suite. Triggered manually (workflow_dispatch) and run on
# a 16-core hosted runner. Raw logs and JSON reports are written to results/,
# summarized into the job summary, and uploaded as an artifact.
name: Benchmark Suite

'on':
  workflow_dispatch:
    inputs:
      instances:
        description: 'Number of instances per benchmark'
        required: true
        default: '25'
        type: choice
        options: ['10', '25', '50', '100', '200', '500']
      scoring:
        description: 'Scoring modes to run'
        required: true
        default: 'both'
        type: choice
        options: ['ppr', 'ego', 'both']
      budget:
        description: 'Token budget'
        required: true
        default: '8000'

# The job only reads the repository; keep the GITHUB_TOKEN read-only.
permissions:
  contents: read

jobs:
  benchmark:
    name: Run benchmarks (${{ inputs.instances }} instances, ${{ inputs.scoring }})
    runs-on: ubuntu-latest-16-cores
    timeout-minutes: 360

    # Naming the shell explicitly makes GitHub invoke bash with `-eo pipefail`.
    # Without it, the `... | tee file` pipelines below report tee's exit
    # status and a crashing benchmark would leave the step green.
    defaults:
      run:
        shell: bash

    # Inputs reach the scripts through environment variables instead of being
    # expanded by `${{ }}` inside `run:`; `budget` is free text, so inline
    # expansion would let its value be interpreted as shell code.
    env:
      BENCH_INSTANCES: ${{ inputs.instances }}
      BENCH_BUDGET: ${{ inputs.budget }}

    steps:
      - uses: actions/checkout@v6

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.12'

      - name: Cache pip
        uses: actions/cache@v5
        with:
          path: ~/.cache/pip
          key: bench-pip-${{ hashFiles('pyproject.toml') }}

      # The key is static, so once an entry exists it is restored as-is and
      # never re-saved; bump the `v1` suffix to force a fresh cache.
      - name: Cache cloned repos
        uses: actions/cache@v5
        with:
          path: /tmp/contextbench_repos
          key: bench-repos-v1
          restore-keys: bench-repos-

      - name: Install
        run: pip install -e ".[dev,tree-sitter]" datasets

      - name: Run ContextBench (PPR)
        if: inputs.scoring == 'ppr' || inputs.scoring == 'both'
        env:
          DIFFCTX_SCORING: precise
        run: |
          python benchmarks/forensic_contextbench.py \
            --limit "$BENCH_INSTANCES" \
            --budget "$BENCH_BUDGET" \
            2>&1 | tee results/cb_ppr.txt

      - name: Run ContextBench (EgoGraph)
        if: inputs.scoring == 'ego' || inputs.scoring == 'both'
        env:
          DIFFCTX_SCORING: discover
        run: |
          python benchmarks/forensic_contextbench.py \
            --limit "$BENCH_INSTANCES" \
            --budget "$BENCH_BUDGET" \
            2>&1 | tee results/cb_ego.txt

      # NOTE(review): unlike "Run ContextBench (PPR)", this step does not set
      # DIFFCTX_SCORING; presumably the script's default is the PPR mode.
      # Confirm, or set `DIFFCTX_SCORING: precise` here as well.
      - name: Run LOO (PPR)
        if: inputs.scoring == 'ppr' || inputs.scoring == 'both'
        run: |
          python benchmarks/loo_swebench.py \
            --limit "$BENCH_INSTANCES" \
            --budget "$BENCH_BUDGET" \
            --output results/loo_ppr.json \
            2>&1 | tee results/loo_ppr.txt

      - name: Run LOO (EgoGraph)
        if: inputs.scoring == 'ego' || inputs.scoring == 'both'
        env:
          DIFFCTX_SCORING: discover
        run: |
          python benchmarks/loo_swebench.py \
            --limit "$BENCH_INSTANCES" \
            --budget "$BENCH_BUDGET" \
            --output results/loo_ego.json \
            2>&1 | tee results/loo_ego.txt

      # always(): publish whatever was produced, even after a failed benchmark.
      - name: Generate summary
        if: always()
        run: python benchmarks/summarize_results.py results/ >> "$GITHUB_STEP_SUMMARY"

      - name: Upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results-${{ inputs.instances }}-${{ inputs.scoring }}
          path: results/
          retention-days: 90

benchmarks/summarize_results.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#!/usr/bin/env python3
2+
from __future__ import annotations
3+
4+
import json
5+
import sys
6+
from pathlib import Path
7+
8+
9+
def main() -> None:
    """Print a Markdown summary of the benchmark outputs in a results directory.

    The directory is taken from the first command-line argument and defaults
    to ``results``. Three kinds of files are summarized, each in sorted order:

    * ``cb_*.txt``   -- lines starting with ``Avg `` or ``Total:``
    * ``loo_*.txt``  -- lines starting with ``Total LOO`` or ``Found``
    * ``loo_*.json`` -- recall, counted from the records' ``found`` field

    Unreadable or malformed JSON reports are reported as unavailable instead
    of raising, so one bad file cannot suppress the rest of the summary.
    """
    results_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("results")

    print("## Benchmark Results\n")

    _print_log_sections(results_dir, "cb_", "ContextBench", ("Avg ", "Total:"))
    _print_log_sections(results_dir, "loo_", "LOO", ("Total LOO", "Found"))

    for report in sorted(results_dir.glob("loo_*.json")):
        print(_format_loo_recall(report))


def _print_log_sections(
    results_dir: Path, prefix: str, title: str, markers: tuple
) -> None:
    """Print one fenced section per ``<prefix>*.txt`` log, keeping only lines that start with a marker."""
    for log in sorted(results_dir.glob(f"{prefix}*.txt")):
        # Slice off only the leading prefix; str.replace would also remove
        # any later occurrence of it inside the mode name.
        mode = log.stem[len(prefix):]
        print(f"### {title} ({mode})\n```")
        # Logs are captured from stdout/stderr via tee; decode leniently so a
        # stray byte cannot abort the summary.
        text = log.read_text(encoding="utf-8", errors="replace")
        for line in text.splitlines():
            if line.startswith(markers):
                print(line)
        print("```\n")


def _format_loo_recall(report: Path) -> str:
    """Return the Markdown recall line (with trailing newline) for one ``loo_*.json`` report."""
    mode = report.stem[len("loo_"):]
    unavailable = f"**LOO {mode} recall: unavailable (could not read {report.name})**\n"

    try:
        data = json.loads(report.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError (e.g. a report truncated
        # by a killed run) and UnicodeDecodeError.
        return unavailable
    if not isinstance(data, list):
        return unavailable

    # Records that are not mappings or lack "found" count as misses.
    found = sum(1 for rec in data if isinstance(rec, dict) and rec.get("found"))
    total = len(data)
    pct = 100 * found / total if total else 0
    return f"**LOO {mode} recall: {found}/{total} ({pct:.1f}%)**\n"


if __name__ == "__main__":
    main()

results/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Ignore JSON and text files in this directory so they are not committed.
*.json
2+
*.txt

0 commit comments

Comments
 (0)