Skip to content

Commit aaf2e0e

Browse files
ci: add RAG P@1 regression gate (overall >=73%)
- benchmark_all_fixtures.py: --overall-gate flag exits 1 when overall P@1 drops below threshold (default disabled, not a breaking change).
- .github/workflows/rag-benchmark.yml: runs on changes to summaries, fixtures, or the benchmark script. Gate is 73% (current baseline 73.1%). Skips unrelated commits to keep CI fast.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 9db5fee commit aaf2e0e

2 files changed

Lines changed: 64 additions & 1 deletion

File tree

Lines changed: 49 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,49 @@
1+
name: RAG Benchmark
2+
3+
on:
4+
push:
5+
branches: [main]
6+
paths:
7+
- "src/attune_help/templates/summaries_by_path.json"
8+
- "src/attune_help/templates/fixtures/**"
9+
- "scripts/benchmark_all_fixtures.py"
10+
pull_request:
11+
paths:
12+
- "src/attune_help/templates/summaries_by_path.json"
13+
- "src/attune_help/templates/fixtures/**"
14+
- "scripts/benchmark_all_fixtures.py"
15+
workflow_dispatch:
16+
inputs:
17+
overall_gate:
18+
description: "Overall P@1 gate (0.0–1.0)"
19+
default: "0.73"
20+
21+
permissions:
22+
contents: read
23+
24+
jobs:
25+
benchmark:
26+
name: RAG P@1 gate
27+
runs-on: ubuntu-latest
28+
timeout-minutes: 10
29+
steps:
30+
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
31+
32+
- name: Set up Python
33+
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
34+
with:
35+
python-version: "3.11"
36+
cache: pip
37+
38+
- name: Install attune-help + rag extra
39+
run: |
40+
python -m pip install --upgrade pip
41+
python -m pip install -e ".[dev]"
42+
python -m pip install "attune-rag>=0.1.0,<0.2"
43+
44+
- name: Run RAG benchmark
45+
run: |
46+
GATE=${{ github.event.inputs.overall_gate || '0.73' }}
47+
python scripts/benchmark_all_fixtures.py \
48+
--gate 0.0 \
49+
--overall-gate "$GATE"

scripts/benchmark_all_fixtures.py

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,10 @@ def _load_fixtures() -> list[dict]:
4141

4242
def main(argv: list[str] | None = None) -> int:
4343
parser = argparse.ArgumentParser()
44-
parser.add_argument("--gate", type=float, default=0.60)
44+
parser.add_argument("--gate", type=float, default=0.60,
45+
help="Per-feature P@1 gate (default: 0.60)")
46+
parser.add_argument("--overall-gate", type=float, default=0.0,
47+
help="Overall P@1 gate — exit 1 if overall drops below (default: disabled)")
4548
parser.add_argument(
4649
"--summaries",
4750
default="summaries_by_path.json",
@@ -124,6 +127,17 @@ def main(argv: list[str] | None = None) -> int:
124127

125128
print(f"\nFeatures below {args.gate:.0%} P@1 gate: " f"{below_gate}/{len(rows)}")
126129

130+
overall_p1 = total_top1 / total_queries if total_queries else 0.0
131+
if args.overall_gate > 0:
132+
if overall_p1 < args.overall_gate:
133+
print(
134+
f"\n✖ FAIL — overall P@1 {overall_p1:.1%} is below "
135+
f"--overall-gate {args.overall_gate:.0%}",
136+
file=sys.stderr,
137+
)
138+
return 1
139+
print(f"\n✔ PASS — overall P@1 {overall_p1:.1%} >= {args.overall_gate:.0%} gate")
140+
127141
return 1 if below_gate > 0 else 0
128142

129143

0 commit comments

Comments
 (0)