|
30 | 30 | pytest tests/e2e/evaluation/test_lseval_periodic.py -m lseval |
31 | 31 | """ |
32 | 32 |
|
| 33 | +import csv |
33 | 34 | import json |
34 | 35 | import os |
35 | 36 | import shutil |
@@ -185,7 +186,30 @@ def _run_lseval(eval_data: Path, out_dir: Path, system_config: Path) -> None: |
185 | 186 | assert json_files, f"No summary JSON artefacts found in {out_dir}" |
186 | 187 |
|
187 | 188 | with open(json_files[0], encoding="utf-8") as fh: |
188 | | - overall = json.load(fh)["summary_stats"]["overall"] |
| 189 | + summary_json = json.load(fh) |
| 190 | + overall = summary_json["summary_stats"]["overall"] |
| 191 | + |
| 192 | + if overall["error_rate"] > MAX_EVAL_ERROR_RATE_PCT: |
| 193 | + judge_tokens = overall.get("total_judge_llm_tokens", -1) |
| 194 | + judge_detail = ( |
| 195 | + "0 → OLS calls failed before judge was reached" |
| 196 | + if judge_tokens == 0 |
| 197 | + else "judge was called" |
| 198 | + ) |
| 199 | + print( |
| 200 | + f"\n--- ERROR DIAGNOSTICS ---\n" |
| 201 | + f"Judge LLM tokens used: {judge_tokens} ({judge_detail})\n" |
| 202 | + ) |
| 203 | + with open(csv_files[0], encoding="utf-8") as fh: |
| 204 | + reader = csv.DictReader(fh) |
| 205 | + error_rows = [r for r in reader if r.get("result") == "ERROR"] |
| 206 | + if error_rows: |
| 207 | + print("First 3 error reasons from detailed CSV:") |
| 208 | + for row in error_rows[:3]: |
| 209 | + print( |
| 210 | + f" turn={row.get('turn_id','?')} reason={row.get('reason','?')[:200]}" |
| 211 | + ) |
| 212 | + |
189 | 213 | assert overall["error_rate"] <= MAX_EVAL_ERROR_RATE_PCT, ( |
190 | 214 | f"{overall['ERROR']}/{overall['TOTAL']} evaluations errored " |
191 | 215 | f"(error_rate={overall['error_rate']:.1f}% > threshold {MAX_EVAL_ERROR_RATE_PCT}%)." |
|
0 commit comments