Skip to content

Commit 18a14bc

Browse files
Merge pull request #2931 from sriroopar/debug-ls-evals-fin
Fixing temperature for judge LLM and cleaning summarization.
2 parents 2f0d61f + 9df92a1 commit 18a14bc

2 files changed

Lines changed: 26 additions & 2 deletions

File tree

eval/system_openai_lseval.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ core:
1616
llm:
1717
provider: "openai"
1818
model: "gpt-5-mini"
19-
temperature: 0.0
19+
temperature: 1.0
2020
max_tokens: 512
2121
timeout: 300
2222
num_retries: 3

tests/e2e/evaluation/test_lseval_periodic.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
pytest tests/e2e/evaluation/test_lseval_periodic.py -m lseval
3131
"""
3232

33+
import csv
3334
import json
3435
import os
3536
import shutil
@@ -185,7 +186,30 @@ def _run_lseval(eval_data: Path, out_dir: Path, system_config: Path) -> None:
185186
assert json_files, f"No summary JSON artefacts found in {out_dir}"
186187

187188
with open(json_files[0], encoding="utf-8") as fh:
188-
overall = json.load(fh)["summary_stats"]["overall"]
189+
summary_json = json.load(fh)
190+
overall = summary_json["summary_stats"]["overall"]
191+
192+
if overall["error_rate"] > MAX_EVAL_ERROR_RATE_PCT:
193+
judge_tokens = overall.get("total_judge_llm_tokens", -1)
194+
judge_detail = (
195+
"0 → OLS calls failed before judge was reached"
196+
if judge_tokens == 0
197+
else "judge was called"
198+
)
199+
print(
200+
f"\n--- ERROR DIAGNOSTICS ---\n"
201+
f"Judge LLM tokens used: {judge_tokens} ({judge_detail})\n"
202+
)
203+
with open(csv_files[0], encoding="utf-8") as fh:
204+
reader = csv.DictReader(fh)
205+
error_rows = [r for r in reader if r.get("result") == "ERROR"]
206+
if error_rows:
207+
print("First 3 error reasons from detailed CSV:")
208+
for row in error_rows[:3]:
209+
print(
210+
f" turn={row.get('turn_id','?')} reason={row.get('reason','?')[:200]}"
211+
)
212+
189213
assert overall["error_rate"] <= MAX_EVAL_ERROR_RATE_PCT, (
190214
f"{overall['ERROR']}/{overall['TOTAL']} evaluations errored "
191215
f"(error_rate={overall['error_rate']:.1f}% > threshold {MAX_EVAL_ERROR_RATE_PCT}%)."

0 commit comments

Comments
 (0)