4141import pytest
4242import yaml
4343
44- MAX_EVAL_ERROR_RATE_PCT = 10.0
45-
4644# LSEval periodic provider matrix (operator-backed backends under test)
4745_LSEVAL_PERIODIC_PROVIDERS = ("openai" , "watsonx" , "azure_openai" )
4846
@@ -189,17 +187,25 @@ def _run_lseval(eval_data: Path, out_dir: Path, system_config: Path) -> None:
189187 summary_json = json .load (fh )
190188 overall = summary_json ["summary_stats" ]["overall" ]
191189
192- if overall ["error_rate" ] > MAX_EVAL_ERROR_RATE_PCT :
193- judge_tokens = overall .get ("total_judge_llm_tokens" , - 1 )
194- judge_detail = (
195- "0 → OLS calls failed before judge was reached"
196- if judge_tokens == 0
197- else "judge was called"
198- )
199- print (
200- f"\n --- ERROR DIAGNOSTICS ---\n "
201- f"Judge LLM tokens used: { judge_tokens } ({ judge_detail } )\n "
202- )
190+ error_rate = overall ["error_rate" ]
191+ total = overall ["TOTAL" ]
192+ errors = overall ["ERROR" ]
193+ passed = total - errors
194+
195+ judge_tokens = overall .get ("total_judge_llm_tokens" , - 1 )
196+ judge_detail = (
197+ "0 → OLS calls failed before judge was reached"
198+ if judge_tokens == 0
199+ else "judge was called"
200+ )
201+ print (
202+ f"\n --- EVAL SUMMARY ---\n "
203+ f"Total={ total } Passed={ passed } Errors={ errors } "
204+ f"error_rate={ error_rate :.1f} %\n "
205+ f"Judge LLM tokens used: { judge_tokens } ({ judge_detail } )\n "
206+ )
207+
208+ if errors :
203209 with open (csv_files [0 ], encoding = "utf-8" ) as fh :
204210 reader = csv .DictReader (fh )
205211 error_rows = [r for r in reader if r .get ("result" ) == "ERROR" ]
@@ -210,10 +216,7 @@ def _run_lseval(eval_data: Path, out_dir: Path, system_config: Path) -> None:
210216 f" turn={ row .get ('turn_id' ,'?' )} reason={ row .get ('reason' ,'?' )[:200 ]} "
211217 )
212218
213- assert overall ["error_rate" ] <= MAX_EVAL_ERROR_RATE_PCT , (
214- f"{ overall ['ERROR' ]} /{ overall ['TOTAL' ]} evaluations errored "
215- f"(error_rate={ overall ['error_rate' ]:.1f} % > threshold { MAX_EVAL_ERROR_RATE_PCT } %)."
216- )
219+ assert passed > 0 , f"All { total } evaluations errored — zero successful results."
217220
218221
219222@pytest .mark .lseval
0 commit comments