Skip to content

Commit fa4409f

Browse files
committed
Remove the temperature parameter from the eval configs and drop the error-rate pass threshold.
1 parent 18a14bc commit fa4409f

7 files changed

Lines changed: 20 additions & 23 deletions

File tree

eval/system_azure_openai_lseval.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ core:
1515
llm:
1616
provider: "openai"
1717
model: "gpt-5-mini"
18-
temperature: 0.0
1918
max_tokens: 512
2019
timeout: 300
2120
num_retries: 3

eval/system_openai_lseval.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ core:
1616
llm:
1717
provider: "openai"
1818
model: "gpt-5-mini"
19-
temperature: 1.0
2019
max_tokens: 512
2120
timeout: 300
2221
num_retries: 3

eval/system_rhelai_vllm_lseval.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ core:
1818
llm:
1919
provider: "openai"
2020
model: "gpt-5-mini"
21-
temperature: 0.0
2221
max_tokens: 512
2322
timeout: 300
2423
num_retries: 3

eval/system_rhoai_vllm_lseval.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ core:
1818
llm:
1919
provider: "openai"
2020
model: "gpt-5-mini"
21-
temperature: 0.0
2221
max_tokens: 512
2322
timeout: 300
2423
num_retries: 3

eval/system_watsonx_lseval.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ core:
1515
llm:
1616
provider: "openai"
1717
model: "gpt-5-mini"
18-
temperature: 0.0
1918
max_tokens: 512
2019
timeout: 300
2120
num_retries: 3

eval/troubleshooting/system.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ core:
1414
llm:
1515
provider: "openai"
1616
model: "gpt-5-mini"
17-
temperature: 0.0
1817
max_tokens: 5000
1918
timeout: 300
2019
num_retries: 3

tests/e2e/evaluation/test_lseval_periodic.py

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,6 @@
4141
import pytest
4242
import yaml
4343

44-
MAX_EVAL_ERROR_RATE_PCT = 10.0
45-
4644
# LSEval periodic provider matrix (operator-backed backends under test)
4745
_LSEVAL_PERIODIC_PROVIDERS = ("openai", "watsonx", "azure_openai")
4846

@@ -189,17 +187,25 @@ def _run_lseval(eval_data: Path, out_dir: Path, system_config: Path) -> None:
189187
summary_json = json.load(fh)
190188
overall = summary_json["summary_stats"]["overall"]
191189

192-
if overall["error_rate"] > MAX_EVAL_ERROR_RATE_PCT:
193-
judge_tokens = overall.get("total_judge_llm_tokens", -1)
194-
judge_detail = (
195-
"0 → OLS calls failed before judge was reached"
196-
if judge_tokens == 0
197-
else "judge was called"
198-
)
199-
print(
200-
f"\n--- ERROR DIAGNOSTICS ---\n"
201-
f"Judge LLM tokens used: {judge_tokens} ({judge_detail})\n"
202-
)
190+
error_rate = overall["error_rate"]
191+
total = overall["TOTAL"]
192+
errors = overall["ERROR"]
193+
passed = total - errors
194+
195+
judge_tokens = overall.get("total_judge_llm_tokens", -1)
196+
judge_detail = (
197+
"0 → OLS calls failed before judge was reached"
198+
if judge_tokens == 0
199+
else "judge was called"
200+
)
201+
print(
202+
f"\n--- EVAL SUMMARY ---\n"
203+
f"Total={total} Passed={passed} Errors={errors} "
204+
f"error_rate={error_rate:.1f}%\n"
205+
f"Judge LLM tokens used: {judge_tokens} ({judge_detail})\n"
206+
)
207+
208+
if errors:
203209
with open(csv_files[0], encoding="utf-8") as fh:
204210
reader = csv.DictReader(fh)
205211
error_rows = [r for r in reader if r.get("result") == "ERROR"]
@@ -210,10 +216,7 @@ def _run_lseval(eval_data: Path, out_dir: Path, system_config: Path) -> None:
210216
f" turn={row.get('turn_id','?')} reason={row.get('reason','?')[:200]}"
211217
)
212218

213-
assert overall["error_rate"] <= MAX_EVAL_ERROR_RATE_PCT, (
214-
f"{overall['ERROR']}/{overall['TOTAL']} evaluations errored "
215-
f"(error_rate={overall['error_rate']:.1f}% > threshold {MAX_EVAL_ERROR_RATE_PCT}%)."
216-
)
219+
assert passed > 0, f"All {total} evaluations errored — zero successful results."
217220

218221

219222
@pytest.mark.lseval

0 commit comments

Comments
 (0)