We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent d957771, commit 7dac466 (Copy full SHA for 7dac466)
1 file changed
eval_protocol/benchmarks/test_tau_bench_retail.py
@@ -108,6 +108,7 @@ def tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
108
],
109
rollout_processor=MCPGymRolloutProcessor(),
110
rollout_processor_kwargs={"domain": "retail"},
111
+ passed_threshold={"success": 0.65, "standard_error": 0.02},
112
num_runs=8,
113
mode="pointwise",
114
max_concurrent_rollouts=50,
0 commit comments