Skip to content

Commit 3b8c1c2

Browse files
abrichr and claude authored
fix: try local eval before slow /evaluate endpoint in evaluate_dense (#245)
51% of TRL training time was wasted on timeouts of the /evaluate endpoint on port 5050 (180s × 3 retries = 9 min per evaluation). The local evaluation via evaluate_checks_local takes ~5s. Fix: when the task config has checks defined, try local eval FIRST. Only fall through to the slow /evaluate endpoint when no local checks exist. This eliminates the 9-minute timeout for custom YAML tasks that define their own checks. Before: evaluate() [9 min] → if 0.0 → local [5s]. After: local [5s] → if no checks → evaluate() [9 min]. Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent d8c6187 commit 3b8c1c2

2 files changed

Lines changed: 90 additions & 22 deletions

File tree

openadapt_evals/adapters/rl_env.py

Lines changed: 26 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -602,24 +602,18 @@ def evaluate_dense(self) -> float:
602602
if total > 0:
603603
milestone_score = passed / total
604604

605-
# Also try binary evaluation if available
606-
try:
607-
binary_score = self.evaluate()
608-
except Exception:
609-
binary_score = 0.0
610-
611-
# If binary eval returned 0.0 (often means /evaluate is
612-
# down), fall back to the task config's own checks run
613-
# locally via /execute_windows + VLM.
614-
if (
615-
binary_score == 0.0
616-
and self._task_config.checks
617-
and screenshot
618-
):
619-
server_url = getattr(
620-
getattr(self._adapter, "config", None),
621-
"server_url", "",
622-
) or ""
605+
# Try LOCAL evaluation FIRST (fast, ~5s) when we have
606+
# task config checks. This avoids the 9+ minute timeout
607+
# when the /evaluate endpoint (port 5050) is unresponsive.
608+
# Only fall back to binary evaluate() when no local checks
609+
# are defined; a failed local eval does NOT trigger it.
610+
binary_score = 0.0
611+
server_url = getattr(
612+
getattr(self._adapter, "config", None),
613+
"server_url", "",
614+
) or ""
615+
616+
if self._task_config.checks and screenshot:
623617
try:
624618
binary_score = (
625619
self._task_config.evaluate_checks_local(
@@ -628,15 +622,25 @@ def evaluate_dense(self) -> float:
628622
)
629623
if binary_score > 0:
630624
logger.info(
631-
"evaluate_dense: local check fallback "
632-
"returned %.2f", binary_score,
625+
"evaluate_dense: local checks returned %.2f "
626+
"(skipping slow /evaluate endpoint)",
627+
binary_score,
633628
)
634629
except Exception as exc:
635630
logger.debug(
636-
"evaluate_dense: local check fallback "
637-
"failed: %s", exc,
631+
"evaluate_dense: local check failed: %s", exc,
638632
)
639633

634+
# Only try the slow /evaluate endpoint if local eval
635+
# returned 0.0 AND no local checks were available.
636+
# This is the path that causes 9+ min timeouts when
637+
# port 5050 is down.
638+
if binary_score == 0.0 and not self._task_config.checks:
639+
try:
640+
binary_score = self.evaluate()
641+
except Exception:
642+
binary_score = 0.0
643+
640644
# Use the higher of milestone score and binary score
641645
score = max(milestone_score, binary_score)
642646

tests/test_dense_rewards.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,3 +222,67 @@ def test_reset_uses_task_config_for_task_loading(self):
222222
# Should use load_task_from_json since task_config matches
223223
assert env._current_task is not None
224224
assert env._current_task.task_id == "test-001"
225+
226+
227+
class TestEvaluateDenseLocalFirst:
228+
"""Verify evaluate_dense tries local checks BEFORE the slow /evaluate endpoint.
229+
230+
This is critical for training performance: the /evaluate endpoint on port
231+
5050 can timeout for 9+ minutes (180s × 3 retries), while local checks
232+
take ~5 seconds. The evaluate_dense path must try local first.
233+
"""
234+
235+
def test_local_eval_before_binary_when_checks_defined(self):
236+
"""When task has checks, local eval runs first and binary is skipped."""
237+
adapter = _make_adapter()
238+
check = TaskCheck(check="command", run="echo 1", expect="1", match="exact")
239+
task_config = _make_task_config(
240+
milestones=[Milestone(name="Step done", check=check)],
241+
)
242+
task_config.checks = [check]
243+
244+
env = RLEnvironment(adapter, task_config=task_config)
245+
env.reset(config=ResetConfig(task_id="test-001"))
246+
247+
with patch.object(task_config, "evaluate_checks_local", return_value=1.0) as mock_local:
248+
score = env.evaluate_dense()
249+
250+
mock_local.assert_called_once()
251+
adapter.evaluate.assert_not_called()
252+
assert score >= 1.0
253+
254+
def test_binary_eval_used_when_no_checks(self):
255+
"""When task has no checks, falls through to binary evaluate."""
256+
adapter = _make_adapter()
257+
adapter.evaluate.return_value = BenchmarkResult(
258+
task_id="test-001", success=True, score=0.75,
259+
)
260+
check = TaskCheck(check="command", run="echo 1", expect="1", match="exact")
261+
task_config = _make_task_config(
262+
milestones=[Milestone(name="Step done", check=check)],
263+
)
264+
# No checks — must fall through to binary
265+
266+
env = RLEnvironment(adapter, task_config=task_config)
267+
env.reset(config=ResetConfig(task_id="test-001"))
268+
269+
score = env.evaluate_dense()
270+
271+
adapter.evaluate.assert_called_once()
272+
273+
def test_local_eval_failure_does_not_call_binary(self):
274+
"""When local eval returns 0.0, binary is still skipped if checks exist."""
275+
adapter = _make_adapter()
276+
check = TaskCheck(check="command", run="echo 0", expect="1", match="exact")
277+
task_config = _make_task_config(
278+
milestones=[Milestone(name="Step done", check=check)],
279+
)
280+
task_config.checks = [check]
281+
282+
env = RLEnvironment(adapter, task_config=task_config)
283+
env.reset(config=ResetConfig(task_id="test-001"))
284+
285+
with patch.object(task_config, "evaluate_checks_local", return_value=0.0):
286+
score = env.evaluate_dense()
287+
288+
adapter.evaluate.assert_not_called()

0 commit comments

Comments
 (0)