fix: use training-appropriate evaluate timeouts instead of reordering eval (#246)

abrichr · claude · web-flow · commit 114ad0e8bdc3 · 2026-03-29T16:41:47.000-04:00
Reverts the evaluate_dense reordering from #245 (local-first was too aggressive — it skipped binary eval entirely, losing the signal when port 5050 IS available).

The actual fix: set evaluate_timeout=15s and evaluate_retries=1 on the WAALiveAdapter in the TRL wrapper. The evaluate_dense logic stays correct (try binary first, local fallback, take max). Training speed comes from fast failure, not from skipping evaluation paths.

- Benchmarking: 180s timeout, 3 retries (thorough, one-shot)
- Training: 15s timeout, 1 retry (fast feedback, thousands of evals)

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/openadapt_evals/adapters/rl_env.py b/openadapt_evals/adapters/rl_env.py
@@ -602,18 +602,27 @@ def evaluate_dense(self) -> float:
             if total > 0:
                 milestone_score = passed / total
 
-                # Try LOCAL evaluation FIRST (fast, ~5s) when we have
-                # task config checks. This avoids the 9+ minute timeout
-                # when the /evaluate endpoint (port 5050) is unresponsive.
-                # Only fall back to binary evaluate() if local eval fails
-                # or no local checks are defined.
-                binary_score = 0.0
-                server_url = getattr(
-                    getattr(self._adapter, "config", None),
-                    "server_url", "",
-                ) or ""
-
-                if self._task_config.checks and screenshot:
+                # Try binary evaluation (remote /evaluate endpoint).
+                # Speed depends on adapter's evaluate_timeout config:
+                # - Benchmarking: 180s timeout, 3 retries (thorough)
+                # - Training: 15s timeout, 1 retry (fast feedback)
+                # The TRL wrapper sets training-appropriate timeouts.
+                try:
+                    binary_score = self.evaluate()
+                except Exception:
+                    binary_score = 0.0
+
+                # If binary eval returned 0.0 (endpoint down or task
+                # failed), try local evaluation via task config checks.
+                if (
+                    binary_score == 0.0
+                    and self._task_config.checks
+                    and screenshot
+                ):
+                    server_url = getattr(
+                        getattr(self._adapter, "config", None),
+                        "server_url", "",
+                    ) or ""
                     try:
                         binary_score = (
                             self._task_config.evaluate_checks_local(
@@ -622,25 +631,15 @@ def evaluate_dense(self) -> float:
                         )
                         if binary_score > 0:
                             logger.info(
-                                "evaluate_dense: local checks returned %.2f "
-                                "(skipping slow /evaluate endpoint)",
-                                binary_score,
+                                "evaluate_dense: local check fallback "
+                                "returned %.2f", binary_score,
                             )
                     except Exception as exc:
                         logger.debug(
-                            "evaluate_dense: local check failed: %s", exc,
+                            "evaluate_dense: local check fallback "
+                            "failed: %s", exc,
                         )
 
-                # Only try the slow /evaluate endpoint if local eval
-                # returned 0.0 AND no local checks were available.
-                # This is the path that causes 9+ min timeouts when
-                # port 5050 is down.
-                if binary_score == 0.0 and not self._task_config.checks:
-                    try:
-                        binary_score = self.evaluate()
-                    except Exception:
-                        binary_score = 0.0
-
                 # Use the higher of milestone score and binary score
                 score = max(milestone_score, binary_score)
 
diff --git a/openadapt_evals/training/trl_wrapper.py b/openadapt_evals/training/trl_wrapper.py
@@ -152,6 +152,12 @@ def train(self) -> str:
         adapter = WAALiveAdapter(WAALiveConfig(
             server_url=self._config.server_url,
             evaluate_url=getattr(self._config, "evaluate_url", None),
+            # Training-appropriate timeouts: fail fast, don't block the
+            # training loop. Benchmark defaults (180s, 3 retries) are for
+            # one-shot evaluation where thoroughness matters. Training does
+            # thousands of evaluations where speed matters.
+            evaluate_timeout=15.0,
+            evaluate_retries=1,
         ))
         rollout_func = make_waa_rollout_func(
             adapter=adapter,
diff --git a/tests/test_dense_rewards.py b/tests/test_dense_rewards.py
@@ -224,16 +224,16 @@ def test_reset_uses_task_config_for_task_loading(self):
         assert env._current_task.task_id == "test-001"
 
 
-class TestEvaluateDenseLocalFirst:
-    """Verify evaluate_dense tries local checks BEFORE the slow /evaluate endpoint.
+class TestEvaluateDenseEvalOrder:
+    """Verify evaluate_dense tries binary first, local fallback second.
 
-    This is critical for training performance: the /evaluate endpoint on port
-    5050 can timeout for 9+ minutes (180s × 3 retries), while local checks
-    take ~5 seconds. The evaluate_dense path must try local first.
+    Training speed comes from the adapter's timeout config (15s for training
+    vs 180s for benchmarking), NOT from skipping the binary eval path.
+    Local checks run only when binary returns 0.0; max(milestone, binary) wins.
     """
 
-    def test_local_eval_before_binary_when_checks_defined(self):
-        """When task has checks, local eval runs first and binary is skipped."""
+    def test_binary_eval_called_first(self):
+        """Binary evaluate() is always called when milestones exist."""
         adapter = _make_adapter()
         check = TaskCheck(check="command", run="echo 1", expect="1", match="exact")
         task_config = _make_task_config(
@@ -244,36 +244,37 @@ def test_local_eval_before_binary_when_checks_defined(self):
         env = RLEnvironment(adapter, task_config=task_config)
         env.reset(config=ResetConfig(task_id="test-001"))
 
-        with patch.object(task_config, "evaluate_checks_local", return_value=1.0) as mock_local:
-            score = env.evaluate_dense()
+        with patch.object(task_config, "evaluate_checks_local", return_value=1.0):
+            env.evaluate_dense()
 
-        mock_local.assert_called_once()
-        adapter.evaluate.assert_not_called()
-        assert score >= 1.0
+        # Binary eval was called (returns 0.0 from mock default)
+        adapter.evaluate.assert_called_once()
 
-    def test_binary_eval_used_when_no_checks(self):
-        """When task has no checks, falls through to binary evaluate."""
+    def test_local_fallback_when_binary_returns_zero(self):
+        """Local checks run as fallback when binary returns 0.0."""
         adapter = _make_adapter()
-        adapter.evaluate.return_value = BenchmarkResult(
-            task_id="test-001", success=True, score=0.75,
-        )
         check = TaskCheck(check="command", run="echo 1", expect="1", match="exact")
         task_config = _make_task_config(
             milestones=[Milestone(name="Step done", check=check)],
         )
-        # No checks — must fall through to binary
+        task_config.checks = [check]
 
         env = RLEnvironment(adapter, task_config=task_config)
         env.reset(config=ResetConfig(task_id="test-001"))
 
-        score = env.evaluate_dense()
+        with patch.object(task_config, "evaluate_checks_local", return_value=1.0) as mock_local:
+            score = env.evaluate_dense()
 
-        adapter.evaluate.assert_called_once()
+        mock_local.assert_called_once()
+        assert score >= 1.0
 
-    def test_local_eval_failure_does_not_call_binary(self):
-        """When local eval returns 0.0, binary is still skipped if checks exist."""
+    def test_local_not_called_when_binary_succeeds(self):
+        """Local checks are skipped when binary eval returns > 0."""
         adapter = _make_adapter()
-        check = TaskCheck(check="command", run="echo 0", expect="1", match="exact")
+        adapter.evaluate.return_value = BenchmarkResult(
+            task_id="test-001", success=True, score=0.75,
+        )
+        check = TaskCheck(check="command", run="echo 1", expect="1", match="exact")
         task_config = _make_task_config(
             milestones=[Milestone(name="Step done", check=check)],
         )
@@ -282,7 +283,8 @@ def test_local_eval_failure_does_not_call_binary(self):
         env = RLEnvironment(adapter, task_config=task_config)
         env.reset(config=ResetConfig(task_id="test-001"))
 
-        with patch.object(task_config, "evaluate_checks_local", return_value=0.0):
+        with patch.object(task_config, "evaluate_checks_local") as mock_local:
             score = env.evaluate_dense()
 
-        adapter.evaluate.assert_not_called()
+        mock_local.assert_not_called()
+        assert score >= 0.75