fix: try local eval before slow /evaluate endpoint in evaluate_dense (#245)

abrichr · claude · web-flow · commit 3b8c1c2b6317 · 2026-03-29T16:15:04.000-04:00
51% of TRL training time was wasted on timeouts of the /evaluate endpoint
(port 5050): 180s × 3 retries = 9 min per evaluation. Local evaluation
via evaluate_checks_local takes ~5s.

Fix: when task config has checks defined, try local eval FIRST. Only
fall through to the slow /evaluate endpoint when no local checks exist.
This eliminates the 9-minute timeout for custom YAML tasks that define
their own checks.

Before: evaluate() [9 min] → if 0.0 → local [5s]
After:  local [5s] → if no checks → evaluate() [9 min]

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/openadapt_evals/adapters/rl_env.py b/openadapt_evals/adapters/rl_env.py
@@ -602,24 +602,18 @@ def evaluate_dense(self) -> float:
             if total > 0:
                 milestone_score = passed / total
 
-                # Also try binary evaluation if available
-                try:
-                    binary_score = self.evaluate()
-                except Exception:
-                    binary_score = 0.0
-
-                # If binary eval returned 0.0 (often means /evaluate is
-                # down), fall back to the task config's own checks run
-                # locally via /execute_windows + VLM.
-                if (
-                    binary_score == 0.0
-                    and self._task_config.checks
-                    and screenshot
-                ):
-                    server_url = getattr(
-                        getattr(self._adapter, "config", None),
-                        "server_url", "",
-                    ) or ""
+                # Try LOCAL evaluation FIRST (fast, ~5s) when we have
+                # task config checks. This avoids the 9+ minute timeout
+                # when the /evaluate endpoint (port 5050) is unresponsive.
+                # Binary evaluate() is only attempted when the task config
+                # defines no checks at all (not when local eval fails).
+                binary_score = 0.0
+                server_url = getattr(
+                    getattr(self._adapter, "config", None),
+                    "server_url", "",
+                ) or ""
+
+                if self._task_config.checks and screenshot:
                     try:
                         binary_score = (
                             self._task_config.evaluate_checks_local(
@@ -628,15 +622,25 @@ def evaluate_dense(self) -> float:
                         )
                         if binary_score > 0:
                             logger.info(
-                                "evaluate_dense: local check fallback "
-                                "returned %.2f", binary_score,
+                                "evaluate_dense: local checks returned %.2f "
+                                "(skipping slow /evaluate endpoint)",
+                                binary_score,
                             )
                     except Exception as exc:
                         logger.debug(
-                            "evaluate_dense: local check fallback "
-                            "failed: %s", exc,
+                            "evaluate_dense: local check failed: %s", exc,
                         )
 
+                # Only try the slow /evaluate endpoint if local eval
+                # returned 0.0 AND no local checks were available.
+                # This is the path that causes 9+ min timeouts when
+                # port 5050 is down.
+                if binary_score == 0.0 and not self._task_config.checks:
+                    try:
+                        binary_score = self.evaluate()
+                    except Exception:
+                        binary_score = 0.0
+
                 # Use the higher of milestone score and binary score
                 score = max(milestone_score, binary_score)
 
diff --git a/tests/test_dense_rewards.py b/tests/test_dense_rewards.py
@@ -222,3 +222,67 @@ def test_reset_uses_task_config_for_task_loading(self):
         # Should use load_task_from_json since task_config matches
         assert env._current_task is not None
         assert env._current_task.task_id == "test-001"
+
+
+class TestEvaluateDenseLocalFirst:
+    """Verify evaluate_dense tries local checks BEFORE the slow /evaluate endpoint.
+
+    This is critical for training performance: the /evaluate endpoint on port
+    5050 can timeout for 9+ minutes (180s × 3 retries), while local checks
+    take ~5 seconds. The evaluate_dense path must try local first.
+    """
+
+    def test_local_eval_before_binary_when_checks_defined(self):
+        """When task has checks, local eval runs first and binary is skipped."""
+        adapter = _make_adapter()
+        check = TaskCheck(check="command", run="echo 1", expect="1", match="exact")
+        task_config = _make_task_config(
+            milestones=[Milestone(name="Step done", check=check)],
+        )
+        task_config.checks = [check]
+
+        env = RLEnvironment(adapter, task_config=task_config)
+        env.reset(config=ResetConfig(task_id="test-001"))
+
+        with patch.object(task_config, "evaluate_checks_local", return_value=1.0) as mock_local:
+            score = env.evaluate_dense()
+
+        mock_local.assert_called_once()
+        adapter.evaluate.assert_not_called()
+        assert score >= 1.0
+
+    def test_binary_eval_used_when_no_checks(self):
+        """When task has no checks, falls through to binary evaluate."""
+        adapter = _make_adapter()
+        adapter.evaluate.return_value = BenchmarkResult(
+            task_id="test-001", success=True, score=0.75,
+        )
+        check = TaskCheck(check="command", run="echo 1", expect="1", match="exact")
+        task_config = _make_task_config(
+            milestones=[Milestone(name="Step done", check=check)],
+        )
+        # No checks — must fall through to binary
+
+        env = RLEnvironment(adapter, task_config=task_config)
+        env.reset(config=ResetConfig(task_id="test-001"))
+
+        score = env.evaluate_dense()
+
+        adapter.evaluate.assert_called_once()
+
+    def test_local_eval_failure_does_not_call_binary(self):
+        """When local eval returns 0.0, binary is still skipped if checks exist."""
+        adapter = _make_adapter()
+        check = TaskCheck(check="command", run="echo 0", expect="1", match="exact")
+        task_config = _make_task_config(
+            milestones=[Milestone(name="Step done", check=check)],
+        )
+        task_config.checks = [check]
+
+        env = RLEnvironment(adapter, task_config=task_config)
+        env.reset(config=ResetConfig(task_id="test-001"))
+
+        with patch.object(task_config, "evaluate_checks_local", return_value=0.0):
+            score = env.evaluate_dense()
+
+        adapter.evaluate.assert_not_called()