[RLlib] Handle the all-evaluation-workers-unhealthy case uniformly across modes

ArturNiederfahrenhorst · claude · ArturNiederfahrenhorst · commit 453923ce0a2e · 2026-05-05T16:28:34.000+02:00
When all *configured* remote evaluation EnvRunners are unhealthy at the
start of an evaluation step, `Algorithm.evaluate()` previously did one
of two things:

- `evaluation_parallel_to_training=True`: fall back to the local eval
  EnvRunner, which raises `ValueError: Cannot run on local evaluation
  worker parallel to training!`. Hard-crashes a long training run.
- `evaluation_parallel_to_training=False`: silently fall back to the
  local eval EnvRunner. "Works" but the eval numbers are quietly
  produced by a different EnvRunner from the one the user configured,
  on the driver process, with potentially different perf and env
  settings.

Both behaviors are gone. RLlib never silently falls back to local eval
in the failure case anymore. Two new orthogonal config knobs on
`AlgorithmConfig.evaluation()` control the behavior:

- `evaluation_unhealthy_workers_timeout_s` (float, default 0): how
  long to wait for at least one remote evaluation EnvRunner to recover.
- `evaluation_error_on_no_workers` (bool, default False): if no remote
  evaluation EnvRunner is healthy after the wait, raise `RuntimeError`
  (True) or skip evaluation for this iteration (False).

Both knobs apply uniformly regardless of `evaluation_parallel_to_training`.

The intentional `evaluation_num_env_runners=0` case (user explicitly
asked for local-only eval) is preserved -- this is not a fallback, it's
the user's chosen configuration, and is recognized via
`num_remote_env_runners() == 0`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/rllib/BUILD.bazel b/rllib/BUILD.bazel
@@ -1685,6 +1685,18 @@ py_test(
     deps = [":conftest"],
 )
 
+py_test(
+    name = "test_eval_workers_all_unhealthy",
+    size = "medium",
+    srcs = ["algorithms/tests/test_eval_workers_all_unhealthy.py"],
+    tags = [
+        "algorithms",
+        "exclusive",
+        "team:rllib",
+    ],
+    deps = [":conftest"],
+)
+
 # Specific Algorithms
 
 # APPO
diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py
@@ -1461,9 +1461,14 @@ def evaluate(
                     kwargs=dict(algorithm=self, metrics_logger=self.metrics),
                 )
 
+            eval_results: ResultDict = {}
             env_steps = agent_steps = 0
             batches = []
 
+            # If *all* configured remote eval EnvRunners are unhealthy,
+            # optionally wait for recovery before deciding to skip / raise.
+            self._maybe_wait_for_eval_env_runner_recovery()
+
             # We will use a user provided evaluation function.
             if self.config.custom_evaluation_function:
                 if self.config.enable_env_runner_and_connector_v2:
@@ -1474,22 +1479,39 @@ def evaluate(
                     ) = self._evaluate_with_custom_eval_function()
                 else:
                     eval_results = self.config.custom_evaluation_function()
-            # There is no eval EnvRunnerGroup -> Run on local EnvRunner.
+            # There is no eval EnvRunnerGroup -> Run on (training) local
+            # EnvRunner.
             elif self.eval_env_runner_group is None and self.env_runner:
                 (
                     eval_results,
                     env_steps,
                     agent_steps,
                     batches,
                 ) = self._evaluate_on_local_env_runner(self.env_runner)
-            # There is only a local eval EnvRunner -> Run on that.
-            elif self.eval_env_runner_group.num_healthy_remote_workers() == 0:
+            # User intentionally configured 0 remote eval EnvRunners
+            # (`evaluation_num_env_runners=0`) -> Run on the local eval
+            # EnvRunner. NB: this is *not* the failure-case fallback; that
+            # path is handled by `evaluation_error_on_no_workers` below.
+            elif self.eval_env_runner_group.num_remote_env_runners() == 0:
                 (
                     eval_results,
                     env_steps,
                     agent_steps,
                     batches,
                 ) = self._evaluate_on_local_env_runner(self.eval_env_runner)
+            # Configured remote eval EnvRunners but *none* are healthy
+            # and the user asked us to raise rather than skip.
+            elif (
+                self.eval_env_runner_group.num_healthy_remote_workers() == 0
+                and self.config.evaluation_error_on_no_workers
+            ):
+                raise RuntimeError(
+                    "All evaluation EnvRunners are unhealthy. Set "
+                    "`evaluation_error_on_no_workers=False` (default) to "
+                    "skip evaluation for this iteration instead of raising, "
+                    "and/or `evaluation_unhealthy_workers_timeout_s` > 0 to "
+                    "wait for recovery before deciding."
+                )
             # There are healthy remote evaluation workers -> Run on these.
             elif self.eval_env_runner_group.num_healthy_remote_workers() > 0:
                 # Running in automatic duration mode (parallel with training step).
@@ -1616,6 +1638,60 @@ def _evaluate_offline_on_local_runner(self):
             key=(EVALUATION_RESULTS, OFFLINE_EVAL_RUNNER_RESULTS),
         )
 
+    def _maybe_wait_for_eval_env_runner_recovery(self) -> None:
+        """Poll for at least one healthy eval EnvRunner if the user asked to.
+
+        When *all* configured remote eval EnvRunners are unhealthy, wait up
+        to `evaluation_unhealthy_workers_timeout_s` seconds for at least one
+        to come back before deciding to skip evaluation or raise (per
+        `evaluation_error_on_no_workers`).
+        """
+        timeout_s = self.config.evaluation_unhealthy_workers_timeout_s
+        if not timeout_s or timeout_s <= 0:
+            return
+        if self.eval_env_runner_group is None:
+            return
+        # Only relevant when remote workers were *configured* but are all
+        # unhealthy. `num_remote_env_runners() == 0` means the user asked
+        # for local-only eval; nothing to wait for.
+        if self.eval_env_runner_group.num_remote_env_runners() == 0:
+            return
+        if self.eval_env_runner_group.num_healthy_remote_workers() > 0:
+            return
+
+        start = time.monotonic()
+        deadline = start + timeout_s
+        # Heartbeat every 60s so long waits show up in logs without spamming.
+        next_log = start + 60.0
+        logger.warning(
+            "All %d remote eval EnvRunner(s) are unhealthy; waiting up to "
+            "%.0fs for at least one to recover before "
+            "deciding to skip evaluation or raise (controlled by "
+            "`evaluation_error_on_no_workers`).",
+            self.eval_env_runner_group.num_remote_env_runners(),
+            timeout_s,
+        )
+        while (
+            self.eval_env_runner_group.num_healthy_remote_workers() == 0
+            and time.monotonic() < deadline
+        ):
+            # Actively ping unhealthy actors so the ActorManager can mark
+            # them healthy if Ray Core has restarted them since the last
+            # call. Without this poke, `num_healthy_remote_workers()`
+            # would stay stuck at 0 even after recovery.
+            self.eval_env_runner_group.probe_unhealthy_env_runners()
+            time.sleep(0.1)
+            now = time.monotonic()
+            if now >= next_log:
+                logger.warning(
+                    "Still 0/%d eval EnvRunners healthy after %.0fs "
+                    "(timeout %.0fs).",
+                    self.eval_env_runner_group.num_remote_env_runners(),
+                    now - start,
+                    timeout_s,
+                )
+                next_log = now + 60.0
+
     def _evaluate_on_local_env_runner(self, env_runner):
         if hasattr(env_runner, "input_reader") and env_runner.input_reader is None:
             raise ValueError(
diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py
@@ -520,6 +520,15 @@ def __init__(self, algo_class: Optional[type] = None):
         self.evaluation_auto_duration_min_env_steps_per_sample = 100
         self.evaluation_auto_duration_max_env_steps_per_sample = 2000
         self.evaluation_parallel_to_training = False
+        # How long to wait for at least one remote eval EnvRunner to recover
+        # when all *configured* remote eval EnvRunners are unhealthy at the
+        # start of an evaluation step. Default 0: don't wait.
+        self.evaluation_unhealthy_workers_timeout_s = 0.0
+        # If still no eval EnvRunners are healthy after the wait above,
+        # raise a clear `RuntimeError` (True) or skip evaluation for this
+        # iteration (False, default). Applies regardless of
+        # `evaluation_parallel_to_training`.
+        self.evaluation_error_on_no_workers = False
         self.evaluation_force_reset_envs_before_iteration = True
         self.evaluation_config = None
         self.off_policy_estimation_methods = {}
@@ -2744,6 +2753,8 @@ def evaluation(
         evaluation_auto_duration_max_env_steps_per_sample: Optional[int] = NotProvided,
         evaluation_sample_timeout_s: Optional[float] = NotProvided,
         evaluation_parallel_to_training: Optional[bool] = NotProvided,
+        evaluation_unhealthy_workers_timeout_s: Optional[float] = NotProvided,
+        evaluation_error_on_no_workers: Optional[bool] = NotProvided,
         evaluation_force_reset_envs_before_iteration: Optional[bool] = NotProvided,
         evaluation_config: Optional[
             Union["AlgorithmConfig", PartialAlgorithmConfigDict]
@@ -2827,6 +2838,20 @@ def evaluation(
                 reports a good evaluation `episode_return_mean`, be aware that these
                 results were achieved on the weights trained in iteration 41, so you
                 should probably pick the iteration 41 checkpoint instead.
+            evaluation_unhealthy_workers_timeout_s: How long (in seconds) to
+                wait for at least one remote eval EnvRunner to recover when
+                all *configured* remote eval EnvRunners are unhealthy at the
+                start of an evaluation step. Default 0: don't wait. Combine
+                with `evaluation_error_on_no_workers` to choose what happens
+                if recovery doesn't arrive in time. Applies regardless of
+                `evaluation_parallel_to_training`.
+            evaluation_error_on_no_workers: If still no remote eval
+                EnvRunners are healthy after waiting
+                `evaluation_unhealthy_workers_timeout_s` seconds, raise a
+                clear `RuntimeError` (True) or skip evaluation for this
+                iteration (False, default). Has no effect if
+                `evaluation_num_env_runners=0` (in which case local eval is
+                the user's intentional choice).
             evaluation_force_reset_envs_before_iteration: Whether all environments
                 should be force-reset (even if they are not done yet) right before
                 the evaluation step of the iteration begins. Setting this to True
@@ -3000,6 +3025,12 @@ def evaluation(
             self.evaluation_sample_timeout_s = evaluation_sample_timeout_s
         if evaluation_parallel_to_training is not NotProvided:
             self.evaluation_parallel_to_training = evaluation_parallel_to_training
+        if evaluation_unhealthy_workers_timeout_s is not NotProvided:
+            self.evaluation_unhealthy_workers_timeout_s = (
+                evaluation_unhealthy_workers_timeout_s
+            )
+        if evaluation_error_on_no_workers is not NotProvided:
+            self.evaluation_error_on_no_workers = evaluation_error_on_no_workers
         if evaluation_force_reset_envs_before_iteration is not NotProvided:
             self.evaluation_force_reset_envs_before_iteration = (
                 evaluation_force_reset_envs_before_iteration
diff --git a/rllib/algorithms/tests/test_eval_workers_all_unhealthy.py b/rllib/algorithms/tests/test_eval_workers_all_unhealthy.py
@@ -0,0 +1,126 @@
+"""Tests for evaluating when all configured remote eval EnvRunners are
+unhealthy.
+
+The behavior is controlled by two orthogonal config knobs:
+
+- ``evaluation_unhealthy_workers_timeout_s``: how long to wait for at
+  least one eval EnvRunner to recover before deciding what to do
+  (default 0: don't wait).
+- ``evaluation_error_on_no_workers``: if still no healthy eval EnvRunners
+  after the wait, raise ``RuntimeError`` (True) or skip evaluation for
+  this iteration (False, default).
+
+Both apply identically regardless of ``evaluation_parallel_to_training``.
+"""
+import time
+
+import pytest
+
+import ray
+from ray.rllib.algorithms.ppo import PPOConfig
+
+
+@pytest.fixture(params=[True, False], ids=["parallel", "sequential"])
+def parallel_to_training(request):
+    return request.param
+
+
+def _algo_with_killed_eval_workers(
+    *,
+    timeout_s=0,
+    error_on_no_workers=False,
+    parallel_to_training=True,
+):
+    """Build a PPO algo, kill every eval worker, mark them all unhealthy.
+
+    Returns the live ``Algorithm`` ready for an ``algo.evaluate()`` call.
+
+    Config is the smallest that exercises the failure path: no remote
+    training EnvRunners, 1 (single) remote eval EnvRunner that we kill,
+    fixed-duration eval so we can call `evaluate()` directly without a
+    parallel-training future.
+    """
+    config = (
+        PPOConfig()
+        .environment("CartPole-v1")
+        # Local-only training: skips remote train EnvRunner setup.
+        .env_runners(num_env_runners=0)
+        .evaluation(
+            # 1 eval worker is enough; killing it leaves 0 healthy, which
+            # is the condition we're testing.
+            evaluation_num_env_runners=1,
+            evaluation_interval=1,
+            evaluation_parallel_to_training=parallel_to_training,
+            # Fixed duration so `evaluate()` doesn't need a parallel-train
+            # future (avoids the `auto` branch's `assert future is not None`).
+            evaluation_duration=1,
+            evaluation_duration_unit="episodes",
+            evaluation_unhealthy_workers_timeout_s=timeout_s,
+            evaluation_error_on_no_workers=error_on_no_workers,
+        )
+        .fault_tolerance(restart_failed_env_runners=False)
+    )
+    algo = config.build()
+
+    # Kill the eval worker and mark it unhealthy. Mirrors losing every
+    # eval node at once.
+    eval_grp = algo.eval_env_runner_group
+    for a in list(eval_grp._worker_manager._actors.values()):
+        ray.kill(a, no_restart=True)
+    for actor_id in list(eval_grp._worker_manager.actor_ids()):
+        eval_grp._worker_manager.set_actor_state(actor_id, healthy=False)
+    assert eval_grp.num_healthy_remote_workers() == 0
+    return algo
+
+
+def test_default_skips_eval_silently(parallel_to_training):
+    """Defaults (timeout=0, error=False): evaluate() must return cleanly
+    even though every eval worker is dead -- it just skips."""
+    algo = _algo_with_killed_eval_workers(parallel_to_training=parallel_to_training)
+    algo.evaluate()  # must not raise
+
+
+def test_error_on_no_workers_raises(parallel_to_training):
+    """error_on_no_workers=True with no recovery: evaluate() must raise a
+    clear, actionable error rather than silently skipping."""
+    algo = _algo_with_killed_eval_workers(
+        error_on_no_workers=True, parallel_to_training=parallel_to_training
+    )
+    with pytest.raises(RuntimeError, match="evaluation_error_on_no_workers"):
+        algo.evaluate()
+
+
+def test_timeout_waits_then_skips_when_no_recovery(parallel_to_training):
+    """timeout_s>0 with workers that never come back: evaluate() should
+    take at least roughly that long (waiting for recovery), then skip
+    silently because error_on_no_workers defaults to False."""
+    timeout_s = 0.5
+    algo = _algo_with_killed_eval_workers(
+        timeout_s=timeout_s, parallel_to_training=parallel_to_training
+    )
+    start = time.monotonic()
+    algo.evaluate()  # must not raise
+    elapsed = time.monotonic() - start
+    assert elapsed >= timeout_s
+
+
+def test_timeout_then_error_when_no_recovery(parallel_to_training):
+    """timeout_s>0 + error_on_no_workers=True with no recovery: wait the
+    timeout, then raise."""
+    timeout_s = 0.3
+    algo = _algo_with_killed_eval_workers(
+        timeout_s=timeout_s,
+        error_on_no_workers=True,
+        parallel_to_training=parallel_to_training,
+    )
+    start = time.monotonic()
+    with pytest.raises(RuntimeError, match="evaluation_error_on_no_workers"):
+        algo.evaluate()
+    elapsed = time.monotonic() - start
+    assert elapsed >= timeout_s
+
+
+if __name__ == "__main__":
+    import sys
+
+    sys.exit(pytest.main(["-v", __file__]))