dancingactor
diff --git a/‎rllib/BUILD.bazel‎
Lines changed: 12 additions & 0 deletions b/‎rllib/BUILD.bazel‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎rllib/algorithms/algorithm.py‎
Lines changed: 136 additions & 6 deletions b/‎rllib/algorithms/algorithm.py‎
Lines changed: 136 additions & 6 deletions
diff --git a/‎rllib/algorithms/algorithm_config.py‎
Lines changed: 44 additions & 0 deletions b/‎rllib/algorithms/algorithm_config.py‎
Lines changed: 44 additions & 0 deletions
@@ -1685,6 +1685,18 @@ py_test(
     deps = [":conftest"],
 )
 
+# Test target running `algorithms/tests/test_eval_workers_all_unhealthy.py`.
+# NOTE(review): judging by the name, this covers evaluation while all eval
+# EnvRunners are unhealthy -- confirm against the test file.
+py_test(
+    name = "test_eval_workers_all_unhealthy",
+    size = "medium",
+    srcs = ["algorithms/tests/test_eval_workers_all_unhealthy.py"],
+    tags = [
+        "algorithms",
+        "exclusive",
+        "team:rllib",
+    ],
+    deps = [":conftest"],
+)
+
 # Specific Algorithms
 
 # APPO
 
@@ -1461,9 +1461,14 @@ def evaluate(
                     kwargs=dict(algorithm=self, metrics_logger=self.metrics),
                 )
 
+            eval_results: ResultDict = {}
             env_steps = agent_steps = 0
             batches = []
 
+            # If *all* configured remote eval EnvRunners are unhealthy,
+            # optionally wait for recovery before deciding to skip / raise.
+            self._maybe_wait_for_eval_env_runner_recovery()
+
             # We will use a user provided evaluation function.
             if self.config.custom_evaluation_function:
                 if self.config.enable_env_runner_and_connector_v2:
@@ -1474,24 +1479,28 @@ def evaluate(
                     ) = self._evaluate_with_custom_eval_function()
                 else:
                     eval_results = self.config.custom_evaluation_function()
-            # There is no eval EnvRunnerGroup -> Run on local EnvRunner.
+            # No eval EnvRunnerGroup -> Run on (training) local EnvRunner.
             elif self.eval_env_runner_group is None and self.env_runner:
                 (
                     eval_results,
                     env_steps,
                     agent_steps,
                     batches,
                 ) = self._evaluate_on_local_env_runner(self.env_runner)
-            # There is only a local eval EnvRunner -> Run on that.
-            elif self.eval_env_runner_group.num_healthy_remote_workers() == 0:
+            # 0 remote eval EnvRunners configured -> Run on the local eval EnvRunner.
+            elif self.eval_env_runner_group.num_remote_env_runners() == 0:
                 (
                     eval_results,
                     env_steps,
                     agent_steps,
                     batches,
                 ) = self._evaluate_on_local_env_runner(self.eval_env_runner)
-            # There are healthy remote evaluation workers -> Run on these.
+            # Healthy remote evaluation workers -> Run on these.
             elif self.eval_env_runner_group.num_healthy_remote_workers() > 0:
+                # A successful eval iteration resets the consecutive-skip
+                # counter; this is what tells the algorithm "the failure
+                # was transient".
+                self._counters["num_consecutive_eval_no_workers_iterations"] = 0
                 # Running in automatic duration mode (parallel with training step).
                 if self.config.evaluation_duration == "auto":
                     assert parallel_train_future is not None
@@ -1509,9 +1518,25 @@ def evaluate(
                         agent_steps,
                         batches,
                     ) = self._evaluate_with_fixed_duration()
-            # Can't find a good way to run this evaluation -> Wait for next iteration.
+            # No healthy remote eval EnvRunners. Increment the consecutive-
+            # skip counter; raise if it exceeds the configured threshold,
+            # otherwise skip evaluation for this iteration.
             else:
-                eval_results = {}
+                counter_key = "num_consecutive_eval_no_workers_iterations"
+                self._counters[counter_key] += 1
+                threshold = self.config.evaluation_error_after_n_consecutive_skips
+                if threshold is not None and self._counters[counter_key] >= threshold:
+                    n_skips = self._counters[counter_key]
+                    raise RuntimeError(
+                        "All evaluation EnvRunners have been unhealthy for "
+                        f"{n_skips} consecutive evaluation iterations "
+                        f"(threshold: {threshold}). Set "
+                        "`evaluation_error_after_n_consecutive_skips` to "
+                        "None to skip indefinitely instead, or to a higher "
+                        "number for more tolerance, and/or "
+                        "`evaluation_unhealthy_workers_timeout_s` > 0 to "
+                        "wait for recovery within each iteration."
+                    )
 
             if self.config.enable_env_runner_and_connector_v2:
                 if log_once("no_eval_results") and not self.metrics.peek(
@@ -1616,6 +1641,111 @@ def _evaluate_offline_on_local_runner(self):
             key=(EVALUATION_RESULTS, OFFLINE_EVAL_RUNNER_RESULTS),
         )
 
+    def _maybe_wait_for_eval_env_runner_recovery(self) -> None:
+        """Poll for at least one healthy eval EnvRunner if the user asked to.
+
+        When *all* configured remote eval EnvRunners are unhealthy, wait up
+        to `evaluation_unhealthy_workers_timeout_s` seconds for at least one
+        to come back before deciding to skip evaluation or raise (per
+        `evaluation_error_after_n_consecutive_skips`). If any worker
+        recovers during the wait, re-syncs current weights *and*
+        connector states / observation filters to it: the corresponding
+        syncs at the start of `evaluate()` were made before the wait and
+        skipped workers that were unhealthy then.
+        """
+        timeout_s = self.config.evaluation_unhealthy_workers_timeout_s
+        if not timeout_s or timeout_s <= 0:
+            return
+        if self.eval_env_runner_group is None:
+            return
+        # Only relevant when remote workers were *configured* but are all
+        # unhealthy. `num_remote_env_runners() == 0` means the user asked
+        # for local-only eval; nothing to wait for.
+        if self.eval_env_runner_group.num_remote_env_runners() == 0:
+            return
+        if self.eval_env_runner_group.num_healthy_remote_workers() > 0:
+            return
+
+        start = time.monotonic()
+        deadline = start + timeout_s
+        # Heartbeat every 60s so long waits show up in logs without spamming.
+        next_log = start + 60.0
+        logger.warning(
+            "All %d remote eval EnvRunner(s) are unhealthy; waiting up to "
+            "%.0fs for at least one to recover before "
+            "deciding to skip evaluation or raise (controlled by "
+            "`evaluation_error_after_n_consecutive_skips`).",
+            self.eval_env_runner_group.num_remote_env_runners(),
+            timeout_s,
+        )
+        while (
+            self.eval_env_runner_group.num_healthy_remote_workers() == 0
+            and time.monotonic() < deadline
+        ):
+            # Actively ping unhealthy actors so the ActorManager can mark
+            # them healthy if Ray Core has restarted them since the last
+            # call. Without this poke, `num_healthy_remote_workers()`
+            # would stay stuck at 0 even after recovery. Cap the per-probe
+            # timeout to remaining wait time so a hanging actor can't push
+            # us past `evaluation_unhealthy_workers_timeout_s`.
+            remaining = deadline - time.monotonic()
+            if remaining <= 0:
+                break
+            self.eval_env_runner_group.probe_unhealthy_env_runners(
+                timeout_seconds=min(remaining, 1.0),
+            )
+            time.sleep(0.1)
+            now = time.monotonic()
+            if now >= next_log:
+                logger.warning(
+                    "Still 0/%d eval EnvRunners healthy after %.0fs "
+                    "(timeout %.0fs).",
+                    self.eval_env_runner_group.num_remote_env_runners(),
+                    now - start,
+                    timeout_s,
+                )
+                next_log = now + 60.0
+
+        # If any workers recovered during the wait, push current weights
+        # *and* connector/filter state to them. The sync block at the
+        # start of `evaluate()` ran before this wait and only targeted
+        # workers that were healthy *then*; freshly-recovered workers
+        # were skipped. Without re-syncing, they would run eval with
+        # default/empty model weights *and* default/empty observation
+        # filters (or stale connector states on the v2 stack) -- both
+        # silently producing wrong eval metrics for one iteration.
+        if self.eval_env_runner_group.num_healthy_remote_workers() > 0:
+            weights_src = (
+                self.learner_group
+                if self.config.enable_env_runner_and_connector_v2
+                else self.env_runner
+            )
+            self.eval_env_runner_group.sync_weights(
+                from_worker_or_learner_group=weights_src,
+                inference_only=True,
+            )
+            if self.config.enable_env_runner_and_connector_v2:
+                if self.evaluation_config.broadcast_env_runner_states:
+                    self.eval_env_runner_group.sync_env_runner_states(
+                        config=self.evaluation_config,
+                        from_worker=self.env_runner,
+                        env_steps_sampled=self.metrics.peek(
+                            (
+                                ENV_RUNNER_RESULTS,
+                                NUM_ENV_STEPS_SAMPLED_LIFETIME,
+                            ),
+                            default=0,
+                        ),
+                        env_to_module=self.env_to_module_connector,
+                        module_to_env=self.module_to_env_connector,
+                    )
+            else:
+                self._sync_filters_if_needed(
+                    central_worker=self.env_runner_group.local_env_runner,
+                    workers=self.eval_env_runner_group,
+                    config=self.evaluation_config,
+                )
+
     def _evaluate_on_local_env_runner(self, env_runner):
         if hasattr(env_runner, "input_reader") and env_runner.input_reader is None:
             raise ValueError(
 
@@ -520,6 +520,20 @@ def __init__(self, algo_class: Optional[type] = None):
         self.evaluation_auto_duration_min_env_steps_per_sample = 100
         self.evaluation_auto_duration_max_env_steps_per_sample = 2000
         self.evaluation_parallel_to_training = False
+        # How long to wait for at least one remote eval EnvRunner to recover
+        # when all *configured* remote eval EnvRunners are unhealthy at the
+        # start of an evaluation step. Default 0: don't wait.
+        self.evaluation_unhealthy_workers_timeout_s = 0.0
+        # Raise `RuntimeError` from `evaluate()` once this many consecutive
+        # evaluation iterations have been skipped because all configured
+        # remote eval EnvRunners are unhealthy. The N-th consecutive skip
+        # raises (so `1` raises on the first skip; `5` raises on the fifth,
+        # tolerating 4 prior skips). Tune escalates the error per the
+        # trial's `max_failures` setting. The counter resets to 0 whenever
+        # an evaluation step actually runs on the remote workers. `None`
+        # (default) tolerates an unbounded number of consecutive skips.
+        # Applies regardless of `evaluation_parallel_to_training`.
+        self.evaluation_error_after_n_consecutive_skips = None
         self.evaluation_force_reset_envs_before_iteration = True
         self.evaluation_config = None
         self.off_policy_estimation_methods = {}
@@ -2744,6 +2758,8 @@ def evaluation(
         evaluation_auto_duration_max_env_steps_per_sample: Optional[int] = NotProvided,
         evaluation_sample_timeout_s: Optional[float] = NotProvided,
         evaluation_parallel_to_training: Optional[bool] = NotProvided,
+        evaluation_unhealthy_workers_timeout_s: Optional[float] = NotProvided,
+        evaluation_error_after_n_consecutive_skips: Optional[int] = NotProvided,
         evaluation_force_reset_envs_before_iteration: Optional[bool] = NotProvided,
         evaluation_config: Optional[
             Union["AlgorithmConfig", PartialAlgorithmConfigDict]
@@ -2827,6 +2843,26 @@ def evaluation(
                 reports a good evaluation `episode_return_mean`, be aware that these
                 results were achieved on the weights trained in iteration 41, so you
                 should probably pick the iteration 41 checkpoint instead.
+            evaluation_unhealthy_workers_timeout_s: How long (in seconds) to
+                wait for at least one remote eval EnvRunner to recover when
+                all *configured* remote eval EnvRunners are unhealthy at the
+                start of an evaluation step. Default 0: don't wait. Pair
+                with `evaluation_error_after_n_consecutive_skips` to escalate
+                if recovery never arrives. Applies regardless of
+                `evaluation_parallel_to_training`.
+            evaluation_error_after_n_consecutive_skips: Raise
+                `RuntimeError` from `evaluate()` once this many consecutive
+                evaluation iterations have been skipped because all
+                configured remote eval EnvRunners are unhealthy. The N-th
+                consecutive skip raises: `1` raises on the first skip
+                (strict); `5` raises on the fifth, tolerating 4 prior
+                skips. Tune escalates the error per the trial's
+                `max_failures` setting. The counter resets to 0 whenever
+                an evaluation step actually runs on the remote workers.
+                `None` (default) tolerates an unbounded number of
+                consecutive skips. Has no effect if
+                `evaluation_num_env_runners=0` (in which case local eval is
+                the user's intentional choice).
             evaluation_force_reset_envs_before_iteration: Whether all environments
                 should be force-reset (even if they are not done yet) right before
                 the evaluation step of the iteration begins. Setting this to True
@@ -3000,6 +3036,14 @@ def evaluation(
             self.evaluation_sample_timeout_s = evaluation_sample_timeout_s
         if evaluation_parallel_to_training is not NotProvided:
             self.evaluation_parallel_to_training = evaluation_parallel_to_training
+        if evaluation_unhealthy_workers_timeout_s is not NotProvided:
+            self.evaluation_unhealthy_workers_timeout_s = (
+                evaluation_unhealthy_workers_timeout_s
+            )
+        if evaluation_error_after_n_consecutive_skips is not NotProvided:
+            self.evaluation_error_after_n_consecutive_skips = (
+                evaluation_error_after_n_consecutive_skips
+            )
         if evaluation_force_reset_envs_before_iteration is not NotProvided:
             self.evaluation_force_reset_envs_before_iteration = (
                 evaluation_force_reset_envs_before_iteration