FusionBrainLab · GrigoryEvko · May 15, 2026
diff --git a/gigaevo/llm/bandit.py b/gigaevo/llm/bandit.py
@@ -64,9 +64,14 @@ def compute_bandit_reward(
         best_parent_fitness: Best fitness among the parent programs.
         higher_is_better: Whether higher fitness is better.
 
+    Non-finite inputs (NaN or ±inf in either argument) return ``0.0``: a NaN
+    reward would propagate into the sliding-window mean and make all UCB scores NaN.
+
     Returns:
         Non-negative raw reward, capped at ``exp(_MAX_IMPROVEMENT) - 1``.
     """
+    if not (math.isfinite(child_fitness) and math.isfinite(best_parent_fitness)):
+        return 0.0
     improvement = child_fitness - best_parent_fitness
     if not higher_is_better:
         improvement = -improvement

diff --git a/tests/evolution/test_bandit.py b/tests/evolution/test_bandit.py
@@ -120,6 +120,29 @@ def test_reward_is_strictly_non_negative(self) -> None:
         for child, parent, hib in cases:
             assert compute_bandit_reward(child, parent, higher_is_better=hib) >= 0.0
 
+    # -- non-finite inputs must not poison the sliding-window mean --
+
+    def test_finite_inputs_unaffected_by_finite_guard(self) -> None:
+        """Finite fitness values pass the guard and keep the pre-guard reward."""
+        reward = compute_bandit_reward(10.0, 8.0, higher_is_better=True)
+        assert pytest.approx(math.exp(2.0) - 1.0) == reward
+
+    def test_nan_child_returns_neutral_reward(self) -> None:
+        """NaN child fitness (e.g. from a crashed validity stage) yields 0.0.
+
+        One NaN in the deque poisons mean_reward: all UCB scores become NaN,
+        ``score > best_score`` never holds, and the first arm in dict order wins."""
+        reward = compute_bandit_reward(float("nan"), 8.0, higher_is_better=True)
+        assert math.isfinite(reward)
+        assert reward == 0.0
+
+    def test_inf_parent_returns_neutral_reward(self) -> None:
+        """An infinite parent fitness (sentinel for unbounded objectives) yields
+        a finite reward of 0.0 rather than inf or NaN."""
+        reward = compute_bandit_reward(10.0, float("inf"), higher_is_better=True)
+        assert math.isfinite(reward)
+        assert reward == 0.0
+
 
 # ---------------------------------------------------------------------------
 # RunningPercentileNormalizer