diff --git a/gigaevo/llm/bandit.py b/gigaevo/llm/bandit.py
index b63fe5c0..c68071fe 100644
--- a/gigaevo/llm/bandit.py
+++ b/gigaevo/llm/bandit.py
@@ -64,9 +64,14 @@ def compute_bandit_reward(
         best_parent_fitness: Best fitness among the parent programs.
         higher_is_better: Whether higher fitness is better.
 
+    Non-finite inputs (NaN or ±inf in either argument) return ``0.0``: a NaN
+    reward would propagate into the sliding-window mean and brick UCB scoring.
+
     Returns:
         Non-negative raw reward, capped at ``exp(_MAX_IMPROVEMENT) - 1``.
     """
+    if not (math.isfinite(child_fitness) and math.isfinite(best_parent_fitness)):
+        return 0.0
     improvement = child_fitness - best_parent_fitness
     if not higher_is_better:
         improvement = -improvement
diff --git a/tests/evolution/test_bandit.py b/tests/evolution/test_bandit.py
index 85273556..e0ef770c 100644
--- a/tests/evolution/test_bandit.py
+++ b/tests/evolution/test_bandit.py
@@ -120,6 +120,29 @@ def test_reward_is_strictly_non_negative(self) -> None:
         for child, parent, hib in cases:
             assert compute_bandit_reward(child, parent, higher_is_better=hib) >= 0.0
 
+    # -- non-finite inputs must not poison the sliding-window mean --
+
+    def test_finite_inputs_unaffected_by_finite_guard(self) -> None:
+        """The finite-input fast-path is identical to pre-guard behavior."""
+        r = compute_bandit_reward(10.0, 8.0, higher_is_better=True)
+        assert r == pytest.approx(math.exp(2.0) - 1.0)
+
+    def test_nan_child_returns_neutral_reward(self) -> None:
+        """A NaN child fitness (e.g. from a crashed validity stage) must not
+        propagate into the deque — a single NaN poisons mean_reward and bricks
+        UCB exploration (all scores become NaN, ``score > best_score`` is
+        always False, the first arm in dict order is always selected)."""
+        r = compute_bandit_reward(float("nan"), 8.0, higher_is_better=True)
+        assert r == 0.0
+        assert math.isfinite(r)
+
+    def test_inf_parent_returns_neutral_reward(self) -> None:
+        """Infinite parent fitness (sentinel for unbounded objectives) must
+        not produce inf or NaN reward."""
+        r = compute_bandit_reward(10.0, float("inf"), higher_is_better=True)
+        assert r == 0.0
+        assert math.isfinite(r)
+
 
 # ---------------------------------------------------------------------------
 # RunningPercentileNormalizer