Fix deterministic MCTS tie-breaking

Dieg0Code · Dieg0Code · commit 9c8a8cd78631 · 2026-03-08T15:52:46.000-03:00
diff --git a/src/engine/mcts.py b/src/engine/mcts.py
@@ -56,6 +56,13 @@ def __init__(
         self._cache_hits = 0
         self._cache_misses = 0
 
+    @staticmethod
+    def _sample_tied_index(candidate_indices: np.ndarray) -> int:
+        if candidate_indices.size == 1:
+            return int(candidate_indices[0])
+        picked = int(np.random.randint(0, candidate_indices.size))
+        return int(candidate_indices[picked])
+
     def run(
         self,
         board: AtaxxBoard,
@@ -232,24 +239,31 @@ def _add_dirichlet_noise(self, node: MCTSNode, alpha: float, frac: float) -> Non
             child.prior = (1.0 - frac) * child.prior + frac * float(noise[idx])
 
     def _select_child(self, node: MCTSNode) -> tuple[int, MCTSNode]:
-        best_action = -1
-        best_child: MCTSNode | None = None
         best_score = -float("inf")
+        tied_actions: list[int] = []
+        tied_children: list[MCTSNode] = []
         sqrt_parent = math.sqrt(node.visit_count + 1)
 
         for action_idx, child in node.children.items():
             # child.value() is from child-player perspective; negate for parent.
             q_value = -child.value()
             u_value = self.c_puct * child.prior * sqrt_parent / (1 + child.visit_count)
             score = q_value + u_value
-            if score > best_score:
+            # Early training often produces flat priors/value estimates. If we always
+            # keep the first child on (near-)exact ties, search collapses into one opening.
+            if score > (best_score + 1e-12):
                 best_score = score
-                best_action = action_idx
-                best_child = child
+                tied_actions = [action_idx]
+                tied_children = [child]
+                continue
+            if math.isclose(score, best_score, rel_tol=0.0, abs_tol=1e-12):
+                tied_actions.append(action_idx)
+                tied_children.append(child)
 
-        if best_child is None:
+        if len(tied_children) == 0:
             raise RuntimeError("No child selected from a non-empty node.")
-        return best_action, best_child
+        picked = self._sample_tied_index(np.arange(len(tied_children), dtype=np.int64))
+        return tied_actions[picked], tied_children[picked]
 
     def _expand(self, node: MCTSNode, board: AtaxxBoard) -> float:
         """
@@ -281,8 +295,10 @@ def _get_action_probs(self, root: MCTSNode, temperature: float) -> np.ndarray:
         )
 
         if temperature <= 0.0:
-            best_idx = int(np.argmax(visit_counts))
-            probs[int(actions[best_idx])] = 1.0
+            max_visits = float(np.max(visit_counts))
+            best_indices = np.flatnonzero(visit_counts == max_visits)
+            chosen = self._sample_tied_index(best_indices)
+            probs[int(actions[chosen])] = 1.0
             return probs
 
         adjusted = np.power(visit_counts, 1.0 / temperature)
diff --git a/tests/test_mcts_numerics.py b/tests/test_mcts_numerics.py
@@ -10,7 +10,7 @@
 
 sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "src"))
 
-from engine.mcts import MCTS
+from engine.mcts import MCTS, MCTSNode
 from game.actions import ACTION_SPACE
 from game.board import AtaxxBoard
 from model.transformer import AtaxxTransformerNet
@@ -217,6 +217,60 @@ def forward(
         self.assertGreaterEqual(float(stats_second["hit_rate"]), 0.0)
         self.assertLessEqual(float(stats_second["hit_rate"]), 1.0)
 
+    def test_select_child_does_not_always_pick_first_on_exact_tie(self) -> None:
+        model = AtaxxTransformerNet(
+            d_model=64,
+            nhead=8,
+            num_layers=2,
+            dim_feedforward=128,
+            dropout=0.0,
+        )
+        mcts = MCTS(model=model, c_puct=1.5, n_simulations=1, device="cpu")
+        root = MCTSNode(prior=1.0)
+        root.visit_count = 4
+        root.children = {
+            11: MCTSNode(prior=0.5),
+            23: MCTSNode(prior=0.5),
+        }
+        chosen_actions: set[int] = set()
+
+        for seed in range(32):
+            np.random.seed(seed)
+            action_idx, _child = mcts._select_child(root)
+            chosen_actions.add(action_idx)
+
+        self.assertEqual(chosen_actions, {11, 23})
+
+    def test_temperature_zero_breaks_visit_ties_without_fixed_first_action(self) -> None:
+        class UniformModel(nn.Module):
+            def forward(
+                self,
+                board_tensor: torch.Tensor,
+                action_mask: torch.Tensor | None = None,
+            ) -> tuple[torch.Tensor, torch.Tensor]:
+                batch = board_tensor.shape[0]
+                logits = torch.zeros((batch, ACTION_SPACE.num_actions), dtype=torch.float32)
+                value = torch.zeros((batch, 1), dtype=torch.float32)
+                if action_mask is not None:
+                    logits = logits.masked_fill(action_mask <= 0, -1e9)
+                return logits, value
+
+        board = AtaxxBoard()
+        chosen_actions: set[int] = set()
+        for seed in range(32):
+            np.random.seed(seed)
+            mcts = MCTS(
+                model=UniformModel(),
+                c_puct=1.5,
+                n_simulations=0,
+                device="cpu",
+                cache_size=0,
+            )
+            probs = mcts.run(board=board, add_dirichlet_noise=False, temperature=0.0)
+            chosen_actions.add(int(np.argmax(probs)))
+
+        self.assertGreater(len(chosen_actions), 1)
+
 
 if __name__ == "__main__":
     unittest.main()