Skip to content

Commit 8519d05

Browse files
committed
Improve training signal and checkpoint comparison tooling
1 parent 523b04e commit 8519d05

15 files changed

Lines changed: 894 additions & 131 deletions

scripts/compare_checkpoints.py

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
from __future__ import annotations
2+
3+
import argparse
4+
import json
5+
import sys
6+
from pathlib import Path
7+
from typing import TYPE_CHECKING
8+
9+
import numpy as np
10+
import torch
11+
12+
if TYPE_CHECKING:
13+
from engine.mcts import MCTS
14+
from game.board import AtaxxBoard
15+
16+
17+
def _ensure_src_on_path() -> None:
    """Prepend the repository's ``src`` directory to ``sys.path`` if it is absent."""
    src_dir = str(Path(__file__).resolve().parents[1] / "src")
    if src_dir not in sys.path:
        sys.path.insert(0, src_dir)
22+
23+
24+
def _parse_args() -> argparse.Namespace:
    """Define and parse the command-line options for the duel script."""
    cli = argparse.ArgumentParser(
        description="Run a short automated duel between two Ataxx checkpoints.",
    )
    add = cli.add_argument
    # Required checkpoint locations.
    add("--checkpoint-a", required=True, help="Path to checkpoint A (.pt/.ckpt).")
    add("--checkpoint-b", required=True, help="Path to checkpoint B (.pt/.ckpt).")
    # Match and search configuration.
    add("--games", type=int, default=8, help="Number of games to play.")
    add("--device", default="auto", choices=["auto", "cpu", "cuda"])
    add("--mcts-sims", "--sims", type=int, default=96)
    add("--c-puct", type=float, default=1.5)
    add("--seed", type=int, default=42)
    add("--json", action="store_true", help="Print machine-readable JSON summary.")
    return cli.parse_args()
37+
38+
39+
def _resolve_device(device: str) -> str:
    """Map a requested device string to one that is actually usable on this host."""
    cuda_ok = torch.cuda.is_available()
    if device == "auto":
        return "cuda" if cuda_ok else "cpu"
    if device == "cuda" and not cuda_ok:
        print("CUDA requested but not available; falling back to CPU.")
        return "cpu"
    return device
46+
47+
48+
def _pick_model_action_idx(board: AtaxxBoard, mcts: MCTS) -> int:
    """Greedily pick the action index with the highest MCTS visit probability."""
    visit_probs = mcts.run(board=board, add_dirichlet_noise=False, temperature=0.0)
    best_idx = np.argmax(visit_probs)
    return int(best_idx)
51+
52+
53+
def main() -> None:
    """Entry point: duel two checkpoints for a fixed schedule of games and report results."""
    args = _parse_args()
    _ensure_src_on_path()

    # Imported lazily so `src` is on sys.path before package resolution happens.
    from engine.mcts import MCTS
    from game.actions import ACTION_SPACE
    from game.board import AtaxxBoard
    from inference.checkpoint_duel_runtime import (
        build_match_schedule,
        load_system_from_checkpoint,
        summarize_match_results,
    )

    checkpoint_a = Path(args.checkpoint_a)
    checkpoint_b = Path(args.checkpoint_b)
    if not checkpoint_a.exists():
        raise FileNotFoundError(f"Checkpoint A not found: {checkpoint_a}")
    if not checkpoint_b.exists():
        raise FileNotFoundError(f"Checkpoint B not found: {checkpoint_b}")

    device = _resolve_device(args.device)
    system_a = load_system_from_checkpoint(checkpoint_a, device=device)
    system_b = load_system_from_checkpoint(checkpoint_b, device=device)
    # One search tree per checkpoint; both share the same search budget and c_puct.
    mcts_a = MCTS(model=system_a.model, c_puct=args.c_puct, n_simulations=args.mcts_sims, device=device)
    mcts_b = MCTS(model=system_b.model, c_puct=args.c_puct, n_simulations=args.mcts_sims, device=device)

    # Schedule alternates which checkpoint moves first; at least one game is played.
    schedule = build_match_schedule(games=max(1, int(args.games)))
    rng = np.random.default_rng(seed=int(args.seed))
    results: list[dict[str, int]] = []

    for idx, (checkpoint_a_player, checkpoint_b_player) in enumerate(schedule, start=1):
        board = AtaxxBoard()
        # Re-seed torch/numpy per game so each game is reproducible from --seed.
        turn_seed = int(rng.integers(0, 2**31 - 1))
        torch.manual_seed(turn_seed)
        np.random.seed(turn_seed)
        turns = 0
        while not board.is_game_over():
            turns += 1
            if board.current_player == checkpoint_a_player:
                action_idx = _pick_model_action_idx(board, mcts_a)
            elif board.current_player == checkpoint_b_player:
                action_idx = _pick_model_action_idx(board, mcts_b)
            else:
                # Should be unreachable: every side is assigned to exactly one checkpoint.
                raise RuntimeError("Unexpected player assignment while comparing checkpoints.")
            board.step(ACTION_SPACE.decode(action_idx))

        winner = board.get_result()
        results.append(
            {
                "winner": int(winner),
                "turns": turns,
                "checkpoint_a_player": checkpoint_a_player,
            },
        )
        # Per-game progress line; "p1"/"p2" records which side checkpoint A played.
        color_a = "p1" if checkpoint_a_player == 1 else "p2"
        print(
            f"[{idx}/{len(schedule)}] "
            f"checkpoint_a={color_a} winner={winner} turns={turns}",
        )

    summary = summarize_match_results(results=results)
    output: dict[str, float | int | str] = {
        **summary,
        "checkpoint_a": str(checkpoint_a),
        "checkpoint_b": str(checkpoint_b),
        "device": device,
        "mcts_sims": int(args.mcts_sims),
    }

    # Machine-readable mode: emit one JSON document and nothing else after it.
    if args.json:
        print(json.dumps(output, indent=2))
        return

    # Human-readable summary.
    print("")
    print("Summary")
    print(f" checkpoint_a: {checkpoint_a}")
    print(f" checkpoint_b: {checkpoint_b}")
    print(f" games: {summary['games']}")
    print(f" checkpoint_a_wins: {summary['checkpoint_a_wins']}")
    print(f" checkpoint_b_wins: {summary['checkpoint_b_wins']}")
    print(f" draws: {summary['draws']}")
    print(f" checkpoint_a_score: {float(summary['checkpoint_a_score']):.3f}")
    print(f" avg_turns: {float(summary['avg_turns']):.1f}")


if __name__ == "__main__":
    main()

src/game/board.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,12 @@ def get_result(self) -> int:
242242
return WIN_P2
243243
return DRAW
244244

245+
def is_forced_draw(self) -> bool:
    """Expose loop/cap draws so training can punish non-terminating play."""
    if not self.is_game_over():
        return False
    if self.half_moves >= 100:
        return True
    # A position seen three or more times counts as a repetition draw.
    most_repeats = max(self._position_counts.values(), default=0)
    return most_repeats >= 3
250+
245251
def get_canonical_form(self) -> np.ndarray:
246252
"""
247253
Current-player perspective:
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
from __future__ import annotations
2+
3+
from pathlib import Path
4+
from typing import TYPE_CHECKING, Any
5+
6+
import torch
7+
8+
if TYPE_CHECKING:
9+
from model.system import AtaxxZero
10+
11+
MatchSchedule = list[tuple[int, int]]
12+
MatchResult = dict[str, int]
13+
14+
15+
def build_match_schedule(*, games: int) -> MatchSchedule:
    """Build (player-of-A, player-of-B) pairs, alternating which side A starts on."""
    pairs: MatchSchedule = []
    for game_index in range(games):
        # Even-indexed games: A plays side 1; odd-indexed: A plays side -1.
        a_side = 1 if game_index % 2 == 0 else -1
        pairs.append((a_side, -a_side))
    return pairs
24+
25+
26+
def summarize_match_results(*, results: list[MatchResult]) -> dict[str, float | int]:
    """Aggregate per-game duel records into win/draw counts and a score for checkpoint A."""
    if not results:
        # No games played: report an all-zero summary.
        return {
            "games": 0,
            "checkpoint_a_wins": 0,
            "checkpoint_b_wins": 0,
            "draws": 0,
            "checkpoint_a_score": 0.0,
            "avg_turns": 0.0,
        }

    games = len(results)
    a_wins = 0
    b_wins = 0
    draws = 0
    turn_total = 0
    for record in results:
        outcome = int(record["winner"])
        a_side = int(record["checkpoint_a_player"])
        turn_total += int(record["turns"])
        if outcome == 0:
            draws += 1
        elif outcome == a_side:
            a_wins += 1
        else:
            b_wins += 1

    # Standard match scoring: a draw is worth half a point to each side.
    score_a = (a_wins + 0.5 * draws) / games
    return {
        "games": games,
        "checkpoint_a_wins": a_wins,
        "checkpoint_b_wins": b_wins,
        "draws": draws,
        "checkpoint_a_score": score_a,
        "avg_turns": turn_total / games,
    }
62+
63+
64+
def load_system_from_checkpoint(checkpoint_path: Path, *, device: str) -> AtaxxZero:
    """Load an AtaxxZero system from a Lightning ``.ckpt`` or raw ``.pt`` checkpoint.

    Args:
        checkpoint_path: Checkpoint file; the loader is chosen by file suffix.
        device: Torch device string the model is moved to.

    Returns:
        The restored system, in eval mode and moved to ``device``.

    Raises:
        ValueError: If a non-``.ckpt`` payload is not a dict or lacks ``state_dict``.
    """
    from model.system import AtaxxZero

    if checkpoint_path.suffix == ".ckpt":
        system = AtaxxZero.load_from_checkpoint(str(checkpoint_path), map_location=device)
    else:
        # NOTE(security): weights_only=False unpickles arbitrary objects;
        # only load checkpoints from trusted sources.
        payload = torch.load(str(checkpoint_path), map_location=device, weights_only=False)
        if not isinstance(payload, dict):
            raise ValueError("Invalid checkpoint format: expected dictionary.")
        state_dict_obj = payload.get("state_dict")
        if not isinstance(state_dict_obj, dict):
            raise ValueError("Checkpoint dictionary must contain key 'state_dict'.")

        hparams = payload.get("hparams")
        kwargs: dict[str, Any] = {}
        if isinstance(hparams, dict):
            # Forward only architecture hyper-parameters the constructor accepts.
            allowed = {"d_model", "nhead", "num_layers", "dim_feedforward", "dropout"}
            kwargs = {key: hparams[key] for key in allowed if key in hparams}

        system = AtaxxZero(**kwargs)
        system.load_state_dict(state_dict_obj)

    # Bug fix: the .ckpt branch previously returned without eval()/to(device),
    # leaving dropout active during evaluation. Normalize both paths here.
    system.eval()
    system.to(device)
    return system
88+
89+
90+
# Explicit public surface of this module.
__all__ = [
    "build_match_schedule",
    "load_system_from_checkpoint",
    "summarize_match_results",
]

src/training/bootstrap.py

Lines changed: 27 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,15 @@
88
from data.replay_buffer import TrainingExample
99
from game.actions import ACTION_SPACE
1010
from game.board import AtaxxBoard
11+
from training.config_runtime import cfg_bool
12+
from training.reward_runtime import (
13+
HistoryEntry,
14+
compute_state_potential,
15+
compute_transition_shaping_reward,
16+
history_to_examples,
17+
)
1118

1219
HeuristicLevel = Literal["easy", "normal", "hard", "apex", "gambit", "sentinel"]
13-
HistoryEntry = tuple[np.ndarray, np.ndarray, int]
1420

1521

1622
def _one_hot_policy(action_idx: int) -> np.ndarray:
@@ -19,23 +25,6 @@ def _one_hot_policy(action_idx: int) -> np.ndarray:
1925
return policy
2026

2127

22-
def history_to_examples(
23-
game_history: list[HistoryEntry],
24-
winner: int,
25-
) -> list[TrainingExample]:
26-
"""Convert per-turn history into value targets from the acting player's perspective."""
27-
examples: list[TrainingExample] = []
28-
for observation, policy, player_at_turn in game_history:
29-
if winner == 0:
30-
z = 0.0
31-
elif winner == player_at_turn:
32-
z = 1.0
33-
else:
34-
z = -1.0
35-
examples.append((observation, policy, z))
36-
return examples
37-
38-
3928
def generate_imitation_data(
4029
*,
4130
n_games: int,
@@ -60,16 +49,34 @@ def generate_imitation_data(
6049
for _ in range(n_games):
6150
board = AtaxxBoard()
6251
game_history: list[HistoryEntry] = []
52+
shaping_enabled = cfg_bool("reward_shaping_enabled")
6353

6454
while not board.is_game_over():
6555
player_at_turn = int(board.current_player)
56+
observation = board.get_observation()
6657
move = heuristic_move(board=board, rng=rng, level=heuristic_level)
6758
action_idx = ACTION_SPACE.encode(move)
6859
policy = _one_hot_policy(action_idx)
69-
game_history.append((board.get_observation(), policy, player_at_turn))
60+
shaping_reward = 0.0
61+
before_potential = 0.0
62+
if shaping_enabled:
63+
before_potential = compute_state_potential(board, player_at_turn)
7064
board.step(move)
65+
if shaping_enabled:
66+
after_potential = compute_state_potential(board, player_at_turn)
67+
shaping_reward = compute_transition_shaping_reward(
68+
before_potential=before_potential,
69+
after_potential=after_potential,
70+
)
71+
game_history.append((observation, policy, player_at_turn, shaping_reward))
7172

7273
winner = board.get_result()
73-
all_examples.extend(history_to_examples(game_history=game_history, winner=winner))
74+
all_examples.extend(
75+
history_to_examples(
76+
game_history=game_history,
77+
winner=winner,
78+
forced_draw=board.is_forced_draw(),
79+
),
80+
)
7481

7582
return all_examples

0 commit comments

Comments
 (0)