Skip to content

Commit 523b04e

Browse files
committed
Improve training pipeline and local policy checks
1 parent 5728edf commit 523b04e

28 files changed

Lines changed: 1242 additions & 332 deletions

README.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,8 @@ Main entrypoint is now:
5656
uv run python train.py
5757
```
5858

59-
`train_improved.py` is kept as compatibility wrapper.
59+
`train_improved.py` is kept as a notebook compatibility wrapper and re-exports
60+
`CONFIG`, `parse_args`, `apply_cli_overrides`, `validate_config`, and `main`.
6061

6162
## Entrenamiento remoto (RunPod + Pulumi)
6263

@@ -204,7 +205,7 @@ uv run python train.py --no-onnx --quiet --keep-local-ckpts 2 --keep-log-version
204205
Kaggle 2x T4 (use both GPUs):
205206

206207
```bash
207-
uv run python train.py --no-onnx --quiet --devices 2 --strategy ddp --keep-local-ckpts 2 --keep-log-versions 1 --hf --hf-repo-id your_user/ataxx-zero --iterations 40 --episodes 70 --sims 420 --epochs 5 --batch-size 96 --lr 9e-4 --weight-decay 1e-4 --save-every 3
208+
uv run python train.py --no-onnx --quiet --devices 2 --strategy ddp_spawn --precision 16-mixed --num-workers 2 --persistent-workers --keep-local-ckpts 256 --keep-log-versions 2 --hf --hf-repo-id your_user/ataxx-zero --hf-run-id policy_spatial_v3 --hf-bootstrap-run-id policy_spatial_v2 --hf-reset-iteration --iterations 220 --episodes 20 --sims 160 --epochs 2 --batch-size 224 --lr 3e-4 --weight-decay 1e-4 --save-every 1 --warmup-games 240 --warmup-epochs 3 --warmup-heuristic-levels hard,apex,sentinel --eval-every 6 --eval-games 12 --eval-sims 160 --eval-heuristic-levels hard,apex,sentinel --restore-best-on-regression --eval-regression-delta 0.03 --eval-regression-patience 2 --allow-selfplay-fallback --max-pending-hf-uploads 6 --hf-upload-timeout-s 900
208209
```
209210

210211
Kaggle estable con `opponent pool` (recomendado):
@@ -216,9 +217,12 @@ uv run python train.py --no-onnx --quiet --devices 1 --strategy auto --keep-loca
216217
Kaggle estable + evaluacion automatica + best checkpoint:
217218

218219
```bash
219-
uv run python train.py --no-onnx --quiet --devices 1 --strategy auto --num-workers 3 --persistent-workers --keep-local-ckpts 2 --keep-log-versions 1 --hf --hf-repo-id your_user/ataxx-zero --iterations 40 --episodes 70 --sims 420 --epochs 5 --batch-size 96 --lr 9e-4 --weight-decay 1e-4 --save-every 3 --strict-probs --eval-every 3 --eval-games 12 --eval-sims 220 --eval-heuristic-level hard --opp-self 0.85 --opp-heuristic 0.12 --opp-random 0.03 --opp-heu-easy 0.05 --opp-heu-normal 0.20 --opp-heu-hard 0.75 --model-swap-prob 0.5
220+
uv run python train.py --no-onnx --quiet --devices 1 --strategy auto --num-workers 2 --persistent-workers --keep-local-ckpts 256 --keep-log-versions 2 --hf --hf-repo-id your_user/ataxx-zero --hf-run-id policy_spatial_v3 --hf-bootstrap-run-id policy_spatial_v2 --hf-reset-iteration --iterations 160 --episodes 16 --sims 128 --epochs 2 --batch-size 192 --lr 3e-4 --weight-decay 1e-4 --save-every 1 --warmup-games 180 --warmup-epochs 2 --warmup-heuristic-levels hard,apex,sentinel --eval-every 6 --eval-games 12 --eval-sims 128 --eval-heuristic-levels hard,apex,sentinel --restore-best-on-regression --eval-regression-delta 0.03 --eval-regression-patience 2 --allow-selfplay-fallback --max-pending-hf-uploads 6 --hf-upload-timeout-s 900
220221
```
221222

223+
Si Kaggle te asigna una `P100`, el trainer cae automáticamente a CPU para evitar el
224+
crash de compatibilidad `sm_60`; para mixed precision y DDP real necesitas `T4 x2`.
225+
222226
If your environment is missing ONNX tooling, use:
223227

224228
```bash

scripts/bootstrap_model_bot.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ async def _ensure_model_bot(args: argparse.Namespace) -> None:
3030
from api.db.enums import AgentType, BotKind
3131
from api.db.models import BotProfile, ModelVersion, User
3232
from api.db.session import get_engine, get_sessionmaker
33+
from api.modules.model_versions.repository import ModelVersionRepository
34+
from api.modules.model_versions.service import ModelVersionService
3335
from api.modules.ranking.repository import RankingRepository
3436
from api.modules.ranking.service import RankingService
3537

@@ -64,18 +66,12 @@ async def _ensure_model_bot(args: argparse.Namespace) -> None:
6466
await session.refresh(version)
6567

6668
if args.activate_version:
67-
await session.execute(
68-
# Keep one global active version when requested explicitly.
69-
ModelVersion.__table__.update()
70-
.where(col(ModelVersion.id) != version.id)
71-
.values(is_active=False)
69+
# Reuse the repository/service flow so activation semantics stay
70+
# consistent with the API and type-check cleanly.
71+
version_service = ModelVersionService(
72+
repository=ModelVersionRepository(session=session)
7273
)
73-
await session.execute(
74-
ModelVersion.__table__.update()
75-
.where(col(ModelVersion.id) == version.id)
76-
.values(is_active=True)
77-
)
78-
await session.commit()
74+
version = await version_service.activate_model_version(version.id)
7975

8076
user_stmt = select(User).where(col(User.username) == args.username)
8177
user = (await session.execute(user_stmt)).scalars().first()

src/data/dataset.py

Lines changed: 68 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
from collections import deque
4+
from typing import TYPE_CHECKING
45

56
import numpy as np
67
import torch
@@ -14,6 +15,9 @@
1415
_N_TRANSFORMS = 8
1516
_POLICY_INDEX_MAPS: np.ndarray | None = None
1617

18+
if TYPE_CHECKING:
19+
from data.replay_buffer import TrainingExample
20+
1721

1822
def _rotate_coord_ccw(r: int, c: int, k: int, size: int) -> tuple[int, int]:
1923
rr, cc = r, c
@@ -98,25 +102,65 @@ def _augment_policy(policy: np.ndarray, transform_id: int) -> np.ndarray:
98102
return pi_aug
99103

100104

105+
def split_train_val_examples(
    *,
    all_examples: list[TrainingExample],
    val_split: float,
    shuffle: bool,
    seed: int,
) -> tuple[list[TrainingExample], list[TrainingExample]]:
    """Partition examples into disjoint train/validation lists.

    ``val_split`` is the fraction of ``all_examples`` held out for validation
    (truncated toward zero). Without shuffling the split is a simple suffix
    hold-out; with shuffling, validation indices are drawn without replacement
    using a seeded generator so the split is reproducible. Both returned lists
    preserve the chronological order of the input.
    """
    total = len(all_examples)
    if total == 0:
        return [], []
    val_count = min(max(0, int(total * val_split)), total)
    train_count = total - val_count
    if val_count == 0:
        # Nothing held out: everything trains, validation is empty.
        return list(all_examples), []
    if not shuffle:
        # Deterministic suffix hold-out keeps the newest samples in val.
        return list(all_examples[:train_count]), list(all_examples[train_count:])

    rng = np.random.default_rng(seed=seed)
    chosen = np.sort(rng.choice(total, size=val_count, replace=False))
    chosen_set = {int(i) for i in chosen.tolist()}
    # Keep the train set in chronological order so "recent" remains meaningful.
    train_examples = [ex for idx, ex in enumerate(all_examples) if idx not in chosen_set]
    val_examples = [all_examples[int(idx)] for idx in chosen]
    return train_examples, val_examples
132+
133+
101134
class AtaxxDataset(Dataset[tuple[torch.Tensor, torch.Tensor, torch.Tensor]]):
102135
"""Dataset wrapper from replay buffer examples."""
103136

104137
def __init__(
105138
self,
106-
buffer: ReplayBuffer,
139+
buffer: ReplayBuffer | None = None,
107140
augment: bool = True,
108141
reference_buffer: bool = False,
109142
val_split: float = 0.1,
143+
examples: list[TrainingExample] | None = None,
110144
) -> None:
111145
self.augment = augment
112146
self.examples: list[tuple[np.ndarray, np.ndarray, float]] | deque[
113147
tuple[np.ndarray, np.ndarray, float]
114148
]
149+
if examples is not None:
150+
self.examples = list(examples)
151+
return
152+
if buffer is None:
153+
self.examples = []
154+
return
155+
115156
raw_examples = list(buffer.buffer) if reference_buffer else buffer.get_all()
116-
n_val = int(len(raw_examples) * val_split)
117-
n_train = len(raw_examples) - n_val
118-
# Keep train/validation disjoint so val loss is a true hold-out metric.
119-
self.examples = raw_examples[:n_train]
157+
train_examples, _ = split_train_val_examples(
158+
all_examples=raw_examples,
159+
val_split=val_split,
160+
shuffle=False,
161+
seed=0,
162+
)
163+
self.examples = train_examples
120164

121165
def __len__(self) -> int:
122166
return len(self.examples)
@@ -140,11 +184,26 @@ def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor, torch.Ten
140184
class ValidationDataset(Dataset[tuple[torch.Tensor, torch.Tensor, torch.Tensor]]):
141185
"""Hold-out validation split from replay buffer."""
142186

143-
def __init__(self, buffer: ReplayBuffer, split: float = 0.1) -> None:
187+
def __init__(
188+
self,
189+
buffer: ReplayBuffer | None = None,
190+
split: float = 0.1,
191+
examples: list[TrainingExample] | None = None,
192+
) -> None:
193+
if examples is not None:
194+
self.examples = list(examples)
195+
return
196+
if buffer is None:
197+
self.examples = []
198+
return
144199
all_examples = buffer.get_all()
145-
n_val = int(len(all_examples) * split)
146-
n_train = len(all_examples) - n_val
147-
self.examples = all_examples[n_train:] if n_val > 0 else []
200+
_, val_examples = split_train_val_examples(
201+
all_examples=all_examples,
202+
val_split=split,
203+
shuffle=False,
204+
seed=0,
205+
)
206+
self.examples = val_examples
148207

149208
def __len__(self) -> int:
150209
return len(self.examples)

src/data/replay_buffer.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,47 @@
1111
TrainingExample = tuple[Observation, PolicyTarget, float]
1212

1313

14+
def sample_recent_mix(
    examples: list[TrainingExample],
    *,
    recent_fraction: float,
    recent_window_fraction: float,
    seed: int | None = None,
    sample_size: int | None = None,
) -> list[TrainingExample]:
    """Draw a training batch biased toward the newest examples.

    By default the result has the same size as ``examples`` and mixes two
    pools:

    - ``recent_fraction`` of the draws come from the trailing
      ``recent_window_fraction`` slice of ``examples``,
    - the remaining draws come from the full pool.

    Sampling is with replacement in both pools, and the combined result is
    shuffled before returning. An empty input yields an empty list.
    """
    if not examples:
        return []

    pool_size = len(examples)
    if sample_size is None:
        draw_total = pool_size
    else:
        draw_total = max(1, min(int(sample_size), pool_size))

    # The "recent" window is the trailing slice; never smaller than one item.
    window_len = max(1, round(pool_size * recent_window_fraction))
    window = examples[-window_len:]
    from_recent = min(draw_total, max(0, round(draw_total * recent_fraction)))
    from_global = draw_total - from_recent

    rng = np.random.default_rng(seed=seed)
    chosen: list[TrainingExample] = []
    if from_recent > 0:
        recent_idx = rng.integers(0, len(window), size=from_recent, endpoint=False)
        chosen.extend(window[int(i)] for i in recent_idx)
    if from_global > 0:
        global_idx = rng.integers(0, pool_size, size=from_global, endpoint=False)
        chosen.extend(examples[int(i)] for i in global_idx)
    if len(chosen) > 1:
        # Shuffle so recent and global draws are interleaved for training.
        perm = rng.permutation(len(chosen))
        chosen = [chosen[int(i)] for i in perm]
    return chosen
53+
54+
1455
class ReplayBuffer:
1556
"""FIFO replay buffer for self-play training examples."""
1657

src/training/bootstrap.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@
44

55
import numpy as np
66

7-
from agents.heuristic import heuristic_move
7+
from agents.heuristic import heuristic_move, is_supported_heuristic_level
88
from data.replay_buffer import TrainingExample
99
from game.actions import ACTION_SPACE
1010
from game.board import AtaxxBoard
1111

12-
HeuristicLevel = Literal["easy", "normal", "hard"]
12+
HeuristicLevel = Literal["easy", "normal", "hard", "apex", "gambit", "sentinel"]
1313
HistoryEntry = tuple[np.ndarray, np.ndarray, int]
1414

1515

@@ -51,6 +51,8 @@ def generate_imitation_data(
5151
"""
5252
if n_games <= 0:
5353
return []
54+
if not is_supported_heuristic_level(heuristic_level):
55+
raise ValueError(f"Unsupported heuristic level for warmup: {heuristic_level}")
5456

5557
rng = np.random.default_rng(seed=seed)
5658
all_examples: list[TrainingExample] = []

0 commit comments

Comments
 (0)