|
| 1 | +import os |
| 2 | +import time |
| 3 | + |
1 | 4 | import torch |
2 | 5 | import torch.nn as nn |
3 | 6 | from torchvision import datasets, transforms |
4 | 7 | from torch.utils.data import DataLoader |
5 | | -import os |
6 | | -import time |
7 | 8 |
|
8 | 9 | from odyssnet import OdyssNet, OdyssNetTrainer, TrainingHistory, set_seed |
9 | 10 |
|
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
SEED = 42        # Global RNG seed (passed to set_seed) for reproducibility
NUM_EPOCHS = 100
BATCH_SIZE = 32
LR = 1e-2        # Learning rate handed to OdyssNetTrainer

# Architecture
NUM_NEURONS = 10
EMBED_NEURONS = 4  # Neurons that receive patch input (first N neurons)
NUM_CLASSES = 10   # Output classes

# Patch strategy: divide 28×28 image into GRID_SIZE×GRID_SIZE non-overlapping patches
IMAGE_SIZE = 28
GRID_SIZE = 4
PATCH_SIZE = IMAGE_SIZE // GRID_SIZE  # 7 pixels per side
PATCH_PIXELS = PATCH_SIZE * PATCH_SIZE  # 49 pixels per patch (embed input dim)
NUM_PATCHES = GRID_SIZE * GRID_SIZE  # 16 patches total
THINKING_RATIO = 1  # Thinking steps per patch (1 = inject only, 2 = inject + 1 free step, ...)
THINKING_STEPS = NUM_PATCHES * THINKING_RATIO

# DataLoader
# Cap at 4 workers; `os.cpu_count()` can return None, hence the `or 1` fallback.
NUM_WORKERS = min(4, os.cpu_count() or 1)

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
19 | 37 |
|
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def get_spiral_indices(rows: int, cols: int) -> list[int]:
    """Return flat patch indices in clockwise inward spiral order."""
    order: list[int] = []
    top, bottom = 0, rows - 1
    left, right = 0, cols - 1

    # Peel one rectangular ring per iteration, shrinking the bounds inward.
    while top <= bottom and left <= right:
        # Top edge, left → right.
        order.extend(top * cols + c for c in range(left, right + 1))
        top += 1

        # Right edge, top → bottom.
        order.extend(r * cols + right for r in range(top, bottom + 1))
        right -= 1

        # Bottom edge, right → left (guard: a single remaining row was
        # already consumed by the top edge).
        if top <= bottom:
            order.extend(bottom * cols + c for c in range(right, left - 1, -1))
            bottom -= 1

        # Left edge, bottom → top (guard: a single remaining column was
        # already consumed by the right edge).
        if left <= right:
            order.extend(r * cols + left for r in range(bottom, top - 1, -1))
            left += 1

    return order
53 | 69 |
|
54 | 70 |
|
def format_time(seconds: float) -> str:
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string."""
    total = int(seconds)  # truncate fractional seconds, matching int()
    hours = total // 3600
    minutes = (total % 3600) // 60
    return f"{hours:02d}:{minutes:02d}:{total % 60:02d}"
| 75 | + |
| 76 | + |
def extract_spiral_patches(images: torch.Tensor, spiral: list[int]) -> torch.Tensor:
    """
    Extract and spiral-reorder non-overlapping patches from a batch of images.

    Args:
        images: (B, 1, H, W) image batch.
        spiral: Patch visit order (flat indices).

    Returns:
        (B, NUM_PATCHES, PATCH_PIXELS) tensor.
    """
    # unfold twice → (B, 1, GRID_SIZE, GRID_SIZE, PATCH_SIZE, PATCH_SIZE)
    tiles = images.unfold(2, PATCH_SIZE, PATCH_SIZE)
    tiles = tiles.unfold(3, PATCH_SIZE, PATCH_SIZE)
    # Flatten grid and pixel dims, then index rows into spiral visit order.
    flat = tiles.contiguous().view(images.size(0), NUM_PATCHES, PATCH_PIXELS)
    return flat[:, spiral, :]
| 93 | + |
| 94 | + |
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """Train and evaluate OdyssNet on MNIST with spiral-ordered patches.

    Each 28x28 image is cut into a GRID_SIZE x GRID_SIZE grid of patches,
    reordered edge-first/center-last (inward spiral), and fed to the model
    across THINKING_STEPS recurrent steps. Progress (loss, accuracy, ETA)
    is printed per epoch and plotted at the end.
    """
    print("OdyssNet: MNIST Record Challenge")
    print(
        f" Strategy : {NUM_PATCHES} spiral patches "
        f"({PATCH_SIZE}x{PATCH_SIZE}={PATCH_PIXELS} px) → "
        f"Embed({EMBED_NEURONS}) → Core({NUM_NEURONS}) → Decoder({NUM_CLASSES})"
    )
    set_seed(SEED)

    # GPU optimisations — only relevant on CUDA; use_compile stays False on CPU.
    use_compile = False
    if DEVICE == 'cuda':
        torch.set_float32_matmul_precision('high')
        torch.backends.cudnn.benchmark = True
        use_compile = hasattr(torch, 'compile')  # PyTorch 2.0+ guard
        if use_compile:
            print(" torch.compile enabled.")

    # Model: first EMBED_NEURONS neurons take patch input; decoder reads all.
    input_ids = list(range(EMBED_NEURONS))
    output_ids = list(range(NUM_NEURONS))

    # NOTE(review): vocab_size appears to be [input_dim, output_dim] given
    # 'continuous' vocab_mode — confirm against the OdyssNet docs.
    model = OdyssNet(
        num_neurons=NUM_NEURONS,
        input_ids=input_ids,
        output_ids=output_ids,
        device=DEVICE,
        vocab_size=[PATCH_PIXELS, NUM_CLASSES],
        vocab_mode='continuous',
        weight_init='micro_quiet_warm',
        gate='none',
    )

    if use_compile:
        model = torch.compile(model)

    total_params = model.get_num_params()
    print(f" Params : {total_params} (target: < 500)\n")

    # Data: light affine augmentation for training only; both splits are
    # normalized to roughly [-1, 1].
    train_transform = transforms.Compose([
        transforms.RandomAffine(degrees=5, translate=(0.05, 0.05), scale=(0.95, 1.05)),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ])
    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ])

    # Dataset lives in ../data relative to this script; downloaded on demand.
    data_dir = os.path.join(os.path.dirname(__file__), '..', 'data')
    train_dataset = datasets.MNIST(root=data_dir, train=True, download=True, transform=train_transform)
    test_dataset = datasets.MNIST(root=data_dir, train=False, download=True, transform=test_transform)

    # pin_memory only helps (and only applies) when copying to a CUDA device.
    loader_kwargs = dict(batch_size=BATCH_SIZE, pin_memory=(DEVICE == 'cuda'), num_workers=NUM_WORKERS)
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

    # Trainer
    trainer = OdyssNetTrainer(model, device=DEVICE, lr=LR)
    # NOTE(review): assumes the trainer reads its criterion from .loss_fn —
    # confirm OdyssNetTrainer supports overriding it after construction.
    trainer.loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)

    print(f"Training for {NUM_EPOCHS} epochs | batch {BATCH_SIZE} | lr {LR} | device {DEVICE}")

    history = TrainingHistory()
    spiral = get_spiral_indices(GRID_SIZE, GRID_SIZE)  # computed once, reused every batch
    start_time = time.time()

    for epoch in range(NUM_EPOCHS):
        # --- Train ---
        model.train()
        total_loss = 0.0

        for images, targets in train_loader:
            images = images.to(DEVICE, non_blocking=True)
            targets = targets.to(DEVICE, non_blocking=True)
            seq = extract_spiral_patches(images, spiral)
            # train_batch is assumed to return a scalar loss value per batch.
            total_loss += trainer.train_batch(seq, targets, thinking_steps=THINKING_STEPS)

        avg_loss = total_loss / len(train_loader)

        # --- Evaluate ---
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for images, targets in test_loader:
                images = images.to(DEVICE, non_blocking=True)
                targets = targets.to(DEVICE, non_blocking=True)
                seq = extract_spiral_patches(images, spiral)
                # predict is assumed to return per-class scores (B, NUM_CLASSES).
                preds = trainer.predict(seq, thinking_steps=THINKING_STEPS)
                correct += (preds.argmax(1) == targets).sum().item()
                total += targets.size(0)

        acc = 100.0 * correct / total

        # ETA from the running mean epoch time.
        elapsed = time.time() - start_time
        eta = (elapsed / (epoch + 1)) * (NUM_EPOCHS - epoch - 1)

        history.record(loss=avg_loss, accuracy=acc)
        print(
            f"Epoch {epoch+1:4d}/{NUM_EPOCHS} | "
            f"Loss {avg_loss:.4f} | "
            f"Acc {acc:5.2f}% | "
            f"Elapsed {format_time(elapsed)} | "
            f"ETA {format_time(eta)}"
        )

    history.plot(title=f"MNIST Record ({total_params} params) — {NUM_PATCHES}-patch spiral")


if __name__ == "__main__":
    main()
0 commit comments