Clay-foundation
diff --git a/‎README.md‎
Lines changed: 1 addition & 5 deletions b/‎README.md‎
Lines changed: 1 addition & 5 deletions
diff --git a/‎claymodel/__init__.py‎
Lines changed: 2 additions & 3 deletions b/‎claymodel/__init__.py‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎claymodel/api.py‎
Lines changed: 30 additions & 39 deletions b/‎claymodel/api.py‎
Lines changed: 30 additions & 39 deletions
diff --git a/‎claymodel/model.py‎
Lines changed: 7 additions & 28 deletions b/‎claymodel/model.py‎
Lines changed: 7 additions & 28 deletions
diff --git a/‎claymodel/module.py‎
Lines changed: 54 additions & 12 deletions b/‎claymodel/module.py‎
Lines changed: 54 additions & 12 deletions
@@ -30,13 +30,9 @@ The easiest way to install Clay Foundation Model is via `uv`:
 This will install the `claymodel` package and all its dependencies. You can then import and use it in your Python code:
 
 ```python
-from claymodel import ClayMAEModule
+from claymodel import load_model, embed
 ```
 
-If you want the `clay` CLI, install the `cli` extra:
-
-    uv pip install "claymodel[cli]"
-
 ### Development Installation
 
 For development or advanced usage, clone the repository and install with dev extras:
 
@@ -4,15 +4,14 @@
 
 from claymodel.api import EmbeddingResult, embed, load_metadata, load_model, normalize
 from claymodel.metadata import PlatformMetadata
-from claymodel.model import clay_mae_base, clay_mae_large, clay_mae_small, clay_mae_tiny
-from claymodel.module import ClayMAEModule
+from claymodel.model import Encoder, clay_mae_base, clay_mae_large, clay_mae_small, clay_mae_tiny
 
 __version__: str = version("claymodel")
 
 
 __all__ = [
-    "ClayMAEModule",
     "EmbeddingResult",
+    "Encoder",
     "PlatformMetadata",
     "clay_mae_base",
     "clay_mae_large",
 
@@ -10,7 +10,15 @@
 import torch
 
 from claymodel.metadata import PlatformMetadata, load_metadata_yaml
-from claymodel.module import ClayMAEModule
+from claymodel.model import Encoder
+from claymodel.utils import load_encoder_weights
+
+# Encoder hyperparameters keyed by model size name. `load_model` validates the
+# requested size against these keys and unpacks the matching entry into the
+# `Encoder` constructor as keyword arguments.
+_ENCODER_CONFIGS: dict[str, dict[str, int]] = {
+    "tiny": {"dim": 192, "depth": 6, "heads": 4, "dim_head": 48, "mlp_ratio": 2},
+    "small": {"dim": 384, "depth": 6, "heads": 6, "dim_head": 64, "mlp_ratio": 2},
+    "base": {"dim": 768, "depth": 12, "heads": 12, "dim_head": 64, "mlp_ratio": 4},
+    "large": {"dim": 1024, "depth": 24, "heads": 16, "dim_head": 64, "mlp_ratio": 4},
+}
 
 
 def load_metadata(
@@ -48,57 +56,40 @@ def load_model(
     size: str = "large",
     ckpt_path: str | None = None,
     device: str = "cpu",
-    metadata_path: str | Path | None = None,
-) -> ClayMAEModule:
-    """Load a Clay MAE model ready for inference.
-
-    Creates a ClayMAEModule and optionally loads weights from a checkpoint.
-    The model is returned in eval mode with mask_ratio=0 and shuffle=False
-    for deterministic inference.
+) -> Encoder:
+    """Load a Clay encoder ready for inference.
 
-    Note: The model includes a DINOv2 teacher (~300MB) that is downloaded
-    on first use. The teacher is frozen and not needed for embedding
-    extraction, but is part of the architecture.
+    Creates an Encoder and optionally loads weights from a checkpoint.
+    The encoder is returned in eval mode with mask_ratio=0 and shuffle=False
+    for deterministic inference. No teacher model is downloaded.
 
     Args:
         size: Model size - "tiny", "small", "base", or "large".
-        ckpt_path: Path to checkpoint file. If None, creates model with
+        ckpt_path: Path to checkpoint file. If None, creates encoder with
             random weights (useful for testing).
         device: Device to load model onto ("cpu", "cuda", etc.).
-        metadata_path: Path to a custom metadata YAML file. If None,
-            uses the bundled metadata with common public sensors.
 
     Returns:
-        ClayMAEModule instance in eval mode.
+        Encoder instance in eval mode.
 
     Example:
-        >>> model = load_model("large", ckpt_path="clay-v1.5.ckpt")
-        >>> model = load_model("large", metadata_path="my_sensors.yaml")
+        >>> encoder = load_model("large", ckpt_path="clay-v1.5.ckpt")
     """
-    resolved_path = (
-        str(metadata_path)
-        if metadata_path
-        else str(files("claymodel").joinpath("configs/metadata.yaml"))
+    if size not in _ENCODER_CONFIGS:
+        raise ValueError(f"Invalid size {size!r}. Expected one of {list(_ENCODER_CONFIGS.keys())}")
+
+    encoder = Encoder(
+        mask_ratio=0.0,
+        patch_size=8,
+        shuffle=False,
+        **_ENCODER_CONFIGS[size],
     )
 
     if ckpt_path is not None:
-        model = ClayMAEModule.load_from_checkpoint(
-            ckpt_path,
-            metadata_path=resolved_path,
-            map_location=device,
-        )
-    else:
-        model = ClayMAEModule(
-            model_size=size,
-            mask_ratio=0.0,
-            shuffle=False,
-            metadata_path=resolved_path,
-        )
+        load_encoder_weights(encoder, ckpt_path, device=device, freeze=False)
 
-    model.model.encoder.mask_ratio = 0.0
-    model.model.encoder.shuffle = False
-    model.eval()
-    return model.to(device)
+    encoder.eval()
+    return encoder.to(device)
 
 
 @dataclass
@@ -118,7 +109,7 @@ def shape(self) -> torch.Size:
 def embed(  # noqa: PLR0913
     input_data: torch.Tensor | np.ndarray,
     sensor: str,
-    model: ClayMAEModule | None = None,
+    model: Encoder | None = None,
     ckpt_path: str | None = None,
     device: str = "cpu",
     time: torch.Tensor | None = None,
@@ -188,7 +179,7 @@ def embed(  # noqa: PLR0913
         model = load_model(ckpt_path=ckpt_path, device=device)
 
     with torch.no_grad():
-        encoded, *_ = model.encoder(datacube)
+        encoded, *_ = model(datacube)
         cls_embeddings = encoded[:, 0, :]
 
     return EmbeddingResult(
 
@@ -10,18 +10,15 @@
 ]
 
 import math
-from typing import TYPE_CHECKING, Any, TypedDict, cast
+from typing import TYPE_CHECKING, Any, TypedDict
 
-import timm
 import torch
 import torch.nn.functional as F
 from einops import rearrange, reduce, repeat
 from torch import nn
-from torchvision.transforms import v2
 
 from claymodel.embedding import DynamicEmbedding
 from claymodel.layers import Transformer
-from claymodel.mrl import MRL, MRLLoss
 from claymodel.utils import posemb_sincos_2d_with_gsd
 
 if TYPE_CHECKING:
@@ -375,14 +372,17 @@ def forward(  # noqa: PLR0913
 
 
 class ClayMAE(nn.Module):
+    """Clay Masked Autoencoder: encoder + decoder.
+
+    Does not include the teacher model or representation loss components,
+    which live in ClayMAEModule (the training wrapper).
+    """
+
     mask_ratio: float
     patch_size: int
     norm_pix_loss: bool
     shuffle: bool
     metadata: dict[str, "PlatformMetadata"]
-    teacher: nn.Module
-    teacher_chip_size: int
-    matryoshka: bool
     encoder: Encoder
     decoder: Decoder
 
@@ -393,9 +393,6 @@ def __init__(  # noqa: PLR0913
         norm_pix_loss: bool,
         shuffle: bool,
         metadata: dict[str, "PlatformMetadata"],
-        teacher: str,
-        dolls: list[int],
-        doll_weights: list[float],
         # ENCODER
         dim: int,
         depth: int,
@@ -408,7 +405,6 @@ def __init__(  # noqa: PLR0913
         decoder_heads: int,
         decoder_dim_head: int,
         decoder_mlp_ratio: float,
-        matryoshka: bool = False,
         **kwargs: object,
     ) -> None:
         super().__init__()
@@ -417,16 +413,6 @@ def __init__(  # noqa: PLR0913
         self.norm_pix_loss = norm_pix_loss
         self.shuffle = shuffle
         self.metadata = metadata
-        self.teacher = timm.create_model(teacher, pretrained=True, num_classes=0)
-        teacher_features = cast("int", self.teacher.num_features)
-        self.teacher_chip_size = 518
-        self.teacher_resize = v2.Resize(size=(self.teacher_chip_size, self.teacher_chip_size))
-        self.matryoshka = matryoshka
-        if matryoshka:
-            self.mrl = MRL(features=teacher_features, dolls=dolls)
-            self.mrl_loss = MRLLoss(weights=doll_weights)
-        else:
-            self.proj = nn.Linear(dim, teacher_features)
 
         self.encoder = Encoder(
             mask_ratio=mask_ratio,
@@ -450,13 +436,6 @@ def __init__(  # noqa: PLR0913
             mlp_ratio=decoder_mlp_ratio,
         )
 
-        self.freeze_teacher()
-
-    def freeze_teacher(self) -> None:
-        for param in self.teacher.parameters():
-            param.requires_grad = False
-        self.teacher.eval()
-
     def per_pixel_loss(
         self, cube: torch.Tensor, pixels: torch.Tensor, masked_matrix: torch.Tensor
     ) -> torch.Tensor:
 
@@ -3,12 +3,15 @@
 import random
 from collections.abc import Mapping
 from importlib.resources import files
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, cast
 
 import lightning as L
+import timm
 import torch
 import torch.nn.functional as F
 from lightning.pytorch.utilities.types import OptimizerLRScheduler
+from torch import nn
+from torchvision.transforms import v2
 
 from claymodel.metadata import load_metadata_yaml
 from claymodel.model import (
@@ -19,6 +22,7 @@
     clay_mae_small,
     clay_mae_tiny,
 )
+from claymodel.mrl import MRL, MRLLoss
 
 if TYPE_CHECKING:
     from claymodel.metadata import PlatformMetadata
@@ -65,19 +69,57 @@ def __init__(  # noqa: PLR0913
                 "norm_pix_loss": norm_pix_loss,
                 "shuffle": shuffle,
                 "metadata": self.metadata,
-                "teacher": teacher,
-                "dolls": dolls,
-                "doll_weights": doll_weights,
-                "matryoshka": matryoshka,
             }
             self.model = model_map[model_size](**model_args)
         else:
             raise ValueError(
                 f"Invalid model size {model_size}. Expected one of {list(model_map.keys())}"
             )
 
+        # Teacher model and representation loss components (training only)
+        self.teacher = timm.create_model(teacher, pretrained=True, num_classes=0)
+        teacher_features = cast("int", self.teacher.num_features)
+        self.teacher_chip_size = 518
+        self.teacher_resize = v2.Resize(size=(self.teacher_chip_size, self.teacher_chip_size))
+        self.matryoshka = matryoshka
+        if matryoshka:
+            self.mrl = MRL(features=teacher_features, dolls=dolls)
+            self.mrl_loss = MRLLoss(weights=doll_weights)
+        else:
+            self.proj = nn.Linear(self.model.encoder.dim, teacher_features)
+
+        self._freeze_teacher()
+
+    def _freeze_teacher(self) -> None:
+        """Freeze the teacher model.
+
+        Turns off gradient tracking for every teacher parameter and switches
+        the teacher to eval mode.
+        """
+        self.teacher.requires_grad_(False)
+        self.teacher.eval()
+
+    def on_load_checkpoint(self, checkpoint: dict[str, Any]) -> None:
+        """Remap old checkpoint keys where teacher lived under model.*.
+
+        Keys that start with one of the legacy prefixes (``model.teacher.``,
+        ``model.proj.``, ``model.mrl.``, ``model.mrl_loss.``,
+        ``model.teacher_resize.``) are renamed by dropping the leading
+        ``model.`` segment. All other keys are left untouched.
+
+        Args:
+            checkpoint: Checkpoint dictionary. Its ``"state_dict"`` entry is
+                rewritten in place; when that entry is missing, ``checkpoint``
+                itself is treated as the state dict.
+        """
+        state_dict = checkpoint.get("state_dict", checkpoint)
+        legacy_prefixes = (
+            "model.teacher.",
+            "model.proj.",
+            "model.mrl.",
+            "model.mrl_loss.",
+            "model.teacher_resize.",
+        )
+        # Snapshot the matching keys first so the dict is never mutated while
+        # it is being iterated. `str.startswith` accepts a tuple of prefixes.
+        legacy_keys = [key for key in state_dict if key.startswith(legacy_prefixes)]
+        for key in legacy_keys:
+            # Strip only the leading "model." so nested occurrences survive,
+            # e.g. "model.mrl.model.x" -> "mrl.model.x".
+            state_dict[key.removeprefix("model.")] = state_dict.pop(key)
+
     def on_train_epoch_start(self) -> None:
-        self.model.teacher.eval()
+        self.teacher.eval()
 
     @property
     def encoder(self) -> Encoder:
@@ -165,19 +207,19 @@ def _teacher_target(
             else:
                 indices = self.metadata[platform].rgb_indices
                 rgb = pixels[:, indices, :, :]
-            rgb = self.model.teacher_resize(rgb)
-            return self.model.teacher(rgb)
+            rgb = self.teacher_resize(rgb)
+            return self.teacher(rgb)
 
     def _representation_loss(
         self,
         cls_token: torch.Tensor,
         target: torch.Tensor,
     ) -> torch.Tensor:
         """Compute representation loss (proj or MRL)."""
-        if self.model.matryoshka:
-            representations = self.model.mrl(cls_token)
-            return self.model.mrl_loss(representations, target)
-        representations = self.model.proj(cls_token)
+        if self.matryoshka:
+            representations = self.mrl(cls_token)
+            return self.mrl_loss(representations, target)
+        representations = self.proj(cls_token)
         return 1.0 - F.cosine_similarity(representations, target).mean()
 
     def _log_losses(