allenai
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 1 addition & 0 deletions
diff --git a/src/olmo_core/nn/__init__.py b/src/olmo_core/nn/__init__.py
Lines changed: 26 additions & 0 deletions
diff --git a/src/olmo_core/nn/transformer/model.py b/src/olmo_core/nn/transformer/model.py
Lines changed: 15 additions & 5 deletions
diff --git a/src/olmo_core/nn/vision/__init__.py b/src/olmo_core/nn/vision/__init__.py
Lines changed: 26 additions & 0 deletions
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Added vision transformer encoder (`VisionTransformer`, `SiglipVisionTransformer`), vision-to-LM connector (`VisionConnector`), and `MultimodalTransformer` — a composite vision-language model that fuses image patch tokens into the LM token stream. Supports OpenAI CLIP, SigLIP, and SigLIP2 backbone variants with factory configs for all standard Molmo2 checkpoints.
 - Added `HFConverterCallback`, which can be used to convert models to huggingface format at the end of the training run.
 - Trainer now records checkpoint save and load durations as `train/checkpoint_save_duration_s` and `train/checkpoint_load_duration_s` metrics.
 - Added `PowerLR`, a power-law learning rate scheduler with linear warmup, power-decay phase (`lr = initial_lr * (current / warmup) ** b` for negative `b`, making the LR independent of the training horizon), and an optional linear decay tail. Registered as `"power_lr"`.
 
@@ -1,3 +1,29 @@
 """
 Common :class:`torch.nn.Module` implementations.
 """
+
+from .vision import (
+    ImagePoolingType,
+    ImageProjectorType,
+    MultimodalTransformer,
+    MultimodalTransformerConfig,
+    SiglipVisionTransformer,
+    VisionBackboneConfig,
+    VisionBackboneType,
+    VisionConnector,
+    VisionConnectorConfig,
+    VisionTransformer,
+)
+
+__all__ = [
+    "VisionBackboneType",
+    "VisionBackboneConfig",
+    "VisionTransformer",
+    "SiglipVisionTransformer",
+    "ImagePoolingType",
+    "ImageProjectorType",
+    "VisionConnectorConfig",
+    "VisionConnector",
+    "MultimodalTransformerConfig",
+    "MultimodalTransformer",
+]
@@ -506,6 +506,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         *,
+        input_embeddings: Optional[torch.Tensor] = None,
         labels: Optional[torch.Tensor] = None,
         ignore_index: int = -100,
         loss_reduction: Literal["mean", "sum", "none"] = "mean",
@@ -519,6 +520,12 @@ def forward(
         Run the transformer on the token input IDs.
 
         :param input_ids: The token input IDs, shape ``(batch_size, seq_len)``.
+        :param input_embeddings: Pre-computed embeddings to use instead of looking up
+            ``input_ids`` in the embedding table, shape
+            ``(batch_size, seq_len, d_model)``. When provided, the embedding lookup,
+            embedding scaling, and embedding norm are all skipped. Intended for multimodal
+            use cases where image features have already been spliced into the embedding
+            sequence. Not supported with context parallelism.
         :param labels: The token labels, shape ``(batch_size, seq_len)``.
         :param ignore_index: The index to ignore in the loss computation. Default is -100.
         :param loss_reduction: The reduction method for the loss. Can be "mean", "sum", or "none".
@@ -550,11 +557,14 @@ def forward(
 
         # Get embeddings but pass-through for non-existent layers to allow easy
         # pipeline parallel configuration.
-        h = self.embeddings(input_ids) if self.embeddings is not None else input_ids
-        if self.embeddings is not None and self.embed_scale is not None:
-            h = h * self.embed_scale
-        if self.embedding_norm is not None:
-            h = self.embedding_norm(h)
+        if input_embeddings is not None:
+            h = move_to_device(input_embeddings, self.device)
+        else:
+            h = self.embeddings(input_ids) if self.embeddings is not None else input_ids
+            if self.embeddings is not None and self.embed_scale is not None:
+                h = h * self.embed_scale
+            if self.embedding_norm is not None:
+                h = self.embedding_norm(h)
 
         # Run each block.
         for block_key, block in self.blocks.items():
 
@@ -0,0 +1,26 @@
+"""
+Vision encoder modules for multimodal (VLM) training.
+"""
+
+from .config import VisionBackboneConfig, VisionBackboneType
+from .connector import (
+    ImagePoolingType,
+    ImageProjectorType,
+    VisionConnector,
+    VisionConnectorConfig,
+)
+from .image_vit import SiglipVisionTransformer, VisionTransformer
+from .multimodal import MultimodalTransformer, MultimodalTransformerConfig
+
+__all__ = [
+    "VisionBackboneType",
+    "VisionBackboneConfig",
+    "VisionTransformer",
+    "SiglipVisionTransformer",
+    "ImagePoolingType",
+    "ImageProjectorType",
+    "VisionConnectorConfig",
+    "VisionConnector",
+    "MultimodalTransformerConfig",
+    "MultimodalTransformer",
+]