Merge pull request #452 from Modalities/3B_training_prep

le1nux · web-flow · commit 8db7d24e0b84 · 2026-06-19T17:17:27.000+02:00
3B training prep
diff --git a/src/modalities/checkpointing/fsdp/fsdp_checkpoint_loading.py b/src/modalities/checkpointing/fsdp/fsdp_checkpoint_loading.py
@@ -103,17 +103,18 @@ def load_optimizer_checkpoint_(self, optimizer: Optimizer, model: FSDP, file_pat
 class DCPCheckpointLoading(DistributedCheckpointLoadingIF):
     """Distributed checkpoint loading interface for loading PyTorch models and optimizer checkpoints."""
 
-    def __init__(self, global_rank: int):
+    def __init__(self, global_rank: int, allow_partial_load: bool = False):
         """
         Initializes the DCPCheckpointLoading object.
 
         Args:
             global_rank (int): The global rank of the process.
-
+            allow_partial_load (bool, optional): Whether to allow partial loading of the checkpoint. Defaults to False.
         Returns:
             None
         """
         self._global_rank = global_rank
+        self._allow_partial_load = allow_partial_load
 
     @torch.no_grad()
     def load_checkpoint_(self, app_state: AppState, checkpoint_dir_path: Path):
@@ -129,5 +130,6 @@ def load_checkpoint_(self, app_state: AppState, checkpoint_dir_path: Path):
         dcp.load(
             state_dict={"app": app_state},
             checkpoint_id=checkpoint_dir_path,
+            planner=dcp.DefaultLoadPlanner(allow_partial_load=self._allow_partial_load),
         )
         get_logger().info(f"Distributed checkpoint loaded on rank {self._global_rank}.")
diff --git a/src/modalities/checkpointing/stateful/app_state.py b/src/modalities/checkpointing/stateful/app_state.py
@@ -37,7 +37,11 @@ class AppState(Stateful):
     """
 
     def __init__(
-        self, model: nn.Module | list[nn.Module], optimizer: Optimizer, lr_scheduler: Optional[LRScheduler] = None
+        self,
+        model: nn.Module | list[nn.Module],
+        optimizer: Optimizer,
+        lr_scheduler: Optional[LRScheduler] = None,
+        components_to_load: list[StatefulComponents] | None = None,
     ):
         """Initializes the AppState object.
 
@@ -46,12 +50,29 @@ def __init__(
                 a non-sharded model, FSDP1 or FSDP2 model.
             optimizer (Optimizer): The optimizer can be either a non-sharded optimizer, FSDP1 or FSDP2 optimizer.
             lr_scheduler (Optional[LRScheduler], optional): The lr scheduler used during training. Defaults to None.
+            components_to_load (list[StatefulComponents] | None, optional): The list of components to load from the
+                checkpoint. If None, all components are loaded. Defaults to None.
         """
         self._model_parts = list(model) if isinstance(model, list) else [model]
         self._optimizer = optimizer
         self._lr_scheduler = lr_scheduler
         self._is_loaded = False
 
+        # Components to load from the checkpoint. None selects model, optimizer and (if present) the lr scheduler.
+        if components_to_load is None:
+            self._components_to_load = [StatefulComponents.MODEL, StatefulComponents.OPTIMIZER]
+            if lr_scheduler is not None:
+                self._components_to_load.append(StatefulComponents.LR_SCHEDULER)
+        else:
+            self._components_to_load = components_to_load
+
+        invalid_components = [c for c in self._components_to_load if not isinstance(c, StatefulComponents)]
+        if invalid_components:
+            raise ValueError(
+                f"components_to_load must only contain StatefulComponents, but got invalid entries: "
+                f"{invalid_components}"
+            )
+
     @property
     def is_loaded(self) -> bool:
         """Returns whether the state dict has been loaded.
@@ -106,12 +127,14 @@ def load_state_dict(self, state_dict: dict[str, Any]) -> None:
                 "Cannot call load_state_dict twice on the same AppState object. " "State dict has already been loaded."
             )
 
-        ModelStateRetriever.load_state_dict_(app_state=self, state_dict=state_dict[StatefulComponents.MODEL.value])
-        OptimizerStateRetriever.load_state_dict_(
-            app_state=self,
-            state_dict=state_dict[StatefulComponents.OPTIMIZER.value],
-        )
-        if self._lr_scheduler is not None:
+        if StatefulComponents.MODEL in self._components_to_load:
+            ModelStateRetriever.load_state_dict_(app_state=self, state_dict=state_dict[StatefulComponents.MODEL.value])
+        if StatefulComponents.OPTIMIZER in self._components_to_load:
+            OptimizerStateRetriever.load_state_dict_(
+                app_state=self,
+                state_dict=state_dict[StatefulComponents.OPTIMIZER.value],
+            )
+        if self._lr_scheduler is not None and StatefulComponents.LR_SCHEDULER in self._components_to_load:
             LRSchedulerStateRetriever.load_state_dict_(
                 app_state=self, state_dict=state_dict[StatefulComponents.LR_SCHEDULER.value]
             )
diff --git a/src/modalities/checkpointing/stateful/app_state_factory.py b/src/modalities/checkpointing/stateful/app_state_factory.py
@@ -7,15 +7,18 @@
 from torch.optim.lr_scheduler import LRScheduler
 
 from modalities.checkpointing.fsdp.fsdp_checkpoint_loading import DCPCheckpointLoading
-from modalities.checkpointing.stateful.app_state import AppState
+from modalities.checkpointing.stateful.app_state import AppState, StatefulComponents
 
 
 class AppStateFactory:
     """Factory class to create AppState objects."""
 
     @staticmethod
     def get_raw_app_state(
-        model: nn.Module | list[nn.Module], optimizer: Optimizer, lr_scheduler: Optional[LRScheduler] = None
+        model: nn.Module | list[nn.Module],
+        optimizer: Optimizer,
+        lr_scheduler: Optional[LRScheduler] = None,
+        components_to_load: list[StatefulComponents] | None = None,
     ) -> AppState:
         """Creates a new (non-checkpoint loaded) AppState object from an instantiated
         model, optimizer, and optional learning rate scheduler.
@@ -25,24 +28,35 @@ def get_raw_app_state(
                 a non-sharded model, FSDP1 or FSDP2 model.
             optimizer (Optimizer): The optimizer can be either a non-sharded optimizer, FSDP1 or FSDP2 optimizer.
             lr_scheduler (Optional[LRScheduler], optional): Lr scheduler used during training. Defaults to None.
+            components_to_load (list[StatefulComponents] | None, optional): Subset of components that should
+                be restored from a checkpoint when ``load_state_dict`` is later invoked. If None, all
+                available components are loaded. Defaults to None.
 
         Returns:
             AppState: The AppState object.
         """
-        app_state = AppState(model=model, optimizer=optimizer, lr_scheduler=lr_scheduler)
+        app_state = AppState(
+            model=model,
+            optimizer=optimizer,
+            lr_scheduler=lr_scheduler,
+            components_to_load=components_to_load,
+        )
         return app_state
 
     @staticmethod
     def get_dcp_checkpointed_app_state_(
         raw_app_state: AppState,
         checkpoint_dir_path: Path,
+        allow_partial_load: bool = False,
     ) -> AppState:
         """Loads the checkpointed state dict into the raw AppState object
         (i.e., non-checkpoint loaded AppState) in-place.
 
         Args:
-            raw_app_state (AppState): The raw AppState object.
+            raw_app_state (AppState): The raw AppState object. Its ``components_to_load`` policy
+                determines which components are restored.
             checkpoint_dir_path (Path): The path to the checkpoint directory.
+            allow_partial_load (bool, optional): Whether to allow partial loading of the checkpoint. Defaults to False.
 
         Raises:
             RuntimeError: Raises an error if the state dict has already been loaded.
@@ -52,8 +66,9 @@ def get_dcp_checkpointed_app_state_(
         """
         if raw_app_state.is_loaded:
             raise RuntimeError(
-                "Cannot call load_state_dict twice on the same AppState object. " "State dict has already been loaded."
+                "Cannot call load_state_dict twice on the same AppState object. State dict has already been loaded."
             )
-        cp_loading = DCPCheckpointLoading(global_rank=dist.get_rank())
+
+        cp_loading = DCPCheckpointLoading(global_rank=dist.get_rank(), allow_partial_load=allow_partial_load)
         cp_loading.load_checkpoint_(app_state=raw_app_state, checkpoint_dir_path=checkpoint_dir_path)
         return raw_app_state
diff --git a/src/modalities/config/config.py b/src/modalities/config/config.py
@@ -11,6 +11,7 @@
 from transformers import LlamaTokenizer as LlamaTokenizerFast
 from typing_extensions import deprecated
 
+from modalities.checkpointing.stateful.app_state import StatefulComponents
 from modalities.config.lookup_enum import LookupEnum
 from modalities.config.pydantic_if_types import (
     PydanticAppStateType,
@@ -33,6 +34,7 @@
     PydanticTokenizerIFType,
 )
 from modalities.config.utils import parse_torch_device
+from modalities.models.weight_tying import has_tied_word_embeddings
 from modalities.running_env.env_utils import (
     FSDP2MixedPrecisionSettings,
     MixedPrecisionSettings,
@@ -124,10 +126,6 @@ def parse_sharding_strategy_by_name(cls, name: str) -> ShardingStrategy:
         return parse_enum_by_name(name=name, enum_type=ShardingStrategy)
 
 
-class DCPCheckpointLoadingConfig(BaseModel):
-    global_rank: Annotated[int, Field(strict=True, ge=0)]
-
-
 class FSDP1CheckpointSavingConfig(BaseModel):
     checkpoint_path: Path
     global_rank: Annotated[int, Field(strict=True, ge=0)]
@@ -340,6 +338,13 @@ def validate_tp_mesh_existence(self) -> "GPT2ModelTPConfig":
             raise ValueError("data_parallel_replicate_degree > 1 cannot be used with Tensor Parallelism.")
         return self
 
+    @model_validator(mode="after")
+    def validate_untied_word_embeddings(self) -> "GPT2ModelTPConfig":
+        models = self.model if isinstance(self.model, list) else [self.model]
+        if any(has_tied_word_embeddings(model) for model in models):
+            raise ValueError("Tied word embeddings are not supported with Tensor Parallelism.")
+        return self
+
 
 class CompiledModelConfig(BaseModel):
     model: PydanticPytorchModuleOrListType
@@ -382,11 +387,13 @@ class RawAppStateConfig(BaseModel):
     model: PydanticPytorchModuleOrListType
     optimizer: PydanticOptimizerIFType
     lr_scheduler: Optional[PydanticLRSchedulerIFType] = None
+    components_to_load: Optional[list[StatefulComponents]] = None
 
 
 class DCPAppStateConfig(BaseModel):
     raw_app_state: PydanticAppStateType
     checkpoint_dir_path: Path
+    allow_partial_load: bool = False
 
 
 class PreTrainedHFTokenizerConfig(BaseModel):
diff --git a/src/modalities/models/gpt2/gpt2_model.py b/src/modalities/models/gpt2/gpt2_model.py
@@ -1121,6 +1121,12 @@ def __init__(
                 self.transformer.lm_head.weight
             )  # https://paperswithcode.com/method/weight-tying
 
+    @property
+    def has_tied_word_embeddings(self) -> bool:
+        token_embedding_weight = getattr(self.transformer.wte, "weight", None)
+        lm_head_weight = getattr(self.transformer.lm_head, "weight", None)
+        return token_embedding_weight is not None and token_embedding_weight is lm_head_weight
+
     @overload
     def forward(self, inputs: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
         """
diff --git a/src/modalities/models/gpt2/llama3_like_initialization.py b/src/modalities/models/gpt2/llama3_like_initialization.py
@@ -15,6 +15,7 @@
 class Llama3InitializerConfig(BaseModel):
     num_layers: Annotated[int, Field(strict=True, gt=0)]
     n_embd: Annotated[int, Field(strict=True, gt=0)]
+    use_weight_tying: bool
     depth_init: bool = True
 
 
@@ -23,7 +24,7 @@ class Llama3Initializer(ModelInitializationIF):
     Follows weight initialization distributions and parameterization for Llama3 as described in TorchTitan.
     """
 
-    def __init__(self, num_layers: int, n_embd: int, depth_init: bool) -> None:
+    def __init__(self, num_layers: int, n_embd: int, depth_init: bool, use_weight_tying: bool) -> None:
         """
         Initializes the Llama3Initializer.
         Args:
@@ -39,16 +40,6 @@ def __init__(self, num_layers: int, n_embd: int, depth_init: bool) -> None:
         self.regex_to_init = {
             # embedding weights
             r"transformer\.wte\.weight": (nn.init.normal_, {"mean": 0.0, "std": 1}),
-            # lm head weights
-            r"transformer\.lm_head\.weight": (
-                trunc_normal_,
-                {
-                    "mean": 0.0,
-                    "std": 1 / math.sqrt(n_embd),
-                    "a": -3 / math.sqrt(n_embd),
-                    "b": 3 / math.sqrt(n_embd),
-                },
-            ),
             # qkv projections
             r"transformer\.h\.\d+\.attn\.(q_attn|k_attn|v_attn)\.weight": (
                 trunc_normal_,
@@ -97,6 +88,17 @@ def __init__(self, num_layers: int, n_embd: int, depth_init: bool) -> None:
                 },
             ),
         }
+        if not use_weight_tying:
+            # lm head weights
+            self.regex_to_init[r"transformer\.lm_head\.weight"] = (
+                trunc_normal_,
+                {
+                    "mean": 0.0,
+                    "std": 1 / math.sqrt(n_embd),
+                    "a": -3 / math.sqrt(n_embd),
+                    "b": 3 / math.sqrt(n_embd),
+                },
+            )
 
     def initialize_in_place(self, model: nn.Module):
         self._init_by_fqn_regex(model, self.regex_to_init)
diff --git a/src/modalities/models/model.py b/src/modalities/models/model.py
@@ -46,6 +46,11 @@ def weight_decay_groups(self) -> WeightDecayGroups:
         """
         return self._weight_decay_groups
 
+    @property
+    def has_tied_word_embeddings(self) -> bool:
+        """Whether the model currently uses tied token embedding and output weights."""
+        return False
+
     @abstractmethod
     def forward(self, inputs: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
         """
diff --git a/src/modalities/models/parallelism/pipeline_parallelism_configs.py b/src/modalities/models/parallelism/pipeline_parallelism_configs.py
@@ -1,6 +1,6 @@
 from typing import Annotated
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator
 
 from modalities.config.pydantic_if_types import (
     PydanticDeviceMeshIFType,
@@ -11,6 +11,7 @@
     PydanticStagesGeneratorType,
 )
 from modalities.models.parallelism.pipeline_parallelism import PipelineSelectionTypes
+from modalities.models.weight_tying import has_tied_word_embeddings
 from modalities.utils.deprecated_alias import add_deprecated_alias
 
 
@@ -26,6 +27,12 @@ class StagedPipelineConfig(BaseModel):
     pp_schedule_name: str
     num_layers_per_stage: Annotated[int, Field(strict=True, ge=1)]
 
+    @model_validator(mode="after")
+    def validate_untied_word_embeddings(self) -> "StagedPipelineConfig":
+        if has_tied_word_embeddings(self.whole_model):
+            raise ValueError("Tied word embeddings are not supported with Pipeline Parallelism.")
+        return self
+
 
 class ScheduledPipelineConfig(BaseModel):
     loss_fn: PydanticLossIFType
diff --git a/src/modalities/models/weight_tying.py b/src/modalities/models/weight_tying.py
@@ -0,0 +1,11 @@
+import torch.nn as nn
+
+
+def has_tied_word_embeddings(model: nn.Module) -> bool:
+    model_has_tied_word_embeddings = getattr(model, "has_tied_word_embeddings", None)
+    if model_has_tied_word_embeddings is None:
+        raise TypeError(
+            f"{type(model).__name__} must define 'has_tied_word_embeddings' to be used with tied-embedding validation."
+        )
+
+    return bool(model_has_tied_word_embeddings)
diff --git a/src/modalities/registry/components.py b/src/modalities/registry/components.py
@@ -13,7 +13,7 @@
     SaveEveryKStepsCheckpointingStrategy,
     SaveKMostRecentCheckpointsStrategy,
 )
-from modalities.checkpointing.fsdp.fsdp_checkpoint_loading import DCPCheckpointLoading, FSDP1CheckpointLoading
+from modalities.checkpointing.fsdp.fsdp_checkpoint_loading import FSDP1CheckpointLoading
 from modalities.checkpointing.fsdp.fsdp_checkpoint_saving import DCPCheckpointSaving, FSDP1CheckpointSaving
 from modalities.checkpointing.stateful.app_state_factory import AppStateFactory
 from modalities.checkpointing.torch.torch_checkpoint_loading import TorchCheckpointLoading
@@ -29,7 +29,6 @@
     ConstantLRSchedulerConfig,
     CosineAnnealingLRSchedulerConfig,
     DCPAppStateConfig,
-    DCPCheckpointLoadingConfig,
     DCPCheckpointSavingConfig,
     DebuggingEnrichedModelConfig,
     DistributedSamplerConfig,
@@ -358,7 +357,7 @@ class ComponentEntity:
     ComponentEntity("checkpoint_saving_execution", "dcp", DCPCheckpointSaving, DCPCheckpointSavingConfig),
     # checkpoint loading
     ComponentEntity("checkpoint_loading", "fsdp1", FSDP1CheckpointLoading, FSDP1CheckpointLoadingConfig),
-    ComponentEntity("checkpoint_loading", "dcp", DCPCheckpointLoading, DCPCheckpointLoadingConfig),
+    # NOTE: the "dcp" checkpoint_loading entry was removed; see AppStateFactory.get_dcp_checkpointed_app_state_.
     ComponentEntity("checkpoint_loading", "torch", TorchCheckpointLoading, TorchCheckpointLoadingConfig),
     # Progress subscriber
     ComponentEntity(
diff --git a/tests/checkpointing/test_app_state_components_to_load.py b/tests/checkpointing/test_app_state_components_to_load.py
diff --git a/tests/test_weight_tying.py b/tests/test_weight_tying.py