
Commit 1fa6085

Merge pull request #421 from Modalities/modalities_profiling
Distributed and single process profiling / tracing
2 parents bacc0b9 + fdda0fb

26 files changed: 974 additions & 25 deletions

pyproject.toml

Lines changed: 3 additions & 1 deletion
@@ -1,7 +1,7 @@
 [project]
 name = "modalities"
 version = "0.4.0"
-requires-python = ">=3.10,<3.13"
+requires-python = ">=3.10,<=3.13"
 description = "Modalities, a PyTorch-native framework for distributed and reproducible foundation model training."
 readme = "README.md"
 dependencies = [
@@ -21,8 +21,10 @@ dependencies = [
     "click_pathlib",
     "jq",
     "class_resolver",
+    "matplotlib",
     "wandb",
     "einops>=0.7.0",
+    "debugpy", # For VSCode debugging support
 ]

 [project.urls]

src/modalities/__main__.py

Lines changed: 74 additions & 0 deletions
@@ -35,6 +35,10 @@
 from modalities.utils.benchmarking.benchmarking_utils import SweepSets, get_updated_sweep_status
 from modalities.utils.benchmarking.sweep_utils import SweepGenerator
 from modalities.utils.communication_test import run_communication_test
+from modalities.utils.logger_utils import get_logger
+from modalities.utils.profilers.modalities_profiler import ModalitiesProfilerStarter
+
+logger = get_logger("__main__")


 @click.group()
@@ -680,5 +684,75 @@ def CMD_entry_point_list_remaining_runs(
         f.write(f"{cfg}\n")


+@main.group(name="profile")
+def profile():
+    """
+    Collection of utilities to profile modalities.
+    """
+    pass
+
+
+@profile.command(name="distributed")
+@click.option(
+    "--config_file_path",
+    type=click_pathlib.Path(exists=True),
+    required=True,
+    help="Path to the YAML training config file.",
+)
+@click.option(
+    "--experiment_root_path",
+    type=click_pathlib.Path(file_okay=False),
+    required=True,
+    help="Path to the experiment output directory.",
+)
+@click.option(
+    "--num_wait_steps",
+    type=int,
+    default=1,
+    show_default=True,
+    help="Number of wait steps to skip in profiling.",
+)
+@click.option(
+    "--num_warmup_steps",
+    type=int,
+    default=1,
+    show_default=True,
+    help="Number of warmup steps to skip in profiling; data is recorded but discarded.",
+)
+@click.option(
+    "--num_measurement_steps",
+    type=int,
+    default=3,
+    show_default=True,
+    help="Number of steps to measure during profiling.",
+)
+@click.option(
+    "--profiled_ranks",
+    type=str,
+    default="0",
+    help="Comma-separated list of profiled ranks (must not contain spaces), e.g. --profiled_ranks '2,4,8'",
+)
+def CMD_entry_point_run_train_step_profiler(
+    config_file_path: Path,
+    experiment_root_path: Path,
+    num_wait_steps: int,
+    num_warmup_steps: int,
+    num_measurement_steps: int,
+    profiled_ranks: str,
+):
+    """Run the train step profiler and write the result to JSON if RANK=0."""
+    profiled_ranks_list = [int(i) for i in profiled_ranks.split(",")] if profiled_ranks != "" else [0]
+    logger.info(f"Running distributed profiling on ranks {profiled_ranks_list}")
+
+    ModalitiesProfilerStarter.run_distributed(
+        config_file_path=config_file_path,
+        num_measurement_steps=num_measurement_steps,
+        num_wait_steps=num_wait_steps,
+        num_warmup_steps=num_warmup_steps,
+        experiment_root_path=experiment_root_path,
+        profiled_ranks=profiled_ranks_list,
+    )
+
+
 if __name__ == "__main__":
     main()
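
For illustration, a sketch of driving the same profiler programmatically, mirroring the CLI wiring above; the paths are placeholders, and launching under a distributed launcher such as torchrun is an assumption based on the command's purpose:

from pathlib import Path

from modalities.utils.profilers.modalities_profiler import ModalitiesProfilerStarter

# Equivalent to the `profile distributed` CLI command with default step counts.
ModalitiesProfilerStarter.run_distributed(
    config_file_path=Path("configs/train_config.yaml"),      # placeholder path
    experiment_root_path=Path("experiments/profiling_run"),  # placeholder path
    num_wait_steps=1,         # steps skipped entirely before recording starts
    num_warmup_steps=1,       # steps recorded but discarded
    num_measurement_steps=3,  # steps that enter the measurement
    profiled_ranks=[0],       # rank 0 writes the JSON result
)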

src/modalities/config/pydantic_if_types.py

Lines changed: 2 additions & 0 deletions
@@ -28,6 +28,7 @@
 from modalities.training.gradient_clipping.gradient_clipper import GradientClipperIF
 from modalities.utils.mfu import MFUCalculatorABC
 from modalities.utils.profilers.batch_generator import DatasetBatchGeneratorIF
+from modalities.utils.profilers.steppable_components import SteppableComponentIF


 class PydanticThirdPartyTypeIF:
@@ -88,3 +89,4 @@ def __get_pydantic_core_schema__(
 PydanticStagesGeneratorType = Annotated[StagesGenerator, PydanticThirdPartyTypeIF(StagesGenerator)]
 PydanticPipelineType = Annotated[Pipeline, PydanticThirdPartyTypeIF(Pipeline)]
 PydanticPipelineStageType = Annotated[PipelineStage, PydanticThirdPartyTypeIF(PipelineStage)]
+PydanticSteppableComponentIFType = Annotated[SteppableComponentIF, PydanticThirdPartyTypeIF(SteppableComponentIF)]
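
These annotated type aliases let pydantic config models accept already-instantiated components whose classes pydantic cannot validate natively. A minimal sketch of the intended usage, with a hypothetical model name and field:

from pydantic import BaseModel

from modalities.config.pydantic_if_types import PydanticSteppableComponentIFType

class ProfilerComponentsModel(BaseModel):  # hypothetical model for illustration
    # Accepts any SteppableComponentIF instance; PydanticThirdPartyTypeIF
    # supplies the core schema that checks the instance type at validation.
    steppable_component: PydanticSteppableComponentIFType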

src/modalities/models/components/layer_norms.py

Lines changed: 4 additions & 1 deletion
@@ -1,3 +1,4 @@
+import warnings
 from typing import Annotated

 import torch
@@ -10,7 +11,8 @@ class RMSLayerNorm(nn.Module):

     def __init__(self, ndim: int, bias: bool = True, epsilon: float = 1e-5):
         """
-        Initializes a LayerNorm module.
+        RMS Norm implementation.
+        WARNING: THIS IMPLEMENTATION IS DEPRECATED! USE torch.nn.RMSNorm INSTEAD FOR BETTER PERFORMANCE!
         Args:
             ndim (int): The number of dimensions of the input tensor.
             bias (bool, optional): If True, adds a learnable bias to the normalized tensor. Defaults to True.
@@ -21,6 +23,7 @@ def __init__(self, ndim: int, bias: bool = True, epsilon: float = 1e-5):
         Returns:
             None
         """
+        warnings.warn("RMSLayerNorm is deprecated. Please use torch.nn.RMSNorm for better performance.", FutureWarning)

         super().__init__()
         self.epsilon = epsilon
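
A hedged migration sketch for the deprecation above, assuming PyTorch >= 2.4 where torch.nn.RMSNorm is available:

import torch
import torch.nn as nn

from modalities.models.components.layer_norms import RMSLayerNorm

# Deprecated path: construction now emits a FutureWarning.
legacy_norm = RMSLayerNorm(ndim=768, bias=False, epsilon=1e-5)

# Recommended replacement; note the different argument names.
norm = nn.RMSNorm(normalized_shape=768, eps=1e-5)
y = norm(torch.randn(2, 16, 768))  # output shape matches the input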

src/modalities/registry/components.py

Lines changed: 16 additions & 1 deletion
@@ -82,7 +82,12 @@
 from modalities.loss_functions import CLMCrossEntropyLoss
 from modalities.models.coca.coca_model import CoCa, CoCaConfig
 from modalities.models.coca.collator import CoCaCollateFnConfig, CoCaCollatorFn
-from modalities.models.components.layer_norms import LayerNormConfig, RMSLayerNorm, RMSLayerNormConfig
+from modalities.models.components.layer_norms import (
+    LayerNormConfig,
+    PytorchRMSLayerNormConfig,
+    RMSLayerNorm,
+    RMSLayerNormConfig,
+)
 from modalities.models.gpt2.collator import GPT2LLMCollateFn
 from modalities.models.gpt2.gpt2_model import GPT2LLMConfig
 from modalities.models.huggingface.huggingface_model import HuggingFacePretrainedModel, HuggingFacePretrainedModelConfig
@@ -130,6 +135,8 @@
     NumTokensFromPackedMemMapDatasetContinuousConfig,
 )
 from modalities.utils.profilers.batch_generator import RandomDatasetBatchGenerator, RandomDatasetBatchGeneratorConfig
+from modalities.utils.profilers.steppable_component_configs import SteppableForwardPassConfig
+from modalities.utils.profilers.steppable_components import SteppableForwardPass


 @dataclass
@@ -326,6 +333,7 @@ class ComponentEntity:
     # layer norms
     ComponentEntity("layer_norm", "rms_norm", RMSLayerNorm, RMSLayerNormConfig),
     ComponentEntity("layer_norm", "layer_norm", nn.LayerNorm, LayerNormConfig),
+    ComponentEntity("layer_norm", "rms_norm_pytorch", nn.RMSNorm, PytorchRMSLayerNormConfig),
     # gradient clippers
     ComponentEntity("gradient_clipper", "fsdp1", FSDP1GradientClipper, FSDP1GradientClipperConfig),
     ComponentEntity(
@@ -416,4 +424,11 @@ class ComponentEntity:
         NumberConversion.get_num_steps_from_raw_dataset_index,
         NumStepsFromRawDatasetIndexConfig,
     ),
+    # Profiling components
+    ComponentEntity(
+        "steppable_component",
+        "forward_pass",
+        SteppableForwardPass,
+        SteppableForwardPassConfig,
+    ),
 ]
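
Each ComponentEntity maps a (component_key, variant_key) pair, as referenced from YAML configs, to a component class plus the pydantic config that validates its arguments. A hedged sketch of what the new layer norm entry resolves to at instantiation time; the constructor arguments are illustrative, and the fields of PytorchRMSLayerNormConfig are not shown in this diff:

import torch.nn as nn

# A config selecting component_key "layer_norm" with variant_key
# "rms_norm_pytorch" resolves to torch.nn.RMSNorm, with its arguments
# validated by PytorchRMSLayerNormConfig first.
norm = nn.RMSNorm(normalized_shape=1024, eps=1e-5)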

src/modalities/utils/profilers/batch_generator.py

Lines changed: 44 additions & 12 deletions
@@ -4,28 +4,60 @@
 from pydantic import BaseModel

 from modalities.batch import DatasetBatch
-
-
-class RandomDatasetBatchGeneratorConfig(BaseModel):
-    vocab_size: int
-    sequence_length: int
-    batch_size: int
+from modalities.config.lookup_enum import LookupEnum


 class DatasetBatchGeneratorIF(ABC):
     def get_dataset_batch(self) -> DatasetBatch:
         raise NotImplementedError


+class DataTypeEnum(LookupEnum):
+    float32 = torch.float32
+    bfloat16 = torch.bfloat16
+    int64 = torch.int64
+
+
+class RandomDatasetBatchGeneratorConfig(BaseModel):
+    dims: dict[str, int]
+    data_type: DataTypeEnum
+    min_val: int
+    max_val: int
+
+
 class RandomDatasetBatchGenerator(DatasetBatchGeneratorIF):
-    def __init__(self, vocab_size: int, sequence_length: int, batch_size: int):
-        self._vocab_size = vocab_size
-        self._sequence_length = sequence_length
-        self._batch_size = batch_size
+    def __init__(self, dims: dict[str, int], data_type: DataTypeEnum, min_val: int, max_val: int):
+        self._dims = dims
+        self._data_type = data_type
+        self._min_val = min_val
+        self._max_val = max_val
+        self._device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

     def get_dataset_batch(self) -> DatasetBatch:
+        size = tuple(self._dims.values())
+        if self._data_type == DataTypeEnum.int64:
+            inputs = torch.randint(low=self._min_val, high=self._max_val, size=size, device=self._device)
+            targets = torch.randint(low=self._min_val, high=self._max_val, size=size, device=self._device)
+        elif self._data_type in {DataTypeEnum.float32, DataTypeEnum.bfloat16}:
+            inputs = (
+                torch.rand(size=size, device=self._device, dtype=self._data_type.value)
+                * (self._max_val - self._min_val)
+                + self._min_val
+            )
+            targets = (
+                torch.rand(size=size, device=self._device, dtype=self._data_type.value)
+                * (self._max_val - self._min_val)
+                + self._min_val
+            )
+        else:
+            raise ValueError(f"Unsupported data type: {self._data_type}")
+
         batch = DatasetBatch(
-            samples={"input_ids": torch.randint(0, self._vocab_size, (self._batch_size, self._sequence_length))},
-            targets={"target_ids": torch.randint(0, self._vocab_size, (self._batch_size, self._sequence_length))},
+            samples={
+                "input_ids": inputs,
+            },
+            targets={
+                "target_ids": targets,
+            },
         )
         return batch
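
A short usage sketch of the generalized generator: the dim names are only labels, the tensor shape comes from the dict values in order, and the values chosen here are placeholders:

from modalities.utils.profilers.batch_generator import (
    DataTypeEnum,
    RandomDatasetBatchGenerator,
)

# Equivalent of the old (batch_size=4, sequence_length=2048, vocab_size=50000)
# interface: int64 token ids sampled uniformly from [min_val, max_val).
generator = RandomDatasetBatchGenerator(
    dims={"batch_size": 4, "sequence_length": 2048},  # -> tensor shape (4, 2048)
    data_type=DataTypeEnum.int64,
    min_val=0,
    max_val=50_000,
)
batch = generator.get_dataset_batch()
assert batch.samples["input_ids"].shape == (4, 2048)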
