NVIDIA
diff --git a/‎recipes/vit/README.md‎
Lines changed: 61 additions & 0 deletions b/‎recipes/vit/README.md‎
Lines changed: 61 additions & 0 deletions
diff --git a/‎recipes/vit/checkpoint.py‎
Lines changed: 163 additions & 0 deletions b/‎recipes/vit/checkpoint.py‎
Lines changed: 163 additions & 0 deletions
diff --git a/‎recipes/vit/config/defaults.yaml‎
Lines changed: 4 additions & 0 deletions b/‎recipes/vit/config/defaults.yaml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎recipes/vit/config/vit_base_patch16_224.yaml‎
Lines changed: 4 additions & 0 deletions b/‎recipes/vit/config/vit_base_patch16_224.yaml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎recipes/vit/config/vit_te_base_patch16_224.yaml‎
Lines changed: 4 additions & 0 deletions b/‎recipes/vit/config/vit_te_base_patch16_224.yaml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎recipes/vit/distributed.py‎
Lines changed: 77 additions & 0 deletions b/‎recipes/vit/distributed.py‎
Lines changed: 77 additions & 0 deletions
diff --git a/‎recipes/vit/imagenet_dataset.py‎
Lines changed: 2 additions & 2 deletions b/‎recipes/vit/imagenet_dataset.py‎
Lines changed: 2 additions & 2 deletions
@@ -45,3 +45,64 @@ which will train on a local tiny 5-class version of [ImageNet](https://image-net
 The TIMM-derived model code for the ViT can be found in [`vit.py`](vit.py), and data utilities for ImageNet can be found in [`imagenet_*.py`](imagenet_dataset.py).
 
 Various configuration options common in computer vision modeling can be found in [config](./config/).
+
+#### Checkpoint Conversion
+
+To convert DCP checkpoints to non-distributed Torch checkpoints, and vice-versa, you can use the conversion utility that ships with `torch`:
+
+```
+python -m torch.distributed.checkpoint.format_utils --help
+usage: format_utils.py [-h] {torch_to_dcp,dcp_to_torch} src dst
+
+positional arguments:
+  {torch_to_dcp,dcp_to_torch}
+                        Conversion mode
+  src                   Path to the source model
+  dst                   Path to the destination model
+
+options:
+  -h, --help            show this help message and exit
+```
+
+For example:
+
+```
+python -m torch.distributed.checkpoint.format_utils dcp_to_torch step_75_loss_1.725 torch_ckpt_test.pt
+```
+
+or:
+
+```
+from torch.distributed.checkpoint.format_utils import dcp_to_torch_save, torch_save_to_dcp
+
+# Convert DCP model checkpoint to torch.save format.
+dcp_to_torch_save(CHECKPOINT_DIR, TORCH_SAVE_CHECKPOINT_PATH)
+
+# Convert torch.save model checkpoint back to DCP format.
+torch_save_to_dcp(TORCH_SAVE_CHECKPOINT_PATH, f"{CHECKPOINT_DIR}_new")
+```
+
+_Note that `torch.save`-converted Megatron-FSDP distributed checkpoints (DCP) cannot be loaded directly into `MegatronFSDP` module classes, because Megatron-FSDP expects a deterministic unevenly sharded checkpoint when loading using DCP. To load a non-distributed checkpoint for training with Megatron-FSDP, simply load the checkpoint into the unsharded model before calling `fully_shard`!_
+
+```python
+# Initialize model.
+model = build_vit_model(cfg, device_mesh)
+
+# Load model checkpoint. Remove the leading "module." prefix from the keys from Megatron-FSDP,
+# which is the main discrepancy between Megatron-FSDP and normal checkpoints.
+# NOTE: Do not use k.strip("module.") here: str.strip() removes any of the characters
+# "m", "o", "d", "u", "l", "e", "." from BOTH ends of the key instead of removing the prefix.
+# Must load with weights_only=False if you have an optimizer state in your checkpoint.
+# NOTE(@cspades): `from checkpoint import load_torch_checkpoint`
+# -> load_torch_checkpoint(megatron_fsdp=True)
+model_checkpoint = {
+    (k[len("module."):] if megatron_fsdp and k.startswith("module.") else k): v
+    for k, v in torch.load(checkpoint_path, weights_only=False)["model"].items()
+}
+# Load with strict=False because the checkpoint may have TE-specific keys that are not
+# necessary for inference.
+model.load_state_dict(model_checkpoint, strict=False)
+
+# Fully-shard.
+model = fully_shard_model(...)
+```
+
+TODO(@cspades): For converting DCP directly to HuggingFace SafeTensors checkpoints, you can look into: https://pytorch.org/blog/huggingface-safetensors-support-in-pytorch-distributed-checkpointing/
@@ -0,0 +1,163 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+from pathlib import Path
+
+import torch
+import torch.distributed.checkpoint
+
+
+_logger = logging.getLogger(__name__)
+
+
+def load_torch_checkpoint(model, checkpoint_path, megatron_fsdp=False):
+    """Load a non-distributed Torch checkpoint from checkpoint_path into an unsharded model.
+
+    Used for loading existing TIMM or Torch checkpoints into a freshly initialized
+    model prior to sharding with Megatron-FSDP.
+
+    If the checkpoint was created from a Megatron-FSDP DCP checkpoint, then setting
+    megatron_fsdp=True is required and strips a leading "module." prefix from the keys.
+
+    Args:
+        model: Unsharded model whose load_state_dict() receives the checkpoint weights.
+        checkpoint_path: Path passed to torch.load(). The loaded object must hold the
+            model state dictionary under the "model" key.
+        megatron_fsdp: If True, remove a leading "module." prefix from every key.
+
+    Docs: https://docs.pytorch.org/tutorials/beginner/saving_loading_models.html
+    """
+    prefix = "module."
+    # Must load with weights_only=False if you have an optimizer state in your checkpoint.
+    # NOTE: weights_only=False unpickles arbitrary objects, so only load trusted checkpoints.
+    raw_state_dict = torch.load(checkpoint_path, weights_only=False)["model"]
+    # Warn about Megatron-FSDP checkpoints. The default of None keeps an empty
+    # state dictionary from raising StopIteration here.
+    first_key = next(iter(raw_state_dict), None)
+    if first_key is not None and first_key.startswith(prefix) and not megatron_fsdp:
+        _logger.warning(
+            f"Checkpoint state dictionary keys ({first_key}) may be prefixed "
+            "with 'module.' if converted from a Megatron-FSDP DCP checkpoint. "
+            "Set megatron_fsdp=True to automatically strip the prefix."
+        )
+    # Remove only the leading "module." prefix from the keys from Megatron-FSDP, which is
+    # the main discrepancy between Megatron-FSDP and normal checkpoints. str.strip("module.")
+    # is NOT suitable: it removes any of those characters from both ends of the key.
+    model_checkpoint = {
+        (k[len(prefix):] if megatron_fsdp and k.startswith(prefix) else k): v
+        for k, v in raw_state_dict.items()
+    }
+    # Load with strict=False because the checkpoint may have
+    # TE-specific keys that are not necessary for inference.
+    model.load_state_dict(model_checkpoint, strict=False)
+
+
+def load_dcp_checkpoint(checkpoint_path, model=None, optimizer=None):
+    """Load a Torch DCP checkpoint from checkpoint_path into model and/or optimizer.
+
+    Args:
+        checkpoint_path: Checkpoint identifier passed to torch.distributed.checkpoint.load()
+            as checkpoint_id.
+        model: Optional model. When given, its state dictionary is loaded under the "model" key.
+        optimizer: Optional optimizer. When given, its state dictionary is loaded under
+            the "optimizer" key.
+
+    Docs: https://docs.pytorch.org/docs/stable/distributed.checkpoint.html
+    """
+    # Collect the state dictionaries that DCP should populate in-place.
+    state_dict = {}
+    if model is not None:
+        state_dict["model"] = model.state_dict()
+    if optimizer is not None:
+        state_dict["optimizer"] = optimizer.state_dict()
+    torch.distributed.checkpoint.load(state_dict, checkpoint_id=checkpoint_path)
+    # Only push the loaded state back into the objects that were actually provided.
+    # Both arguments default to None, so they must not be dereferenced unconditionally.
+    if model is not None:
+        model.load_state_dict(state_dict["model"])
+    if optimizer is not None:
+        optimizer.load_state_dict(state_dict["optimizer"])
+
+
+def load_auto_resume_checkpoint(cfg, model, optimizer):
+    """Resume training from a previously saved checkpoint, if one exists.
+
+    Checkpoint directories are expected to be named: step_<step_idx>_loss_<loss_value>
+    When cfg.training.checkpoint.resume_from_metric is set, the checkpoint is chosen by
+    loss_value ('+' picks the highest value, anything else picks the lowest), with the
+    modification time breaking ties. When it is unset, the most recently modified
+    checkpoint directory is chosen.
+
+    Args:
+        cfg: Hydra config.
+        model: Model to load checkpoints into.
+        optimizer: Optimizer to load checkpoints into.
+
+    Returns:
+        The step index parsed from the chosen checkpoint directory, or 0 if there
+        is no checkpoint to resume from.
+    """
+    ckpt_cfg = cfg.training.checkpoint
+    # No checkpoint root configured or created yet: start from step 0.
+    if not (ckpt_cfg.path and Path(ckpt_cfg.path).exists()):
+        return 0
+
+    # The checkpoint root should ONLY contain Torch DCP checkpoint sub-directories.
+    candidates = [entry.absolute() for entry in Path(ckpt_cfg.path).iterdir() if entry.is_dir()]
+    if not candidates:
+        return 0
+
+    metric_mode = ckpt_cfg.resume_from_metric
+    sign = 1 if metric_mode == "+" else -1
+
+    def _rank(ckpt_dir):
+        # Primary key: signed loss value parsed from the directory name (constant when
+        # no metric is configured). Secondary key: modification time.
+        if metric_mode:
+            score = sign * float(ckpt_dir.name.split("_")[3])
+        else:
+            score = 0
+        return (score, ckpt_dir.stat().st_mtime)
+
+    latest_subdir = max(candidates, key=_rank)
+    # The step index is the second "_"-separated field of the directory name.
+    resumed_step = int(latest_subdir.name.split("_")[1])
+    load_dcp_checkpoint(latest_subdir, model, optimizer)
+    if torch.distributed.get_rank() == 0:
+        _logger.info(f"Loaded latest model and optimizer checkpoints from: {latest_subdir}")
+
+    # Step index for the training loop to continue from.
+    return resumed_step
+
+
+def save_dcp_checkpoint(checkpoint_path, model=None, optimizer=None):
+    """Write the model and/or optimizer state to checkpoint_path as a Torch DCP checkpoint.
+
+    Objects passed as None are left out of the saved state dictionary.
+
+    Docs: https://docs.pytorch.org/docs/stable/distributed.checkpoint.html
+    """
+    # Gather the state dictionaries of whichever objects were provided.
+    named_objects = (("model", model), ("optimizer", optimizer))
+    state_dict = {name: obj.state_dict() for name, obj in named_objects if obj is not None}
+    torch.distributed.checkpoint.save(state_dict, checkpoint_id=checkpoint_path)
+
+
+def save_auto_resumable_checkpoint(cfg, model, optimizer, step_idx, loss_value):
+    """Save the model and optimizer at step_idx in the auto-resumable directory layout.
+
+    The checkpoint is written to <cfg.training.checkpoint.path>/step_<step_idx>_loss_<loss_value>,
+    with loss_value formatted to three decimal places. Nothing is saved when
+    cfg.training.checkpoint.path is unset.
+
+    Args:
+        cfg: Hydra config.
+        model: Model to save checkpoints of.
+        optimizer: Optimizer to save checkpoints of.
+        step_idx: Step index to save checkpoint at.
+        loss_value: Loss value to save checkpoint at.
+    """
+    checkpoint_root = cfg.training.checkpoint.path
+    # Checkpointing is disabled when no path is configured.
+    if not checkpoint_root:
+        return
+
+    # Create the checkpoint sub-directory and write the DCP checkpoint into it.
+    ckpt_dir = Path(checkpoint_root) / f"step_{step_idx}_loss_{loss_value:.3f}"
+    ckpt_dir.mkdir(parents=True, exist_ok=True)
+    save_dcp_checkpoint(ckpt_dir, model, optimizer)
+
+    # Relax checkpoint permissions, which may be helpful when saving checkpoints in a
+    # container owned by root. Each directory is updated first, then the files inside it.
+    open_perms = 0o777
+    for current_dir, _, file_names in os.walk(ckpt_dir):
+        targets = [current_dir, *(Path(current_dir) / file_name for file_name in file_names)]
+        for target in targets:
+            os.chmod(target, open_perms)
+
+    if torch.distributed.get_rank() == 0:
+        _logger.info(f"Saved validated checkpoint to: {ckpt_dir}")
@@ -66,6 +66,10 @@ training:
     path: null
     resume_from_metric: null
 
+inference:
+  checkpoint:
+    path: null
+
 dataset:
   num_classes: 100000
   num_workers: 0
 
@@ -64,6 +64,10 @@ training:
     path: "./checkpoints/vit"
     resume_from_metric: "-"   # + = Highest Metric (Score), - = Lowest Metric (Loss)
 
+inference:
+  checkpoint:
+    path: "./checkpoints/vit/torch_ckpt_test.pt"
+
 dataset:
   num_classes: 100000
   num_workers: 4
 
@@ -10,3 +10,7 @@ training:
   checkpoint:
     path: "./checkpoints/vit_te"
     resume_from_metric: "-"   # + = Highest Metric (Score), - = Lowest Metric (Loss)
+
+inference:
+  checkpoint:
+    path: "./checkpoints/vit_te/torch_ckpt_test.pt"
@@ -0,0 +1,77 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from contextlib import contextmanager
+
+import torch
+
+
+@contextmanager
+def initialize_distributed(cfg):
+    """
+    Initialize the process group and setup the DeviceMesh for distributed training.
+
+    The process group is destroyed when the context exits, including when the
+    body of the `with` statement (or the setup below) raises an exception.
+
+    Args:
+        cfg: Hydra config. Reads cfg.distributed.dp_inter, cfg.distributed.dp_shard,
+            and cfg.distributed.cp.
+
+    Yields:
+        device_mesh: The DeviceMesh with dimensions ("dp_inter", "dp_shard", "cp", "tp").
+
+    Raises:
+        ValueError: If the product of the parallelism sizes does not equal the world size.
+    """
+    # Initialize distributed training environment.
+    torch.distributed.init_process_group()
+
+    # Everything after process group initialization runs under try/finally so the
+    # process group is always torn down, even on validation or training errors.
+    try:
+        # Associate all future device operations in the current process
+        # with a uniquely-indexed local device, e.g. "cuda:0" on Rank 0.
+        local_rank = int(os.getenv("LOCAL_RANK", torch.distributed.get_rank()))
+        torch.cuda.set_device(local_rank)
+
+        # Initialize DeviceMesh. Validate parallelism sizes.
+        # TODO(@cspades): Will add TE-backed context parallelism (CP) in the future, just need to
+        # modify the ViT model to shard the sequence dimension after tokenization. For now, we
+        # setup the CP dimension for demonstrating how to use DeviceMesh and CP with Megatron-FSDP.
+        if (
+            cfg.distributed.dp_inter * cfg.distributed.dp_shard * cfg.distributed.cp
+            != torch.distributed.get_world_size()
+        ):
+            raise ValueError(
+                f"Invalid parallelism sizes: dp_inter({cfg.distributed.dp_inter}) * dp_shard({cfg.distributed.dp_shard}) * cp({cfg.distributed.cp}) * tp(1) != world_size({torch.distributed.get_world_size()})"
+            )
+        device_mesh = torch.distributed.device_mesh.init_device_mesh(
+            "cuda",
+            mesh_shape=(
+                cfg.distributed.dp_inter,
+                cfg.distributed.dp_shard,
+                cfg.distributed.cp,
+                1,  # Needed to use TransformerEngine layers with Megatron-FSDP. "TP is always 1."
+            ),
+            mesh_dim_names=("dp_inter", "dp_shard", "cp", "tp"),
+        )
+
+        # Sub-meshes (possibly) required for Megatron-FSDP.
+        # WARNING: These have a tendency to be deleted by Torch. Save references
+        # or pass them to all classes or functions that use them.
+        # DP: Only relevant when using HSDP, where we need the flattened DP group for data parallelism. (Otherwise, just pass dp_shard.)
+        device_mesh[("dp_inter", "dp_shard")]._flatten("dp")
+        # DP-Shard-CP: Only required if using CP. Otherwise, just pass dp_shard to FSDP.
+        device_mesh[("dp_shard", "cp")]._flatten("dp_cp_shard")
+        # HSDP (DP-CP): Only required if using HSDP. Otherwise, don't pass hybrid_fsdp_group to Megatron-FSDP.
+        device_mesh[("dp_inter", "dp_shard", "cp")]._flatten("hsdp")
+
+        # Yield DeviceMesh.
+        yield device_mesh
+    finally:
+        # Destroy process group.
+        torch.distributed.destroy_process_group()
@@ -205,12 +205,12 @@ def __init__(
         if isinstance(class_map, str):
             class_to_idx = load_class_map(class_map)
         elif isinstance(class_map, dict):
-            assert dict, "Class-to-Index mapping dict must be non-empty."
+            assert class_map, "Class-to-Index mapping dict must be non-empty."
             class_to_idx = class_map
         if isinstance(label_map, str):
             image_to_label = load_image_labels(label_map)
         elif isinstance(label_map, dict):
-            assert dict, "Image-to-Label mapping dict must be non-empty."
+            assert label_map, "Image-to-Label mapping dict must be non-empty."
             image_to_label = label_map
         self.samples, self.class_to_idx = find_images_and_targets(
             root,