Minor function and config changes.

cspades · cspades · commit 0fffd7b7a3e4 · 2025-09-03T12:50:12.000-07:00
Signed-off-by: Cory Ye <cye@nvidia.com>
diff --git a/recipes/vit/checkpoint.py b/recipes/vit/checkpoint.py
@@ -24,7 +24,7 @@
 _logger = logging.getLogger(__name__)
 
 
-def load_torch_checkpoint(model, checkpoint_path, megatron_fsdp=False):
+def load_torch_checkpoint(checkpoint_path, model, megatron_fsdp=False):
     """Load a Torch checkpoint from checkpoint_path into an unsharded model.
     Used for converting existing TIMM or Torch checkpoints into a freshly initialized
     model prior to sharding with Megatron-FSDP.
@@ -34,19 +34,18 @@ def load_torch_checkpoint(model, checkpoint_path, megatron_fsdp=False):
 
     Docs: https://docs.pytorch.org/tutorials/beginner/saving_loading_models.html
     """
-    # Load model checkpoint. Remove the "module." prefix from the keys from Megatron-FSDP,
-    # which is the main discrepancy between Megatron-FSDP and normal checkpoints.
-    # Must load with weights_only=False if you have an optimizer state in your checkpoint.
-    model_checkpoint = {
-        (k.strip("module.") if megatron_fsdp else k): v
-        for k, v in torch.load(checkpoint_path, weights_only=False)["model"].items()
-    }
+    # Load model checkpoint. Must load with weights_only=False
+    # if you have an optimizer state in your checkpoint.
+    checkpoint = torch.load(checkpoint_path, weights_only=False)
+    # Remove the "module." prefix from the keys of checkpoints
+    # derived from Megatron-FSDP.
+    model_checkpoint = {(k.removeprefix("module.") if megatron_fsdp else k): v for k, v in checkpoint["model"].items()}
     # Warn about Megatron-FSDP checkpoints.
     first_key = next(iter(model_checkpoint))
     if first_key.startswith("module.") and not megatron_fsdp:
         _logger.warning(
             f"Checkpoint state dictionary keys ({first_key}) may be prefixed "
-            "with 'modele.' if converted from a Megatron-FSDP DCP checkpoint."
+            "with 'module.' if converted from a Megatron-FSDP DCP checkpoint."
             "Set megatron_fsdp=True to automatically strip the prefix."
         )
     # Load with strict=False because the checkpoint may have
@@ -66,8 +65,10 @@ def load_dcp_checkpoint(checkpoint_path, model=None, optimizer=None):
     if optimizer is not None:
         state_dict["optimizer"] = optimizer.state_dict()
     torch.distributed.checkpoint.load(state_dict, checkpoint_id=checkpoint_path)
-    model.load_state_dict(state_dict["model"])
-    optimizer.load_state_dict(state_dict["optimizer"])
+    if model is not None:
+        model.load_state_dict(state_dict["model"])
+    if optimizer is not None:
+        optimizer.load_state_dict(state_dict["optimizer"])
 
 
 def load_auto_resume_checkpoint(cfg, model, optimizer):
diff --git a/recipes/vit/config/defaults.yaml b/recipes/vit/config/defaults.yaml
@@ -41,7 +41,7 @@ optimizer:
   weight_decay: 0.01
 
 distributed:
-  dp_inter: 1
+  dp_outer: 1
   dp_shard: 1
   cp: 1
 
@@ -69,6 +69,8 @@ training:
 inference:
   checkpoint:
     path: null
+    format: null
+    megatron_fsdp: null
 
 dataset:
   num_classes: 100000
diff --git a/recipes/vit/config/vit_base_patch16_224.yaml b/recipes/vit/config/vit_base_patch16_224.yaml
@@ -39,7 +39,7 @@ model:
   channels_last: false
 
 distributed:
-  dp_inter: 1
+  dp_outer: 1
   dp_shard: 1
   cp: 1
 
@@ -66,7 +66,12 @@ training:
 
 inference:
   checkpoint:
-    path: "./checkpoints/vit/torch_ckpt_test.pt"
+    path: null
+    # Use "torch" to load a DCP->Torch converted checkpoint for inference without Megatron-FSDP.
+    # Use "torch_dcp" instead if using Megatron-FSDP for inference.
+    # If the checkpoint was not trained with Megatron-FSDP, set megatron_fsdp to false.
+    format: "torch"
+    megatron_fsdp: true
 
 dataset:
   num_classes: 100000
diff --git a/recipes/vit/config/vit_te_base_patch16_224.yaml b/recipes/vit/config/vit_te_base_patch16_224.yaml
@@ -13,4 +13,9 @@ training:
 
 inference:
   checkpoint:
-    path: "./checkpoints/vit_te/torch_ckpt_test.pt"
+    path: null
+    # Use "torch" to load a DCP->Torch converted checkpoint for inference without Megatron-FSDP.
+    # Use "torch_dcp" instead if using Megatron-FSDP for inference.
+    # If the checkpoint was not trained with Megatron-FSDP, set megatron_fsdp to false.
+    format: "torch"
+    megatron_fsdp: true
diff --git a/recipes/vit/distributed.py b/recipes/vit/distributed.py
@@ -20,12 +20,20 @@
 
 
 @contextmanager
-def initialize_distributed(cfg):
+def initialize_distributed(
+    dp_outer: int = 1,
+    dp_shard: int = 1,
+    cp: int = 1,
+    tp: int = 1, 
+):
     """
     Setup the DeviceMesh for distributed training.
 
     Args:
-        cfg: Hydra config.
+        dp_outer: The size of the data parallelism outer dimension.
+        dp_shard: The size of the data parallelism shard dimension.
+        cp: The size of the context parallelism dimension.
+        tp: The size of the tensor parallelism dimension.
 
     Yields:
         device_mesh: The DeviceMesh.
@@ -45,30 +53,30 @@ def initialize_distributed(cfg):
     # TODO(@cspades): Will add TE-backed context parallelism (CP) in the future, just need to
     # modify the ViT model to shard the sequence dimension after tokenization. For now, we
     # setup the CP dimension for demonstrating how to use DeviceMesh and CP with Megatron-FSDP.
-    if cfg.distributed.dp_inter * cfg.distributed.dp_shard * cfg.distributed.cp != torch.distributed.get_world_size():
+    if dp_outer * dp_shard * cp != torch.distributed.get_world_size():
         raise ValueError(
-            f"Invalid parallelism sizes: dp_inter({cfg.distributed.dp_inter}) * dp_shard({cfg.distributed.dp_shard}) * cp({cfg.distributed.cp}) * tp(1) != world_size({torch.distributed.get_world_size()})"
+            f"Invalid parallelism sizes: dp_outer({dp_outer}) * dp_shard({dp_shard}) * cp({cp}) * tp({tp}) != world_size({torch.distributed.get_world_size()})"
         )
     device_mesh = torch.distributed.device_mesh.init_device_mesh(
         "cuda",
         mesh_shape=(
-            cfg.distributed.dp_inter,
-            cfg.distributed.dp_shard,
-            cfg.distributed.cp,
-            1,  # Needed to use TransformerEngine layers with Megatron-FSDP. "TP is always 1."
+            dp_outer,
+            dp_shard,
+            cp,
+            tp,  # Needed to use TransformerEngine layers with Megatron-FSDP.
         ),
-        mesh_dim_names=("dp_inter", "dp_shard", "cp", "tp"),
+        mesh_dim_names=("dp_outer", "dp_shard", "cp", "tp"),
     )
 
     # Sub-meshes (possibly) required for Megatron-FSDP.
     # WARNING: These have a tendency to be deleted by Torch. Save references
     # or pass them to all classes or functions that use them.
     # DP: Only relevant when using HSDP, where we need the flattened DP group for data parallelism. (Otherwise, just pass dp_shard.)
-    device_mesh[("dp_inter", "dp_shard")]._flatten("dp")
+    device_mesh[("dp_outer", "dp_shard")]._flatten("dp")
     # DP-Shard-CP: Only required if using CP. Otherwise, just pass dp_shard to FSDP.
     device_mesh[("dp_shard", "cp")]._flatten("dp_cp_shard")
     # HSDP (DP-CP): Only required if using HSDP. Otherwise, don't pass hybrid_fsdp_group to Megatron-FSDP.
-    device_mesh[("dp_inter", "dp_shard", "cp")]._flatten("hsdp")
+    device_mesh[("dp_outer", "dp_shard", "cp")]._flatten("hsdp")
 
     # Yield DeviceMesh.
     yield device_mesh
diff --git a/recipes/vit/infer.py b/recipes/vit/infer.py
@@ -31,12 +31,14 @@ def main(cfg) -> None:
     """
     Inference script for ViT. Non-distributed inference.
     """
-    with initialize_distributed(cfg) as device_mesh:
+    with initialize_distributed(**cfg.distributed) as device_mesh:
         # Init ViT.
         model = build_vit_model(cfg, device_mesh).cuda()
 
-        # Load model checkpoint trained using Megatron-FSDP.
-        load_torch_checkpoint(model, cfg.inference.checkpoint.path, megatron_fsdp=True)
+        # Load torch.save (non-distributed) model checkpoint trained using (or not using) Megatron-FSDP.
+        load_torch_checkpoint(
+            cfg.inference.checkpoint.path, model, megatron_fsdp=cfg.inference.checkpoint.megatron_fsdp
+        )
         logger.info(f"Model: {model}")
 
         # Mock input.
diff --git a/recipes/vit/test_infer.py b/recipes/vit/test_infer.py
@@ -45,11 +45,15 @@ def test_infer(monkeypatch, tmp_path, config_name):
             config_name=config_name,
             overrides=[
                 f"++inference.checkpoint.path={test_ckpt_path}",
+                # Using a torch.save mock checkpoint for inference.
+                "++inference.checkpoint.format=torch",
+                # Using a non-Megatron-FSDP mock checkpoint for inference.
+                "++inference.checkpoint.megatron_fsdp=false",
             ],
         )
 
     # Write a test checkpoint.
-    with initialize_distributed(vit_config) as device_mesh:
+    with initialize_distributed(**vit_config.distributed) as device_mesh:
         # Init ViT.
         model = build_vit_model(vit_config, device_mesh).cuda()
         # Write checkpoint.
diff --git a/recipes/vit/test_train.py b/recipes/vit/test_train.py
@@ -37,29 +37,27 @@ def test_train(monkeypatch, tmp_path, config_name, init_model_with_meta_device):
 
     # Initialize training config.
     recipe_dir = Path(__file__).parent
+    training_ckpt_path = Path(tmp_path) / "test_train_checkpoints"
     with initialize_config_dir(config_dir=str(recipe_dir / "config"), version_base="1.2"):
         vit_config = compose(
             config_name=config_name,
             overrides=[
-                "++training.steps=10",
-                "++training.val_interval=10",
+                "++training.steps=5",
+                "++training.val_interval=5",
                 "++training.log_interval=1",
-                f"++training.checkpoint.path={Path(tmp_path) / 'ckpt'}",
+                f"++training.checkpoint.path={training_ckpt_path}",
                 "++profiling.torch_memory_profile=false",
                 "++profiling.wandb=false",
                 f"++fsdp.init_model_with_meta_device={init_model_with_meta_device}",
             ],
         )
-        vit_resume_config = deepcopy(vit_config)
-        vit_resume_config.training.steps = 10
 
     main(vit_config)
 
     # Verify checkpoints were created.
-    assert sum(1 for item in (Path(tmp_path) / "ckpt").iterdir() if item.is_dir()) == 1, (
-        "Expected 1 checkpoint with 10 training steps and validation interval of 10."
+    assert sum(1 for item in training_ckpt_path.iterdir() if item.is_dir()) == 1, (
+        "Expected 1 checkpoint with 5 training steps and validation interval of 5."
     )
 
-    # Auto-resume training from checkpoint. For this test, we auto-resume from the best checkpoint,
-    # so depending on what the best checkpoint is, we may have more than 5 checkpoints.
-    main(vit_resume_config)
+    # Auto-resume training from checkpoint. For this test, we auto-resume from the best checkpoint.
+    main(vit_config)
diff --git a/recipes/vit/train.py b/recipes/vit/train.py
@@ -45,7 +45,7 @@ def main(cfg) -> None:
     """Train a ViT model on ImageNet using Megatron-FSDP and TransformerEngine (TE)."""
 
     # Initialize distributed environment.
-    with initialize_distributed(cfg) as device_mesh:
+    with initialize_distributed(**cfg.distributed) as device_mesh:
         """
         Profiling
         """
@@ -92,7 +92,7 @@ def main(cfg) -> None:
             # Always required to use Megatron-FSDP. What we shard on.
             dp_shard_dim="dp_cp_shard",
             # Required if using HSDP. The second / intermediate set of data-parallel process groups.
-            dp_inter_dim="dp_inter",
+            dp_inter_dim="dp_outer",
             # Required if using TP, either from TransformerEngine (TP=1) / Megatron or DTensor-based TP.
             tp_dim="tp",
             # Required if using HSDP. Created by flattening everything we shard on, e.g. DP-CP.
@@ -142,9 +142,9 @@ def main(cfg) -> None:
             sampler=train_sampler,
             num_workers=cfg.dataset.num_workers,
             # IMPORTANT: persistent_workers=True is required for Megatron-FSDP and
-            # Torch DCP, because CUDA/NCCL and Dataloader kill each others workers!
+            # Torch DCP, because CUDA/NCCL and Dataloader kill each other's workers!
             # Alternatively, you can set num_workers=0.
-            persistent_workers=True,
+            persistent_workers=(cfg.dataset.num_workers > 0),
         )
         if torch.distributed.get_rank() == 0:
             _logger.info(f"Training Dataset Size: {len(imagenet_train_ds)}")
@@ -171,9 +171,9 @@ def main(cfg) -> None:
             sampler=val_sampler,
             num_workers=cfg.dataset.num_workers,
             # IMPORTANT: persistent_workers=True is required for Megatron-FSDP and
-            # Torch DCP, because CUDA/NCCL and Dataloader kill each others workers!
+            # Torch DCP, because CUDA/NCCL and Dataloader kill each other's workers!
             # Alternatively, you can set num_workers=0.
-            persistent_workers=True,
+            persistent_workers=(cfg.dataset.num_workers > 0),
         )
         if torch.distributed.get_rank() == 0:
             _logger.info(f"Validation Dataset Size: {len(imagenet_val_ds)}")
@@ -211,6 +211,7 @@ def main(cfg) -> None:
 
             # Set training mode.
             model.train()
+            optimizer.zero_grad()
 
             # Match model input shape.
             if cfg.model.channels_last:
diff --git a/recipes/vit/vit.py b/recipes/vit/vit.py
@@ -64,7 +64,14 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformer_engine.pytorch import TransformerLayer
+
+
+try:
+    from transformer_engine.pytorch import TransformerLayer
+
+    _TE_INSTALLED = True
+except ImportError:
+    _TE_INSTALLED = False
 
 
 def build_vit_model(cfg, device_mesh=None, meta_init=False):
@@ -85,7 +92,7 @@ def build_vit_model(cfg, device_mesh=None, meta_init=False):
         vit_kwargs = dict(cfg.model.vit)
         if meta_init:
             vit_kwargs["weight_init"] = None
-        if cfg.model.transformer_engine:
+        if cfg.model.transformer_engine and _TE_INSTALLED:
             assert device_mesh is not None, "[build_model] device_mesh is required when using TransformerEngine."
             vit_kwargs["block_fn"] = TransformerLayer
             vit_kwargs["micro_batch_size"] = cfg.dataset.train.batch_size
@@ -1385,7 +1392,7 @@ def __init__(
         dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth, device="cpu")]  # stochastic depth decay rule
 
         self.block_fn = block_fn
-        if block_fn == TransformerLayer:
+        if _TE_INSTALLED and block_fn == TransformerLayer:
             self.blocks = nn.Sequential(
                 *[
                     TransformerLayer(
@@ -1464,7 +1471,7 @@ def rescale(param, _layer_id):
             param.div_(math.sqrt(2.0 * _layer_id))
 
         for layer_id, layer in enumerate(self.blocks):
-            if self.block_fn == TransformerLayer:
+            if _TE_INSTALLED and self.block_fn == TransformerLayer:
                 rescale(layer.self_attention.proj.weight.data, layer_id + 1)
                 rescale(layer.layernorm_mlp.fc2_weight.data, layer_id + 1)
             else: