NVIDIA
diff --git a/‎recipes/vit/LICENSE‎
Lines changed: 22 additions & 0 deletions b/‎recipes/vit/LICENSE‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎recipes/vit/README.md‎
Lines changed: 3 additions & 3 deletions b/‎recipes/vit/README.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎recipes/vit/beans.py‎
Lines changed: 65 additions & 0 deletions b/‎recipes/vit/beans.py‎
Lines changed: 65 additions & 0 deletions
diff --git a/‎recipes/vit/checkpoint.py‎
Lines changed: 3 additions & 2 deletions b/‎recipes/vit/checkpoint.py‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎recipes/vit/config/defaults.yaml‎
Lines changed: 4 additions & 43 deletions b/‎recipes/vit/config/defaults.yaml‎
Lines changed: 4 additions & 43 deletions
diff --git a/‎recipes/vit/config/vit_base_patch16_224.yaml‎
Lines changed: 4 additions & 12 deletions b/‎recipes/vit/config/vit_base_patch16_224.yaml‎
Lines changed: 4 additions & 12 deletions
diff --git a/‎recipes/vit/data/super-tiny-imagenet-5.tar.gz‎
-1.74 MB b/‎recipes/vit/data/super-tiny-imagenet-5.tar.gz‎
-1.74 MB
@@ -199,3 +199,25 @@
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
+
+   MIT License
+
+   Copyright (c) 2020 AIR Lab Makerere University
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
@@ -38,11 +38,11 @@ To train a ViT using FSDP, execute the following command in your Docker containe
 torchrun --nproc-per-node ${NGPU} train.py --config-name vit_base_patch16_224 distributed.dp_shard=${NGPU} training.checkpoint.path=./ckpts/vit
 ```
 
-which will train on a local tiny 5-class version of [ImageNet](https://image-net.org/) ([super-tiny-imagenet-5](./data/super-tiny-imagenet-5/)) and save auto-resumable [Torch DCP](https://docs.pytorch.org/docs/stable/distributed.checkpoint.html) checkpoints to the `training.checkpoint.path` directory.
+which will train on the [`AI-Lab-Makerere/ibean`](https://github.com/AI-Lab-Makerere/ibean/) (HuggingFace: [`AI-Lab-Makerere/beans`](https://huggingface.co/datasets/AI-Lab-Makerere/beans)) dataset and save auto-resumable [Torch DCP](https://docs.pytorch.org/docs/stable/distributed.checkpoint.html) checkpoints to the `training.checkpoint.path` directory.
 
-[`train.py`](train.py) is the transparent entrypoint to this script that explains how to modify your own training loop for `Megatron-FSDP` ([PyPI: `megatron-fsdp`](https://pypi.org/project/megatron-fsdp/) / [Source: Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/distributed/fsdp/src)) to fully-shard your model across all devices. After executing `train.py` for the first time, the de-compressed ImageNet dataset will be available in `data/super-tiny-imagenet-5/...` (sourced from [`super-tiny-imagenet-5.tar.gz`](./data/super-tiny-imagenet-5.tar.gz)) for experimentation and review.
+[`train.py`](train.py) is the transparent entrypoint to this recipe, and it explains how to modify your own training loop to use `Megatron-FSDP` ([PyPI: `megatron-fsdp`](https://pypi.org/project/megatron-fsdp/) / [Source: Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/distributed/fsdp/src)) to fully shard your model across all devices.
 
-The TIMM-derived model code for the ViT can be found in [`vit.py`](vit.py), and data utilities for ImageNet can be found in [`imagenet_*.py`](imagenet_dataset.py).
+The TIMM-derived model code for the ViT can be found in [`vit.py`](vit.py), and data utilities for Beans can be found in [`beans.py`](beans.py).
 
 Various configuration options common in computer vision modeling can be found in [config](./config/).
 
 
@@ -0,0 +1,65 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+import torch
+from datasets import load_dataset
+from torch.utils.data import Dataset
+from torchvision.transforms.functional import to_tensor
+
+
+logger = logging.getLogger(__name__)
+
+
+def infinite_dataloader(dataloader, sampler):
+    """Yield batches from `dataloader` forever, calling `sampler.set_epoch` before each full pass."""
+    epoch = 0
+    while True:
+        sampler.set_epoch(epoch)  # Hand the epoch index to the sampler; presumably reseeds its shuffle (confirm sampler type)
+        for batch in dataloader:
+            yield batch
+        epoch += 1  # One full pass over the dataloader is done; advance the epoch counter
+
+
+class BeansDataset(Dataset):
+    """
+    Dataset wrapper for AI-Lab-Makerere/beans that returns (resized RGB image tensor, label index) pairs.
+    """
+
+    def __init__(self, image_size: tuple[int, int], split: str = "train"):
+        """
+        Args:
+            image_size (tuple[int, int]): Size passed to each image's `resize` call (PIL reads it as (width, height)).
+            split (str): Dataset split to load. Options: ["train", "validation", "test"]
+        """
+        self.resize_dimensions = image_size
+        # Load the requested split through `datasets.load_dataset`.
+        self.dataset = load_dataset("AI-Lab-Makerere/beans", split=split)
+        self.class_list = self.dataset.features["labels"].names  # Class names of the "labels" feature
+        if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:  # Log once: non-distributed run or rank 0
+            logger.info(
+                f"[AI-Lab-Makerere/beans (Split={split})]\nDataset Size: {len(self.dataset)}\nClasses (Count={len(self.class_list)}): {self.class_list}"
+            )
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __getitem__(self, idx):
+        # Resize the image, convert it to RGB, and turn it into a tensor; the label is returned as stored.
+        sample = self.dataset[idx]
+        image_tensor = to_tensor(sample["image"].resize(self.resize_dimensions).convert("RGB"))
+        label_idx = sample["labels"]
+        return image_tensor, label_idx
@@ -39,6 +39,7 @@ def load_torch_checkpoint(checkpoint_path, model, megatron_fsdp=False):
     checkpoint = torch.load(checkpoint_path, weights_only=False)
     # Remove the "module." prefix from the keys of checkpoints
     # derived from Megatron-FSDP.
+    # TODO(@cspades): Remove this when the Megatron-FSDP checkpoint naming is fixed.
     model_checkpoint = {(k.removeprefix("module.") if megatron_fsdp else k): v for k, v in checkpoint["model"].items()}
     # Warn about Megatron-FSDP checkpoints.
     first_key = next(iter(model_checkpoint))
@@ -109,7 +110,7 @@ def load_auto_resume_checkpoint(cfg, model, optimizer):
             latest_step_idx = int(latest_subdir.name.split("_")[1])
             # Load model and optimizer checkpoints.
             load_dcp_checkpoint(latest_subdir, model, optimizer)
-            if torch.distributed.get_rank() == 0:
+            if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
                 _logger.info(f"Loaded latest model and optimizer checkpoints from: {latest_subdir}")
 
     # Return the auto-resumed step index for training progression.
@@ -160,5 +161,5 @@ def save_auto_resumable_checkpoint(cfg, model, optimizer, step_idx, loss_value):
                 # Change file perms.
                 file_path = Path(dirpath) / filename
                 os.chmod(file_path, mode)
-        if torch.distributed.get_rank() == 0:
+        if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
             _logger.info(f"Saved validated checkpoint to: {ckpt_dir}")
@@ -60,9 +60,9 @@ fsdp:
   preserve_fp32_weights: true
 
 training:
-  steps: 500
-  val_interval: 25
-  log_interval: 5
+  steps: 10
+  val_interval: 5
+  log_interval: 1
   checkpoint:
     path: null
     resume_from_metric: null
@@ -74,53 +74,14 @@ inference:
     megatron_fsdp: null
 
 dataset:
-  num_classes: 100000
+  num_classes: 3
   num_workers: 0
   train:
-    root: null
-    class_map: null
-    label_map: null
-    class_filter: null
     batch_size: 1
     shuffle: false
-    transform_kwargs:
-      img_size: 224
-      scale: null
-      ratio: null
-      train_crop_mode: null
-      hflip: 0.5
-      vflip: 0.
-      color_jitter: 0.4
-      color_jitter_prob: null
-      grayscale_prob: 0.
-      gaussian_blur_prob: 0.
-      interpolation: 'random'
-      re_prob: 0.
-      re_mode: 'const'
-      re_count: 1
-      re_num_splits: 0
-      normalize: True
-      separate: False
-      patch_size: 16
-      patchify: False
   val:
-    root: null
-    class_map: null
-    label_map: null
-    class_filter: null
     batch_size: 1
     shuffle: false
-    transform_kwargs:
-      img_size: 224
-      crop_pct: null
-      crop_mode: null
-      crop_border_pixels: null
-      interpolation: "bilinear"
-      mean: [0.485, 0.456, 0.406]
-      std: [0.229, 0.224, 0.225]
-      normalize: true
-      patch_size: 16
-      patchify: false
 
 random:
   seed: 42
 
@@ -75,22 +75,14 @@ inference:
     megatron_fsdp: true
 
 dataset:
-  num_classes: 100000
+  num_classes: 3
   num_workers: 4
   train:
-    root: "./data/super-tiny-imagenet-5/train"
-    class_map: "./data/super-tiny-imagenet-5/words.txt"
-    label_map: null   # Not needed, training data is labeled by directory.
-    class_filter: null
-    batch_size: 5
+    batch_size: 8
     shuffle: true
   val:
-    root: "./data/super-tiny-imagenet-5/val"
-    class_map: "./data/super-tiny-imagenet-5/words.txt"
-    label_map: "./data/super-tiny-imagenet-5/val/val_annotations.txt"
-    class_filter: null
-    batch_size: 5
-    shuffle: false
+    batch_size: 16
+    shuffle: true
 
 random:
   seed: 42