6464 # since PyTorch 2.3 the path has changed
6565 from torch .amp .grad_scaler import _refresh_per_optimizer_state
6666
67+ from concurrent .futures import ThreadPoolExecutor
68+
69+ import multistorageclient as msc
70+ from multistorageclient .types import MSC_PROTOCOL
71+
6772from nemo .collections .nlp .modules .common .megatron .module import Float16Module
6873from nemo .collections .nlp .modules .common .megatron .transformer import AutocastTransformerLayer , ParallelTransformerLayer
6974from nemo .collections .nlp .parts import utils_funcs
7378from nemo .utils import AppState , logging
7479from nemo .utils .model_utils import ckpt_to_dir , inject_model_parallel_rank , uninject_model_parallel_rank
7580
76- from concurrent .futures import ThreadPoolExecutor
77- from multistorageclient .types import MSC_PROTOCOL
78- import multistorageclient as msc
79-
8081try :
8182
8283 from nemo .core .optim .distributed_adam import MegatronDistributedFusedAdam
@@ -1042,25 +1043,26 @@ def msc_download_dir(url: str, local_path: str):
10421043 if not msc .os .path .exists (url ):
10431044 raise Exception (f"Download Path doesn't exist: { url } " )
10441045
1045- base_name = os .path .basename (url ) # url = "msc://my-profile/path/to/data", base_name = "data"
1046+ base_name = os .path .basename (url ) # url = "msc://my-profile/path/to/data", base_name = "data"
10461047 files = msc .list (url )
10471048
10481049 def download_file (item ):
10491050 """Helper function to download a single file."""
1050- file_name = item .key #item.key = "msc://profile/path/to/data/file1.txt"
1051- base_name_idx = file_name .find (base_name ) # base_name_idx = 23
1052- local_file_path = f"{ local_path } /{ file_name [base_name_idx :]} " #local_file_path = f"{local_path}/data/file1.txt"
1051+ file_name = item .key # item.key = "msc://profile/path/to/data/file1.txt"
1052+ base_name_idx = file_name .find (base_name ) # base_name_idx = 23
1053+ local_file_path = (
1054+ f"{ local_path } /{ file_name [base_name_idx :]} " # local_file_path = f"{local_path}/data/file1.txt"
1055+ )
10531056 os .makedirs (os .path .dirname (local_file_path ), exist_ok = True )
10541057 msc .download_file (item , local_file_path )
1055- #msc.download_file(f"{MSC_PROTOCOL}{get_profile()}/{file_name}", local_file_path)
1058+ # msc.download_file(f"{MSC_PROTOCOL}{get_profile()}/{file_name}", local_file_path)
10561059
10571060 # Use ThreadPoolExecutor for parallel downloads
10581061 with ThreadPoolExecutor (max_workers = 32 ) as executor : # Adjust max_workers as needed
10591062 executor .map (download_file , files )
10601063
10611064 logging .warning (f"msc_download_dir completed rank { torch .distributed .get_rank ()} " )
1062-
1063-
1065+
10641066
10651067class NLPSaveRestoreConnector (SaveRestoreConnector ):
10661068 """Custom connector to support saving and restoring states."""
@@ -1083,7 +1085,6 @@ def __init__(self) -> None:
10831085 )
10841086 super ().__init__ ()
10851087
1086-
10871088 def save_to (self , model , save_path : str ):
10881089 """Save model to save path."""
10891090 app_state = AppState ()
@@ -1102,17 +1103,16 @@ def save_to(self, model, save_path: str):
11021103 is_msc_enabled = False
11031104 if MSC_PROTOCOL in dir_name :
11041105 is_msc_enabled = True
1105-
1106+
11061107 # dist ckpt calls save on every rank
11071108 if dist_ckpt :
11081109 # model weights is a directory
11091110 dist_ckpt_dir = ckpt_to_dir (os .path .join (dir_name , self .model_weights_ckpt ))
11101111
11111112 if is_msc_enabled :
1112- filename = os .path .join (dir_name , self .model_weights_ckpt )
1113+ filename = os .path .join (dir_name , self .model_weights_ckpt )
11131114 dist_ckpt_dir = os .path .splitext (filename )[0 ]
1114-
1115-
1115+
11161116 # dist checkpoint needs torch.distributed to save the checkpoint
11171117 if not parallel_state .is_initialized ():
11181118
@@ -1185,8 +1185,10 @@ def dummy():
11851185
11861186 if is_msc_enabled :
11871187 print (f"Downloading { mp_model_weights } to { tmpdir } " )
1188- msc_dest = os .path .join (tmpdir , f'mp_rank_{ tp_rank :02d} ' , self .model_weights_ckpt )
1189- logging .warning (f"msc_download_dir mp_model_weights from { mp_model_weights } { msc_dest } rank { torch .distributed .get_rank ()} " )
1188+ msc_dest = os .path .join (tmpdir , f'mp_rank_{ tp_rank :02d} ' , self .model_weights_ckpt )
1189+ logging .warning (
1190+ f"msc_download_dir mp_model_weights from { mp_model_weights } { msc_dest } rank { torch .distributed .get_rank ()} "
1191+ )
11901192 msc_download_dir (mp_model_weights , msc_dest )
11911193 else :
11921194 shutil .move (
@@ -1206,8 +1208,12 @@ def dummy():
12061208
12071209 if is_msc_enabled :
12081210 print (f"Downloading { mp_model_weights } to { tmpdir } " )
1209- msc_dest = os .path .join (tmpdir , f'tp_rank_{ tp_rank :02d} _pp_rank_{ pp_rank :03d} ' , self .model_weights_ckpt )
1210- logging .warning (f"msc_download_dir mp_model_weights from { mp_model_weights } { msc_dest } rank { torch .distributed .get_rank ()} " )
1211+ msc_dest = os .path .join (
1212+ tmpdir , f'tp_rank_{ tp_rank :02d} _pp_rank_{ pp_rank :03d} ' , self .model_weights_ckpt
1213+ )
1214+ logging .warning (
1215+ f"msc_download_dir mp_model_weights from { mp_model_weights } { msc_dest } rank { torch .distributed .get_rank ()} "
1216+ )
12111217 msc_download_dir (mp_model_weights , msc_dest )
12121218 else :
12131219 shutil .move (
@@ -1368,28 +1374,24 @@ def _load_state_dict_from_disk(self, model_weights, map_location=None):
13681374 else :
13691375 raise ValueError (f'Expected { model_weights } to be a file or directory.' )
13701376
1371-
1372- def _download_nemo_file (self ,
1373- restore_path : str ,
1374- tmpdir : str ) -> str :
1375- # .nemo filename
1377+ def _download_nemo_file (self , restore_path : str , tmpdir : str ) -> str :
1378+ # .nemo filename
13761379 fname = os .path .basename (restore_path )
1377-
1378- #check if msc path exists
1380+
1381+ # check if msc path exists
13791382 if not msc .os .path .exists (restore_path ):
13801383 raise FileNotFoundError (f".nemo file doesn't exist at { restore_path } " )
1381-
1382- #download .nemo file to tempdir
1384+
1385+ # download .nemo file to tempdir
13831386 os .makedirs (tmpdir , exist_ok = True )
13841387 logging .warning (f"Starting .nemo download { restore_path } " )
13851388 msc .download_file (restore_path , f"{ tmpdir } /{ fname } " )
1386-
1387- #update restore_path to point to downloaded .nemo
1389+
1390+ # update restore_path to point to downloaded .nemo
13881391 updated_restore_path = os .path .join (tmpdir , fname )
13891392 logging .warning (f".nemo download complete; updated_restore_path to { updated_restore_path } " )
13901393 return updated_restore_path
13911394
1392-
13931395 def restore_from (
13941396 self ,
13951397 calling_cls ,
@@ -1459,7 +1461,7 @@ def dummy():
14591461 trainer .strategy .setup_environment ()
14601462
14611463 # with tempfile.TemporaryDirectory() as tmpdir:
1462- # Check if self.model_extracted_dir is set, and is a valid path
1464+ # Check if self.model_extracted_dir is set, and is a valid path
14631465 if self .model_extracted_dir is not None and os .path .isdir (self .model_extracted_dir ):
14641466 # Log that NeMo will use the provided `model_extracted_dir`
14651467 logging .info (
@@ -1512,7 +1514,7 @@ def dummy():
15121514 else :
15131515 state_dict = self .modify_state_dict (conf , state_dict )
15141516 super ().load_instance_with_state_dict (instance , state_dict , strict )
1515-
1517+
15161518 logging .info (f'Model { instance .__class__ .__name__ } was successfully restored from { restore_path } .' )
15171519 return instance
15181520
0 commit comments