Adding update that solves one logger issue for Evo2 training (#1331)

jstjohn · web-flow · commit e2150f863299 · 2026-01-10T00:49:49.000Z
### Description Use the NeMo training loop (`llm.train`) to take advantage of its one-logger configuration support. Also move the private `_FirstBatchCudaSync` callback, previously defined inside `train()` where it broke checkpointing, to module scope in `bionemo/evo2/utils/callbacks.py`. ### Type of changes  - [x] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] Refactor - [ ] Documentation update - [ ] Other (please describe): ### CI Pipeline Configuration Configure CI behavior by applying the relevant labels. By default, only basic unit tests are run. - [ciflow:skip](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:skip) - Skip all CI tests for this PR - [ciflow:notebooks](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:notebooks) - Run Jupyter notebooks execution tests for bionemo2 - [ciflow:slow](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:slow) - Run slow single GPU integration tests marked as @pytest.mark.slow for bionemo2 - [ciflow:all](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:all) - Run all tests (unit tests, slow tests, and notebooks) for bionemo2. This label can be used to enforce running all bionemo2 tests. - [ciflow:all-recipes](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:all-recipes) - Run tests for all recipes (under bionemo-recipes). This label can be used to enforce running tests for all recipes. Unit tests marked as `@pytest.mark.multi_gpu` or `@pytest.mark.distributed` are not run in the PR pipeline. For more details, see [CONTRIBUTING](CONTRIBUTING.md) > [!NOTE] > By default, only basic unit tests are run. Add the appropriate labels to enable additional test coverage. 
#### Authorizing CI Runs We use [copy-pr-bot](https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/#automation) to manage authorization of CI runs on NVIDIA's compute resources. - If a pull request is opened by a trusted user and contains only trusted changes, the pull request's code will automatically be copied to a pull-request/ prefixed branch in the source repository (e.g. pull-request/123) - If a pull request is opened by an untrusted user or contains untrusted changes, an NVIDIA org member must leave an `/ok to test` comment on the pull request to trigger CI. This will need to be done for each new commit. ### Pre-submit Checklist - [x] I have tested these changes locally - [ ] I have updated the documentation accordingly - [ ] I have added/updated tests as needed - [ ] All existing tests pass successfully Signed-off-by: John St John <jstjohn@nvidia.com>
diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py
@@ -24,7 +24,7 @@
 # TODO add back support for slurm resilience.
 # import nvidia_resiliency_ext.ptl_resiliency as res_module
 import torch
-from lightning.pytorch.callbacks import Callback, LearningRateMonitor, RichModelSummary
+from lightning.pytorch.callbacks import LearningRateMonitor, RichModelSummary
 from megatron.core.distributed import DistributedDataParallelConfig
 from megatron.core.enums import Fp8Recipe
 from megatron.core.optimizer import OptimizerConfig
@@ -53,7 +53,7 @@
 from bionemo.evo2.models.mamba import MAMBA_MODEL_OPTIONS, MambaModel, mamba_no_weight_decay_cond_with_embeddings
 from bionemo.evo2.models.peft import Evo2LoRA
 from bionemo.evo2.run.utils import infer_model_type, patch_eden_tokenizer
-from bionemo.evo2.utils.callbacks import GarbageCollectAtInferenceTime
+from bionemo.evo2.utils.callbacks import GarbageCollectAtInferenceTime, _FirstBatchCudaSync
 from bionemo.evo2.utils.config import hyena_no_weight_decay_cond_with_embeddings
 from bionemo.evo2.utils.logging.callbacks import TEVCallback
 from bionemo.llm.utils.datamodule_utils import infer_global_batch_size
@@ -853,27 +853,6 @@ def train(args: argparse.Namespace) -> nl.Trainer:
         TEVCallback(),
     ]
 
-    # First batch CUDA sync callback: adds barriers for the first training batch to avoid race condition
-    # See https://github.com/NVIDIA/bionemo-framework/issues/1301 for more details.
-    class _FirstBatchCudaSync(Callback):
-        def __init__(self):
-            self._done = False
-
-        def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
-            if not self._done and torch.cuda.is_available():
-                torch.cuda.synchronize()
-
-        def on_after_backward(self, trainer, pl_module):
-            if not self._done and torch.cuda.is_available():
-                torch.cuda.synchronize()
-
-        def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
-            if not self._done and torch.cuda.is_available():
-                torch.cuda.synchronize()
-                # Unset blocking for subsequent batches
-                os.environ.pop("CUDA_LAUNCH_BLOCKING", None)
-                self._done = True
-
     callbacks.append(_FirstBatchCudaSync())
 
     if args.garbage_collect_at_inference:
@@ -1103,15 +1082,6 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
         enable_checkpointing=args.create_checkpoint_callback,
     )
 
-    # Logger setup
-    nemo_logger.setup(
-        trainer,
-        resume_if_exists=True,
-    )
-
-    if auto_resume is not None:
-        auto_resume.setup(trainer, model)
-
     # Optimizer and scheduler setup
     opt_config = OptimizerConfig(
         optimizer="adam",
@@ -1139,12 +1109,8 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
     opt = MegatronOptimizerModule(
         opt_config, sched, no_weight_decay_cond=getattr(model_config, "hyena_no_weight_decay_cond_fn", None)
     )
-    opt.connect(model)
-
-    # Remove earlier warmup and hook logic; first-batch blocking is sufficient.
+    llm.train(model, data_module, trainer, log=nemo_logger, resume=auto_resume, optim=opt, tokenizer="data")
 
-    # Start training
-    trainer.fit(model, data_module)
     return trainer
 
 
diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/callbacks.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/callbacks.py
@@ -14,11 +14,35 @@
 # limitations under the License.
 
 import gc
+import os
 
 import torch
 from lightning.pytorch import Callback
 
 
+class _FirstBatchCudaSync(Callback):
+    # TEMPORARY CALLBACK. Remove once bug is fixed.
+    # First batch CUDA sync callback: adds barriers for the first training batch to avoid race condition
+    # See https://github.com/NVIDIA/bionemo-framework/issues/1301 for more details.
+    def __init__(self):
+        self._done = False
+
+    def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
+        if not self._done and torch.cuda.is_available():
+            torch.cuda.synchronize()
+
+    def on_after_backward(self, trainer, pl_module):
+        if not self._done and torch.cuda.is_available():
+            torch.cuda.synchronize()
+
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
+        if not self._done and torch.cuda.is_available():
+            torch.cuda.synchronize()
+            # Unset blocking for subsequent batches
+            os.environ.pop("CUDA_LAUNCH_BLOCKING", None)
+            self._done = True
+
+
 class GarbageCollectAtInferenceTime(Callback):
     """Callback to clean up CUDA memory before validation to prevent initialization errors."""