Skip to content

Commit e2150f8

Browse files
authored
Add update that fixes the one-logger issue for Evo2 training (#1331)
### Description Use the NeMo training loop to take advantage of its one-logger configuration support. Also remove the private callback that broke checkpointing. ### Type of changes <!-- Mark the relevant option with an [x] --> - [x] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] Refactor - [ ] Documentation update - [ ] Other (please describe): ### CI Pipeline Configuration Configure CI behavior by applying the relevant labels. By default, only basic unit tests are run. - [ciflow:skip](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:skip) - Skip all CI tests for this PR - [ciflow:notebooks](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:notebooks) - Run Jupyter notebooks execution tests for bionemo2 - [ciflow:slow](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:slow) - Run slow single GPU integration tests marked as @pytest.mark.slow for bionemo2 - [ciflow:all](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:all) - Run all tests (unit tests, slow tests, and notebooks) for bionemo2. This label can be used to enforce running tests for all bionemo2. - [ciflow:all-recipes](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:all-recipes) - Run tests for all recipes (under bionemo-recipes). This label can be used to enforce running tests for all recipes. Unit tests marked as `@pytest.mark.multi_gpu` or `@pytest.mark.distributed` are not run in the PR pipeline. For more details, see [CONTRIBUTING](CONTRIBUTING.md) > [!NOTE] > By default, only basic unit tests are run. Add appropriate labels to enable additional test coverage. 
#### Authorizing CI Runs We use [copy-pr-bot](https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/#automation) to manage authorization of CI runs on NVIDIA's compute resources. - If a pull request is opened by a trusted user and contains only trusted changes, the pull request's code will automatically be copied to a pull-request/ prefixed branch in the source repository (e.g. pull-request/123) - If a pull request is opened by an untrusted user or contains untrusted changes, an NVIDIA org member must leave an `/ok to test` comment on the pull request to trigger CI. This will need to be done for each new commit. ### Pre-submit Checklist - [x] I have tested these changes locally - [ ] I have updated the documentation accordingly - [ ] I have added/updated tests as needed - [ ] All existing tests pass successfully Signed-off-by: John St John <jstjohn@nvidia.com>
1 parent abd0ecb commit e2150f8

2 files changed

Lines changed: 27 additions & 37 deletions

File tree

sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py

Lines changed: 3 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
# TODO add back support for slurm resilience.
2525
# import nvidia_resiliency_ext.ptl_resiliency as res_module
2626
import torch
27-
from lightning.pytorch.callbacks import Callback, LearningRateMonitor, RichModelSummary
27+
from lightning.pytorch.callbacks import LearningRateMonitor, RichModelSummary
2828
from megatron.core.distributed import DistributedDataParallelConfig
2929
from megatron.core.enums import Fp8Recipe
3030
from megatron.core.optimizer import OptimizerConfig
@@ -53,7 +53,7 @@
5353
from bionemo.evo2.models.mamba import MAMBA_MODEL_OPTIONS, MambaModel, mamba_no_weight_decay_cond_with_embeddings
5454
from bionemo.evo2.models.peft import Evo2LoRA
5555
from bionemo.evo2.run.utils import infer_model_type, patch_eden_tokenizer
56-
from bionemo.evo2.utils.callbacks import GarbageCollectAtInferenceTime
56+
from bionemo.evo2.utils.callbacks import GarbageCollectAtInferenceTime, _FirstBatchCudaSync
5757
from bionemo.evo2.utils.config import hyena_no_weight_decay_cond_with_embeddings
5858
from bionemo.evo2.utils.logging.callbacks import TEVCallback
5959
from bionemo.llm.utils.datamodule_utils import infer_global_batch_size
@@ -853,27 +853,6 @@ def train(args: argparse.Namespace) -> nl.Trainer:
853853
TEVCallback(),
854854
]
855855

856-
# First batch CUDA sync callback: adds barriers for the first training batch to avoid race condition
857-
# See https://github.com/NVIDIA/bionemo-framework/issues/1301 for more details.
858-
class _FirstBatchCudaSync(Callback):
859-
def __init__(self):
860-
self._done = False
861-
862-
def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
863-
if not self._done and torch.cuda.is_available():
864-
torch.cuda.synchronize()
865-
866-
def on_after_backward(self, trainer, pl_module):
867-
if not self._done and torch.cuda.is_available():
868-
torch.cuda.synchronize()
869-
870-
def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
871-
if not self._done and torch.cuda.is_available():
872-
torch.cuda.synchronize()
873-
# Unset blocking for subsequent batches
874-
os.environ.pop("CUDA_LAUNCH_BLOCKING", None)
875-
self._done = True
876-
877856
callbacks.append(_FirstBatchCudaSync())
878857

879858
if args.garbage_collect_at_inference:
@@ -1103,15 +1082,6 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
11031082
enable_checkpointing=args.create_checkpoint_callback,
11041083
)
11051084

1106-
# Logger setup
1107-
nemo_logger.setup(
1108-
trainer,
1109-
resume_if_exists=True,
1110-
)
1111-
1112-
if auto_resume is not None:
1113-
auto_resume.setup(trainer, model)
1114-
11151085
# Optimizer and scheduler setup
11161086
opt_config = OptimizerConfig(
11171087
optimizer="adam",
@@ -1139,12 +1109,8 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
11391109
opt = MegatronOptimizerModule(
11401110
opt_config, sched, no_weight_decay_cond=getattr(model_config, "hyena_no_weight_decay_cond_fn", None)
11411111
)
1142-
opt.connect(model)
1143-
1144-
# Remove earlier warmup and hook logic; first-batch blocking is sufficient.
1112+
llm.train(model, data_module, trainer, log=nemo_logger, resume=auto_resume, optim=opt, tokenizer="data")
11451113

1146-
# Start training
1147-
trainer.fit(model, data_module)
11481114
return trainer
11491115

11501116

sub-packages/bionemo-evo2/src/bionemo/evo2/utils/callbacks.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,35 @@
1414
# limitations under the License.
1515

1616
import gc
17+
import os
1718

1819
import torch
1920
from lightning.pytorch import Callback
2021

2122

23+
class _FirstBatchCudaSync(Callback):
    """Synchronize CUDA around the first training batch only.

    TEMPORARY CALLBACK. Remove once bug is fixed.

    Adds barriers for the first training batch to avoid a race condition.
    See https://github.com/NVIDIA/bionemo-framework/issues/1301 for more details.
    """

    def __init__(self) -> None:
        """Create the callback in its armed (first batch not yet seen) state."""
        super().__init__()
        # Becomes True after the first training batch ends; every hook is a
        # no-op from then on.
        self._done = False

    def _sync_if_pending(self) -> bool:
        """Synchronize CUDA if the first batch is still in flight.

        Returns:
            True if ``torch.cuda.synchronize()`` was called, False if the
            callback has already retired or CUDA is unavailable.
        """
        if self._done or not torch.cuda.is_available():
            return False
        torch.cuda.synchronize()
        return True

    def on_train_batch_start(self, trainer, pl_module, batch, batch_idx) -> None:
        """Barrier before the first training batch starts."""
        self._sync_if_pending()

    def on_after_backward(self, trainer, pl_module) -> None:
        """Barrier after the backward pass of the first training batch."""
        self._sync_if_pending()

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) -> None:
        """Barrier after the first training batch, then retire the callback."""
        if self._sync_if_pending():
            # Unset blocking for subsequent batches
            os.environ.pop("CUDA_LAUNCH_BLOCKING", None)
        # Retire unconditionally so CPU-only runs also stop re-checking CUDA
        # availability on every later batch.
        self._done = True
44+
45+
2246
class GarbageCollectAtInferenceTime(Callback):
2347
"""Callback to clean up CUDA memory before validation to prevent initialization errors."""
2448

0 commit comments

Comments
 (0)