From 607b19e7b7cc7e5edc28e075fb502e1e8bb612c9 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 7 Apr 2025 16:10:30 +0000
Subject: [PATCH 1/3] Bump 3rdparty/NeMo from `b685967` to `7d7a10c`

Bumps [3rdparty/NeMo](https://github.com/NVIDIA/NeMo) from `b685967` to `7d7a10c`.
- [Release notes](https://github.com/NVIDIA/NeMo/releases)
- [Commits](https://github.com/NVIDIA/NeMo/compare/b685967f9512e1906e11fbd95048ff0fb05ff2fe...7d7a10c1f85404959a62b026d9e1acdabba16302)

---
updated-dependencies:
- dependency-name: 3rdparty/NeMo
  dependency-version: 7d7a10c1f85404959a62b026d9e1acdabba16302
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 3rdparty/NeMo | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/NeMo b/3rdparty/NeMo
index b685967f95..7d7a10c1f8 160000
--- a/3rdparty/NeMo
+++ b/3rdparty/NeMo
@@ -1 +1 @@
-Subproject commit b685967f9512e1906e11fbd95048ff0fb05ff2fe
+Subproject commit 7d7a10c1f85404959a62b026d9e1acdabba16302

From 3ac72f67da0dcafb6ff7cb4875d8b8ebddbaf692 Mon Sep 17 00:00:00 2001
From: "Peter St. John" <pstjohn@nvidia.com>
Date: Mon, 7 Apr 2025 12:15:15 -0700
Subject: [PATCH 2/3] remove call to context_parallel loss

Signed-off-by: Peter St. John <pstjohn@nvidia.com>
---
 .../model/finetune_token_regressor.py         | 17 ++---------------
 .../bionemo-llm/src/bionemo/llm/model/loss.py | 19 +++----------------
 2 files changed, 5 insertions(+), 31 deletions(-)

diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/model/finetune_token_regressor.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/model/finetune_token_regressor.py
index ee47fd7fc3..bdea3691a0 100644
--- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/model/finetune_token_regressor.py
+++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/model/finetune_token_regressor.py
@@ -24,10 +24,7 @@
 from nemo.collections.llm.peft.lora import LoRA, LoRALinear
 from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ParallelLinearAdapter
 from nemo.collections.nlp.modules.common.megatron.utils import average_losses_across_data_parallel_group
-from nemo.lightning.megatron_parallel import (
-    masked_token_loss,
-    masked_token_loss_context_parallel,
-)
+from nemo.lightning.megatron_parallel import masked_token_loss
 from torch import Tensor, nn
 
 from bionemo.llm.model.biobert.model import BioBertConfig, BioBertOutput, MegatronBioBertModel
@@ -102,17 +99,7 @@ def forward(
         # TODO(@jstjohn) also handle different output keys, like the sequence loss.
 
         cp_size = parallel_state.get_context_parallel_world_size()
-        if cp_size == 1:
-            # reduce the loss across the micro batch
-            loss_for_microbatch = masked_token_loss(unreduced_token_loss, batch["loss_mask"])
-        else:
-            # reduce the loss across the micro batch.
-            # TODO(@jomitchell): Figure out who defines "num_valid_tokens_in_ub" in the batch and document/understand this.
-            #  This has something to do with context parallel, and there is probably a megatron or nemo function that adds this and
-            #  other necessary keys to the batch. Thanks!
-            loss_for_microbatch = masked_token_loss_context_parallel(
-                unreduced_token_loss, batch["loss_mask"], batch["num_valid_tokens_in_ub"]
-            )
+        loss_for_microbatch = masked_token_loss(unreduced_token_loss, batch["loss_mask"], cp_size)
 
         # If we do not drop the last partial batch of validation, we need to do fancy reduction handling to support
         #  reducing the loss across the data parallel group.
diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/model/loss.py b/sub-packages/bionemo-llm/src/bionemo/llm/model/loss.py
index 272ab27829..e04619eda7 100644
--- a/sub-packages/bionemo-llm/src/bionemo/llm/model/loss.py
+++ b/sub-packages/bionemo-llm/src/bionemo/llm/model/loss.py
@@ -19,11 +19,7 @@
 from megatron.core import parallel_state, tensor_parallel
 from megatron.core.fusions.fused_cross_entropy import fused_vocab_parallel_cross_entropy
 from nemo.collections.nlp.modules.common.megatron.utils import average_losses_across_data_parallel_group
-from nemo.lightning.megatron_parallel import (
-    MegatronLossReduction,
-    masked_token_loss,
-    masked_token_loss_context_parallel,
-)
+from nemo.lightning.megatron_parallel import MegatronLossReduction, masked_token_loss
 from torch import Tensor
 
 
@@ -181,17 +177,8 @@ def forward(
 
         # compute loss
         cp_size = parallel_state.get_context_parallel_world_size()
-        if cp_size == 1:
-            # reduce the loss across the micro batch per valid token
-            loss_for_microbatch = masked_token_loss(unreduced_token_loss, batch["loss_mask"])
-        else:
-            # reduce the loss across the micro batch per valid token.
-            # TODO(@jomitchell): Figure out who defines "num_valid_tokens_in_ub" in the batch and document/understand this.
-            #  This has something to do with context parallel, and there is probably a megatron or nemo function that adds this and
-            #  other necessary keys to the batch. Thanks!
-            loss_for_microbatch = masked_token_loss_context_parallel(
-                unreduced_token_loss, batch["loss_mask"], batch["num_valid_tokens_in_ub"]
-            )
+        # reduce the loss across the micro batch per valid token
+        loss_for_microbatch = masked_token_loss(unreduced_token_loss, batch["loss_mask"], cp_size)
 
         # If we do not drop the last partial batch of validation, we need to do fancy reduction handling to support
         #  reducing the loss across the data parallel group.

From aad698729ef47cfecbc863a4c4c088453de59742 Mon Sep 17 00:00:00 2001
From: "Peter St. John" <pstjohn@nvidia.com>
Date: Tue, 15 Apr 2025 09:12:34 -0700
Subject: [PATCH 3/3] bump NeMo to TOT

Signed-off-by: Peter St. John <pstjohn@nvidia.com>
---
 3rdparty/NeMo | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/NeMo b/3rdparty/NeMo
index 7d7a10c1f8..d0fc65838b 160000
--- a/3rdparty/NeMo
+++ b/3rdparty/NeMo
@@ -1 +1 @@
-Subproject commit 7d7a10c1f85404959a62b026d9e1acdabba16302
+Subproject commit d0fc65838ba2bc6f2506da742d5af427acfd0747