NVIDIA-BioNeMo
diff --git a/bionemo-recipes/models/esm2/src/esm/collator.py b/bionemo-recipes/models/esm2/src/esm/collator.py
Lines changed: 134 additions & 55 deletions
diff --git a/bionemo-recipes/models/esm2/tests/test_collator.py b/bionemo-recipes/models/esm2/tests/test_collator.py
Lines changed: 28 additions & 28 deletions
@@ -296,58 +296,6 @@ def _pad_batch_to_multiple_of(self, batch):
         )
 
 
-class MLMDataCollatorWithFlatteningCPAware:
-    """A collator that is aware of context parallelism.
-
-    For the case of context parallelism, padded sequences will be returned from the wrapped collator, and then split into shards for each context parallelism rank.
-
-    The shards are then typically sent to the CPAwareDataloader which will scatter them to the appropriate GPUs.
-    """
-
-    def __init__(self, collator: MLMDataCollatorWithFlattening, cp_world_size: int):
-        """Initialize the MLMDataCollatorWithFlatteningCPAware.
-
-        Args:
-            collator: The collator to use for masking tokens.
-            cp_world_size: The size of the context parallelism group.
-        """
-        self.collator = collator
-        self.cp_world_size = cp_world_size
-
-    def __call__(self, features) -> list[dict[str, Any]]:
-        """Process batches of data and create shards for each context parallelism rank.
-
-        Args:
-            features: List of tokenized sequences, each containing 'input_ids' and optionally 'labels'.
-
-        Returns:
-            A list of dictionaries, each containing a shard of the batch for a given context parallelism rank.
-        """
-        batch = self.collator(features)
-
-        combined_batch = []
-        for cp_rank in range(self.cp_world_size):
-            input_ids_sharded, labels_sharded = split_batch_by_cp_rank(
-                cu_seqlens_padded=batch["cu_seq_lens_q_padded"],
-                input_ids_padded=batch["input_ids"],
-                labels_padded=batch["labels"],
-                qvk_format="thd",
-                cp_rank=cp_rank,
-                cp_world_size=self.cp_world_size,
-            )
-            batch_shard = dict(batch)
-            batch_shard["input_ids"] = input_ids_sharded
-            batch_shard["labels"] = labels_sharded
-            # Now determine the max length of the sequence.
-            seqlens_q = batch_shard["cu_seq_lens_q_padded"][1:] - batch_shard["cu_seq_lens_q_padded"][:-1]
-            batch_shard["max_length_q"] = int((seqlens_q.max().item() + 63) // 64 * 64)
-            batch_shard["max_length_k"] = batch_shard["max_length_q"]
-            batch_shard["pad_between_seqs"] = True
-            combined_batch.append(batch_shard)
-
-        return combined_batch
-
-
 @dataclass
 class DataCollatorWithFlattening(DefaultDataCollator):
     """Data collator for sequence packing with flash attentions cu_seqlens-style attention.
@@ -444,7 +392,7 @@ def __iter__(self):
                     tokens_in_batch = current_length - len(sample["input_ids"])
                     # Calculate how many tokens we can fit from this sample
                     tokens_available = self.max_tokens_per_batch - tokens_in_batch
-                    first_part, remaining_part = split_sample_by_num_tokens(sample, tokens_available)
+                    first_part, remaining_part = _split_sample_by_num_tokens(sample, tokens_available)
                     yield [*samples, first_part]
                     samples = [remaining_part]
 
@@ -460,7 +408,138 @@ def set_epoch(self, epoch: int):
         self.dataset.set_epoch(epoch)
 
 
-def split_sample_by_num_tokens(sample: dict[str, Any], num_tokens: int) -> tuple[dict[str, Any], dict[str, Any]]:
+class DataCollatorForContextParallel:
+    """A collator that is aware of context parallelism.
+
+    For the case of context parallelism, padded sequences will be returned from the wrapped collator, and then split
+    into shards for each context parallelism rank.
+
+    The shards are then typically sent to the ContextParallelDataLoaderWrapper which will scatter them to the
+    appropriate GPUs.
+    """
+
+    def __init__(self, collator: DefaultDataCollator, cp_world_size: int):
+        """Initialize the DataCollatorForContextParallel.
+
+        Args:
+            collator: The wrapped collator; its batch must provide 'input_ids', 'labels', and 'cu_seq_lens_q_padded'.
+            cp_world_size: The size of the context parallelism group.
+        """
+        self.collator = collator
+        self.cp_world_size = cp_world_size
+
+    def __call__(self, features) -> list[dict[str, Any]]:
+        """Process batches of data and create shards for each context parallelism rank.
+
+        Args:
+            features: List of tokenized sequences, each containing 'input_ids' and optionally 'labels'.
+
+        Returns:
+            A list of dictionaries, each containing a shard of the batch for a given context parallelism rank.
+        """
+        batch = self.collator(features)
+
+        combined_batch = []
+        for cp_rank in range(self.cp_world_size):
+            input_ids_sharded, labels_sharded = _split_batch_by_cp_rank(
+                cu_seqlens_padded=batch["cu_seq_lens_q_padded"],
+                input_ids_padded=batch["input_ids"],
+                labels_padded=batch["labels"],
+                qvk_format="thd",
+                cp_rank=cp_rank,
+                cp_world_size=self.cp_world_size,
+            )
+            batch_shard = dict(batch)
+            batch_shard["input_ids"] = input_ids_sharded
+            batch_shard["labels"] = labels_sharded
+            # Now determine the max length of the sequence.
+            seqlens_q = batch_shard["cu_seq_lens_q_padded"][1:] - batch_shard["cu_seq_lens_q_padded"][:-1]
+            batch_shard["max_length_q"] = int((seqlens_q.max().item() + 63) // 64 * 64)
+            batch_shard["max_length_k"] = batch_shard["max_length_q"]
+            batch_shard["pad_between_seqs"] = True
+            combined_batch.append(batch_shard)
+
+        return combined_batch
+
+
+class ContextParallelDataLoaderWrapper:
+    """A dataloader that is aware of context parallelism."""
+
+    def __init__(
+        self,
+        dataloader: torch.utils.data.DataLoader,
+        cp_group: torch.distributed.ProcessGroup,
+        cp_rank: int,
+    ):
+        """A dataloader wrapper that distributes the data across the context parallelism group.
+
+        This class will get the batch from the dataloader on CP rank 0, and then determine the shards for all the
+        different CP group members. Then it will scatter the shards to the different CP group members. The shards are
+        then returned to the caller for the current CP rank.
+
+        Args:
+            dataloader: The dataloader to use.
+            cp_group: The context parallel group.
+            cp_rank: The rank of the current context parallel process.
+        """
+        self.dataloader = dataloader
+        self.cp_rank = cp_rank
+        self.cp_group = cp_group
+        self.num_cp_ranks = cp_group.size()
+        self._iterator = None
+
+    def __iter__(self):
+        """Make the dataloader iterable."""
+        self._iterator = iter(self.dataloader)  # Each item is the wrapped dataloader's collator output.
+        return self
+
+    def __next__(self):
+        """Get the batch from the dataloader for the current CP rank."""
+        batch = self._send_data_to_cp_ranks()
+        return batch
+
+    def _send_data_to_cp_ranks(self):
+        """Send data to all the CP ranks.
+
+        This function will get the batch from the dataloader on CP rank 0, and then determine
+        the shards for all the different CP group members.
+        combined_batch = [<cp_rank_0_shard>, <cp_rank_1_shard>, ..., <cp_rank_n_shard>]
+        Then it will scatter the shards to the different CP group members.
+        Each rank receives only its own shard, which is returned to the caller as the batch
+        for the current CP rank.
+
+        Scalability:
+            Rank 0's work grows linearly with CP size, but the other ranks do not need to store all the shards so they do not
+            grow linearly with CP size.
+
+        Args:
+            None
+
+        Returns:
+            batch: The batch for the current CP rank.
+
+        """
+        if self.cp_rank == 0:
+            # Only CP rank 0 reads the dataloader; each item should be a list with one shard per CP rank.
+            if self._iterator is None:
+                self._iterator = iter(self.dataloader)
+            combined_batch = next(self._iterator)
+
+        else:
+            combined_batch = None
+
+        scatter_object_output_list = [None]
+        # Blocking: no async_op handle. NOTE(review): other ranks wait here if rank 0 hit StopIteration; confirm.
+        torch.distributed.scatter_object_list(
+            scatter_object_output_list=scatter_object_output_list,
+            scatter_object_input_list=combined_batch,
+            group=self.cp_group,
+            group_src=0,
+        )
+        return scatter_object_output_list[0]
+
+
+def _split_sample_by_num_tokens(sample: dict[str, Any], num_tokens: int) -> tuple[dict[str, Any], dict[str, Any]]:
     """Split a sample dictionary at a specified number of tokens.
 
     This function splits a sample into two parts: the first part contains exactly `num_tokens` tokens,
@@ -615,7 +694,7 @@ def _pt_pad_to_multiple_of(batch: dict[str, Any], pad_to_multiple_of: int, token
 
 # TODO(@jomitchell): Once this gets merged: https://github.com/NVIDIA/TransformerEngine/pull/2387
 # we can replace this with the one in TransformerEngine.
-def split_batch_by_cp_rank(
+def _split_batch_by_cp_rank(
     cu_seqlens_padded: torch.Tensor,
     input_ids_padded: torch.Tensor,
     labels_padded: torch.Tensor,
 
@@ -23,7 +23,7 @@
     DataCollatorWithFlattening,
     MLMDataCollatorWithFlattening,
     TokenPackingDataset,
-    split_sample_by_num_tokens,
+    _split_sample_by_num_tokens,
 )
 
 
@@ -494,36 +494,36 @@ def __iter__(self):
     assert sum(len(sample["input_ids"]) for sample in batches[0]) == 90
 
 
-def test_split_sample_by_num_tokens_basic():
-    """Test split_sample_by_num_tokens with basic input_ids."""
+def test__split_sample_by_num_tokens_basic():
+    """Test _split_sample_by_num_tokens with basic input_ids."""
     sample = {"input_ids": [0, 5, 6, 7, 8, 9, 2]}
-    first, remaining = split_sample_by_num_tokens(sample, 3)
+    first, remaining = _split_sample_by_num_tokens(sample, 3)
 
     assert first["input_ids"] == [0, 5, 6]
     assert remaining["input_ids"] == [7, 8, 9, 2]
     assert len(first["input_ids"]) == 3
     assert len(remaining["input_ids"]) == 4
 
 
-def test_split_sample_by_num_tokens_with_labels():
-    """Test split_sample_by_num_tokens with input_ids and labels."""
+def test__split_sample_by_num_tokens_with_labels():
+    """Test _split_sample_by_num_tokens with input_ids and labels."""
     sample = {"input_ids": [0, 5, 6, 7, 8, 2], "labels": [0, 5, 6, 7, 8, 2]}
-    first, remaining = split_sample_by_num_tokens(sample, 3)
+    first, remaining = _split_sample_by_num_tokens(sample, 3)
 
     assert first["input_ids"] == [0, 5, 6]
     assert first["labels"] == [0, 5, 6]
     assert remaining["input_ids"] == [7, 8, 2]
     assert remaining["labels"] == [7, 8, 2]
 
 
-def test_split_sample_by_num_tokens_with_attention_mask():
-    """Test split_sample_by_num_tokens with input_ids, attention_mask, and labels."""
+def test__split_sample_by_num_tokens_with_attention_mask():
+    """Test _split_sample_by_num_tokens with input_ids, attention_mask, and labels."""
     sample = {
         "input_ids": [0, 5, 6, 7, 8, 2],
         "attention_mask": [1, 1, 1, 1, 1, 1],
         "labels": [0, 5, 6, 7, 8, 2],
     }
-    first, remaining = split_sample_by_num_tokens(sample, 4)
+    first, remaining = _split_sample_by_num_tokens(sample, 4)
 
     assert first["input_ids"] == [0, 5, 6, 7]
     assert first["attention_mask"] == [1, 1, 1, 1]
@@ -533,14 +533,14 @@ def test_split_sample_by_num_tokens_with_attention_mask():
     assert remaining["labels"] == [8, 2]
 
 
-def test_split_sample_by_num_tokens_with_token_type_ids():
-    """Test split_sample_by_num_tokens with token_type_ids."""
+def test__split_sample_by_num_tokens_with_token_type_ids():
+    """Test _split_sample_by_num_tokens with token_type_ids."""
     sample = {
         "input_ids": [0, 5, 6, 7, 8, 2],
         "token_type_ids": [0, 0, 0, 1, 1, 1],
         "labels": [0, 5, 6, 7, 8, 2],
     }
-    first, remaining = split_sample_by_num_tokens(sample, 3)
+    first, remaining = _split_sample_by_num_tokens(sample, 3)
 
     assert first["input_ids"] == [0, 5, 6]
     assert first["token_type_ids"] == [0, 0, 0]
@@ -550,14 +550,14 @@ def test_split_sample_by_num_tokens_with_token_type_ids():
     assert remaining["labels"] == [7, 8, 2]
 
 
-def test_split_sample_by_num_tokens_with_token_type():
-    """Test split_sample_by_num_tokens with token_type (alternative name)."""
+def test__split_sample_by_num_tokens_with_token_type():
+    """Test _split_sample_by_num_tokens with token_type (alternative name)."""
     sample = {
         "input_ids": [0, 5, 6, 7, 8, 2],
         "token_type": [0, 0, 0, 1, 1, 1],
         "labels": [0, 5, 6, 7, 8, 2],
     }
-    first, remaining = split_sample_by_num_tokens(sample, 3)
+    first, remaining = _split_sample_by_num_tokens(sample, 3)
 
     assert first["input_ids"] == [0, 5, 6]
     assert first["token_type"] == [0, 0, 0]
@@ -567,14 +567,14 @@ def test_split_sample_by_num_tokens_with_token_type():
     assert remaining["labels"] == [7, 8, 2]
 
 
-def test_split_sample_by_num_tokens_with_tensors():
-    """Test split_sample_by_num_tokens with torch tensors."""
+def test__split_sample_by_num_tokens_with_tensors():
+    """Test _split_sample_by_num_tokens with torch tensors."""
     sample = {
         "input_ids": torch.tensor([0, 5, 6, 7, 8, 2]),
         "attention_mask": torch.tensor([1, 1, 1, 1, 1, 1]),
         "labels": torch.tensor([0, 5, 6, 7, 8, 2]),
     }
-    first, remaining = split_sample_by_num_tokens(sample, 3)
+    first, remaining = _split_sample_by_num_tokens(sample, 3)
 
     assert torch.equal(first["input_ids"], torch.tensor([0, 5, 6]))
     assert torch.equal(first["attention_mask"], torch.tensor([1, 1, 1]))
@@ -584,14 +584,14 @@ def test_split_sample_by_num_tokens_with_tensors():
     assert torch.equal(remaining["labels"], torch.tensor([7, 8, 2]))
 
 
-def test_split_sample_by_num_tokens_with_metadata():
-    """Test split_sample_by_num_tokens preserves non-sequence fields."""
+def test__split_sample_by_num_tokens_with_metadata():
+    """Test _split_sample_by_num_tokens preserves non-sequence fields."""
     sample = {
         "input_ids": [0, 5, 6, 7, 8, 2],
         "labels": [0, 5, 6, 7, 8, 2],
         "metadata": {"id": 123, "source": "test"},
     }
-    first, remaining = split_sample_by_num_tokens(sample, 3)
+    first, remaining = _split_sample_by_num_tokens(sample, 3)
 
     # Sequence fields should be split
     assert first["input_ids"] == [0, 5, 6]
@@ -602,23 +602,23 @@ def test_split_sample_by_num_tokens_with_metadata():
     assert remaining["metadata"] == {"id": 123, "source": "test"}
 
 
-def test_split_sample_by_num_tokens_errors():
-    """Test split_sample_by_num_tokens raises errors for invalid inputs."""
+def test__split_sample_by_num_tokens_errors():
+    """Test _split_sample_by_num_tokens raises errors for invalid inputs."""
     sample = {"input_ids": [0, 5, 6, 7, 2]}
 
     # num_tokens >= sample_length should raise ValueError
     with pytest.raises(ValueError, match="num_tokens.*must be less than sample length"):
-        split_sample_by_num_tokens(sample, 5)
+        _split_sample_by_num_tokens(sample, 5)
 
     with pytest.raises(ValueError, match="num_tokens.*must be less than sample length"):
-        split_sample_by_num_tokens(sample, 10)
+        _split_sample_by_num_tokens(sample, 10)
 
     # num_tokens <= 0 should raise ValueError
     with pytest.raises(ValueError, match="num_tokens.*must be positive"):
-        split_sample_by_num_tokens(sample, 0)
+        _split_sample_by_num_tokens(sample, 0)
 
     with pytest.raises(ValueError, match="num_tokens.*must be positive"):
-        split_sample_by_num_tokens(sample, -1)
+        _split_sample_by_num_tokens(sample, -1)
 
 
 def test_token_packing_dataset_with_split_samples():