NVIDIA-BioNeMo
diff --git a/‎bionemo-recipes/models/esm2/src/esm/collator.py‎
Lines changed: 20 additions & 16 deletions b/‎bionemo-recipes/models/esm2/src/esm/collator.py‎
Lines changed: 20 additions & 16 deletions
diff --git a/‎bionemo-recipes/models/esm2/tests/test_collator_context_parallel.py‎
Lines changed: 106 additions & 0 deletions b/‎bionemo-recipes/models/esm2/tests/test_collator_context_parallel.py‎
Lines changed: 106 additions & 0 deletions
@@ -275,6 +275,7 @@ def set_epoch(self, epoch: int):
         self.dataset.set_epoch(epoch)
 
 
+@dataclass
 class DataCollatorForContextParallel:
     """A collator that is aware of context parallelism.
 
@@ -285,15 +286,9 @@ class DataCollatorForContextParallel:
     appropriate GPUs.
     """
 
-    def __init__(self, collator: DataCollator, cp_world_size: int):
-        """Initialize the DataCollatorForContextParallel.
-
-        Args:
-            collator: The collator to use for masking tokens.
-            cp_world_size: The size of the context parallelism group.
-        """
-        self.collator = collator
-        self.cp_world_size = cp_world_size
+    collator: DataCollator
+    cp_world_size: int
+    qkv_format: str = "thd"
 
     def __call__(self, features) -> list[dict[str, Any]]:
         """Process batches of data and create shards for each context parallelism rank.
@@ -309,21 +304,29 @@ def __call__(self, features) -> list[dict[str, Any]]:
         combined_batch = []
         for cp_rank in range(self.cp_world_size):
             input_ids_sharded, labels_sharded = _split_batch_by_cp_rank(
-                cu_seqlens_padded=batch["cu_seq_lens_q_padded"],
+                cu_seqlens_padded=batch.get("cu_seq_lens_q_padded", None),  # This will be None for BSHD format.
                 input_ids_padded=batch["input_ids"],
                 labels_padded=batch["labels"],
-                qvk_format="thd",
+                qvk_format=self.qkv_format,
                 cp_rank=cp_rank,
                 cp_world_size=self.cp_world_size,
             )
             batch_shard = dict(batch)
             batch_shard["input_ids"] = input_ids_sharded
             batch_shard["labels"] = labels_sharded
             # Now determine the max length of the sequence.
-            seqlens_q = batch_shard["cu_seq_lens_q_padded"][1:] - batch_shard["cu_seq_lens_q_padded"][:-1]
-            batch_shard["max_length_q"] = int((seqlens_q.max().item() + 63) // 64 * 64)
-            batch_shard["max_length_k"] = batch_shard["max_length_q"]
-            batch_shard["pad_between_seqs"] = True
+            if self.qkv_format == "thd":
+                seqlens_q = batch_shard["cu_seq_lens_q_padded"][1:] - batch_shard["cu_seq_lens_q_padded"][:-1]
+                max_length = seqlens_q.max().item()
+                batch_shard["pad_between_seqs"] = True
+            elif self.qkv_format == "bshd":
+                max_length = batch["input_ids"].shape[1]
+                # For BSHD context parallelism, we can't handle padding, so we remove the attention mask.
+                del batch_shard["attention_mask"]
+            else:
+                raise ValueError(f"Unsupported qvk_format: {self.qkv_format}!")
+            # Round the max length up to the next multiple of 64 (e.g. 9 -> 64, 100 -> 128), as before this change.
+            batch_shard["max_length_k"] = batch_shard["max_length_q"] = int((max_length + 63) // 64 * 64)
             combined_batch.append(batch_shard)
 
         return combined_batch
@@ -727,7 +730,7 @@ def process_tensor_bshd(val):
 
 
 class BatchType(TypedDict):
-    """The fields in the batch dictionary for context parallel."""
+    """The fields in the batch dictionary for THD context parallel."""
 
     input_ids: torch.Tensor
     labels: torch.Tensor
@@ -737,6 +740,7 @@ class BatchType(TypedDict):
     cu_seq_lens_k_padded: torch.Tensor
     max_length_q: int
     max_length_k: int
+    pad_between_seqs: bool
 
 
 def _scatter_batch_to_cp_ranks(
 
@@ -17,12 +17,15 @@
 from typing import Dict, Iterator, List
 from unittest import mock
 
+import pytest
 import torch
 from transformer_engine.pytorch.attention.dot_product_attention.context_parallel import pad_thd_sequences_for_cp
 from transformers import DataCollatorForLanguageModeling
 
 from esm.collator import (
+    BatchType,
     ContextParallelDataLoaderWrapper,
+    DataCollatorForContextParallel,
     DataCollatorWithFlattening,
     _split_batch_by_cp_rank,
 )
@@ -887,3 +890,106 @@ def test_bshd_and_thd_equivalence(tokenizer):
         torch.sort(batch_bshd["input_ids"][1])[0],
         msg="Reconstructed sequence 2 doesn't match original",
     )
+
+
+@pytest.mark.parametrize("cp_world_size", [2, 4])
+def test_data_collator_for_context_parallel_returns_correct_list_size(tokenizer, cp_world_size):
+    """Test that DataCollatorForContextParallel returns one batch shard per context-parallel rank."""
+    divisibility_factor = 2 * cp_world_size
+
+    # Wrap a masked-LM collator in DataCollatorWithFlattening with pad_sequences_to_be_divisible_by set.
+    base_collator = DataCollatorWithFlattening(
+        collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15),
+        pad_sequences_to_be_divisible_by=divisibility_factor,
+    )
+
+    # Create the context parallel collator (qkv_format is left at its "thd" default).
+    cp_collator = DataCollatorForContextParallel(collator=base_collator, cp_world_size=cp_world_size)
+
+    # Two short test sequences of different lengths.
+    features = [
+        {"input_ids": [0, 5, 6, 7, 8, 9, 10, 2]},  # 8 tokens
+        {"input_ids": [0, 11, 12, 13, 14, 15, 16, 17, 2]},  # 9 tokens
+    ]
+
+    # Call the collator
+    result = cp_collator(features)
+
+    # The result must be a list with exactly one entry per CP rank.
+    assert isinstance(result, list), f"Expected list, got {type(result)}"
+    assert len(result) == cp_world_size, f"Expected list of size {cp_world_size}, got {len(result)}"
+
+
+def test_data_collator_for_context_parallel_thd(tokenizer):
+    """Test that each THD shard from DataCollatorForContextParallel has exactly the keys declared in BatchType."""
+
+    cp_world_size = 2
+    divisibility_factor = 2 * cp_world_size
+
+    # Wrap a masked-LM collator in DataCollatorWithFlattening with pad_sequences_to_be_divisible_by set.
+    base_collator = DataCollatorWithFlattening(
+        collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15),
+        pad_sequences_to_be_divisible_by=divisibility_factor,
+    )
+
+    # Create the context parallel collator (qkv_format is left at its "thd" default).
+    cp_collator = DataCollatorForContextParallel(collator=base_collator, cp_world_size=cp_world_size)
+
+    # Two short test sequences of different lengths.
+    features = [
+        {"input_ids": [0, 5, 6, 7, 8, 9, 10, 2]},  # 8 tokens
+        {"input_ids": [0, 11, 12, 13, 14, 15, 16, 17, 2]},  # 9 tokens
+    ]
+
+    # Call the collator
+    result = cp_collator(features)
+
+    assert len(result) == cp_world_size, f"Expected list of size {cp_world_size}, got {len(result)}"
+
+    # The expected keys are the fields declared on the BatchType TypedDict.
+    required_keys = set(BatchType.__annotations__.keys())
+
+    # Each shard must match exactly; report the symmetric difference so missing AND extra keys both show up.
+    for cp_rank, shard in enumerate(result):
+        assert set(shard.keys()) == required_keys, (
+            f"CP rank {cp_rank}: difference: {set(shard.keys()) ^ required_keys}"
+        )
+
+
+def test_data_collator_for_context_parallel_bshd(tokenizer):
+    """Test that each BSHD shard from DataCollatorForContextParallel has only input_ids, labels and max-length keys."""
+
+    cp_world_size = 2
+    divisibility_factor = 2 * cp_world_size
+
+    # Create the wrapped collator: a plain masked-LM collator (no flattening) using pad_to_multiple_of.
+    base_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm_probability=0.15,
+        pad_to_multiple_of=divisibility_factor,
+    )
+
+    # Create the context parallel collator in "bshd" mode
+    cp_collator = DataCollatorForContextParallel(
+        collator=base_collator, cp_world_size=cp_world_size, qkv_format="bshd"
+    )
+
+    # Two short test sequences of different lengths.
+    features = [
+        {"input_ids": [0, 5, 6, 7, 8, 9, 10, 2]},  # 8 tokens
+        {"input_ids": [0, 11, 12, 13, 14, 15, 16, 17, 2]},  # 9 tokens
+    ]
+
+    # Call the collator
+    result = cp_collator(features)
+
+    assert len(result) == cp_world_size, f"Expected list of size {cp_world_size}, got {len(result)}"
+
+    # In "bshd" mode the collator deletes attention_mask, so only these keys should remain.
+    required_keys = {"input_ids", "labels", "max_length_q", "max_length_k"}
+
+    # Assert each shard has exactly the expected keys
+    for cp_rank, shard in enumerate(result):
+        assert set(shard.keys()) == required_keys, (
+            f"CP rank {cp_rank}: expected keys {required_keys}, got {set(shard.keys())}"
+        )