NVIDIA-BioNeMo
diff --git a/‎bionemo-recipes/models/esm2/src/esm/collator.py‎
Lines changed: 95 additions & 4 deletions b/‎bionemo-recipes/models/esm2/src/esm/collator.py‎
Lines changed: 95 additions & 4 deletions
diff --git a/‎bionemo-recipes/models/esm2/tests/test_collator.py‎
Lines changed: 216 additions & 1 deletion b/‎bionemo-recipes/models/esm2/tests/test_collator.py‎
Lines changed: 216 additions & 1 deletion
@@ -411,21 +411,44 @@ class TokenPackingDataset(torch.utils.data.IterableDataset):
     """Maximum number of tokens per batch."""
     drop_last: bool = True
     """Whether to drop the last batch if it's less than max_length."""
+    split_samples: bool = False
+    """Whether to split samples to ensure batches have exactly max_tokens_per_batch tokens."""
 
     def __iter__(self):
         """Yield batches of samples, each with a variable number of tokens up to the maximum length.
 
+        When split_samples=True, ensures each batch has exactly max_tokens_per_batch by splitting
+        the final sample if needed. The remaining tokens from the split sample start the next batch.
+
         Returns:
             A generator of batches of samples, each with a variable number of tokens up to the maximum length.
         """
         samples = []
         current_length = 0
         for sample in iter(self.dataset):
             current_length += len(sample["input_ids"])
-            if current_length > self.max_tokens_per_batch:
-                yield samples
-                samples = [sample]
-                current_length = len(sample["input_ids"])
+            if current_length == self.max_tokens_per_batch:
+                yield [*samples, sample]
+                samples = []
+                current_length = 0
+
+            elif current_length > self.max_tokens_per_batch:
+                if not self.split_samples:
+                    # If we are not splitting samples, we can just yield the current batch (before this sample) and
+                    # start a new one.
+                    yield samples
+                    samples = [sample]
+
+                else:
+                    # Calculate how many tokens are already in the batch
+                    tokens_in_batch = current_length - len(sample["input_ids"])
+                    # Calculate how many tokens we can fit from this sample
+                    tokens_available = self.max_tokens_per_batch - tokens_in_batch
+                    first_part, remaining_part = split_sample_by_num_tokens(sample, tokens_available)
+                    yield [*samples, first_part]
+                    samples = [remaining_part]
+
+                current_length = len(samples[0]["input_ids"])
             else:
                 samples.append(sample)
 
@@ -437,6 +460,74 @@ def set_epoch(self, epoch: int):
         self.dataset.set_epoch(epoch)
 
 
+# Keys holding one entry per token; these are cut at the split point. Every other key is treated as
+# per-sample metadata and is shared by both halves.
+_SEQUENCE_FIELDS = frozenset({"input_ids", "attention_mask", "token_type_ids", "token_type", "labels"})
+
+
+def split_sample_by_num_tokens(sample: dict[str, Any], num_tokens: int) -> tuple[dict[str, Any], dict[str, Any]]:
+    """Split a sample dictionary at a specified number of tokens.
+
+    The first returned dictionary holds the first `num_tokens` entries of every sequence field
+    (input_ids, attention_mask, token_type_ids, token_type, labels); the second holds the rest.
+    Tensor slices are cloned so the two halves do not share storage with the input. Any other key,
+    and any sequence field whose value cannot be sliced (for example a scalar or 0-dim tensor label),
+    is placed unchanged (same object) in both halves.
+
+    Args:
+        sample: Dictionary containing sample data. Must contain "input_ids", whose length defines the sample length.
+        num_tokens: Number of tokens to include in the first part of the split.
+
+    Returns:
+        A tuple of two dictionaries: (first_part, remaining_part), where:
+        - first_part contains the first `num_tokens` tokens from each sequence field
+        - remaining_part contains the remaining tokens from each sequence field
+
+    Raises:
+        ValueError: If `num_tokens` is not strictly between 0 and the length of `sample["input_ids"]`.
+
+    Example:
+        >>> sample = {
+        ...     "input_ids": [0, 5, 6, 7, 8, 9, 2],
+        ...     "attention_mask": [1, 1, 1, 1, 1, 1, 1],
+        ...     "labels": [0, 5, 6, 7, 8, 9, 2],
+        ... }
+        >>> first, remaining = split_sample_by_num_tokens(sample, 3)
+        >>> first["input_ids"]
+        [0, 5, 6]
+        >>> remaining["input_ids"]
+        [7, 8, 9, 2]
+    """
+    sample_length = len(sample["input_ids"])
+    if num_tokens >= sample_length:
+        raise ValueError(
+            f"num_tokens ({num_tokens}) must be less than sample length ({sample_length}) to split the sample"
+        )
+    if num_tokens <= 0:
+        raise ValueError(f"num_tokens ({num_tokens}) must be positive")
+
+    first_part: dict[str, Any] = {}
+    remaining_part: dict[str, Any] = {}
+
+    for key, value in sample.items():
+        if key not in _SEQUENCE_FIELDS:
+            # Metadata fields are not split; both halves reference the same value.
+            first_part[key] = value
+            remaining_part[key] = value
+            continue
+
+        try:
+            head, tail = value[:num_tokens], value[num_tokens:]
+        except (TypeError, IndexError, KeyError):
+            # Not sliceable: scalars raise TypeError, 0-dim tensors raise IndexError, and mappings raise
+            # KeyError on Python >= 3.12 (where slice objects are hashable). Keep the value in both halves.
+            head = tail = value
+        else:
+            if isinstance(value, torch.Tensor):
+                # Slicing a tensor returns a view; clone so the halves are independent of the input.
+                head, tail = head.clone(), tail.clone()
+
+        first_part[key] = head
+        remaining_part[key] = tail
+
+    return first_part, remaining_part
+
+
 def _pt_flatten_collate(features: list[dict[str, list[int]]], return_position_ids: bool = False):
     is_labels_provided = "labels" in features[0]
     sample_lengths = [len(sample["input_ids"]) for sample in features]
 
@@ -15,10 +15,16 @@
 
 from unittest.mock import MagicMock
 
+import pytest
 import torch
 from transformers import DataCollatorForLanguageModeling
 
-from esm.collator import DataCollatorWithFlattening, MLMDataCollatorWithFlattening, TokenPackingDataset
+from esm.collator import (
+    DataCollatorWithFlattening,
+    MLMDataCollatorWithFlattening,
+    TokenPackingDataset,
+    split_sample_by_num_tokens,
+)
 
 
 def test_data_collator_with_flattening_basic():
@@ -486,3 +492,212 @@ def __iter__(self):
     assert len(batches) == 1
     assert len(batches[0]) == 3
     assert sum(len(sample["input_ids"]) for sample in batches[0]) == 90
+
+
+def test_split_sample_by_num_tokens_basic():
+    """Split a sample that only carries input_ids and check both halves."""
+    head, tail = split_sample_by_num_tokens({"input_ids": [0, 5, 6, 7, 8, 9, 2]}, 3)
+    head_ids = head["input_ids"]
+    tail_ids = tail["input_ids"]
+
+    # Contents first, then the 3 / 4 length split of the 7-token sample.
+    assert head_ids == [0, 5, 6]
+    assert tail_ids == [7, 8, 9, 2]
+    assert len(head_ids) == 3
+    assert len(tail_ids) == 4
+
+
+def test_split_sample_by_num_tokens_with_labels():
+    """Check that input_ids and labels are cut at the same position."""
+    token_ids = [0, 5, 6, 7, 8, 2]
+    head, tail = split_sample_by_num_tokens({"input_ids": token_ids, "labels": list(token_ids)}, 3)
+
+    # Labels mirror input_ids in this sample, so both fields share the same expected halves.
+    for field in ("input_ids", "labels"):
+        assert head[field] == [0, 5, 6]
+    for field in ("input_ids", "labels"):
+        assert tail[field] == [7, 8, 2]
+
+
+def test_split_sample_by_num_tokens_with_attention_mask():
+    """Split a sample carrying input_ids, attention_mask and labels after four tokens."""
+    head, tail = split_sample_by_num_tokens(
+        {
+            "input_ids": [0, 5, 6, 7, 8, 2],
+            "attention_mask": [1, 1, 1, 1, 1, 1],
+            "labels": [0, 5, 6, 7, 8, 2],
+        },
+        4,
+    )
+
+    expected_head = {"input_ids": [0, 5, 6, 7], "attention_mask": [1, 1, 1, 1], "labels": [0, 5, 6, 7]}
+    expected_tail = {"input_ids": [8, 2], "attention_mask": [1, 1], "labels": [8, 2]}
+
+    for field, expected in expected_head.items():
+        assert head[field] == expected
+    for field, expected in expected_tail.items():
+        assert tail[field] == expected
+
+
+def test_split_sample_by_num_tokens_with_token_type_ids():
+    """Check that token_type_ids is split together with input_ids and labels."""
+    head, tail = split_sample_by_num_tokens(
+        {
+            "input_ids": [0, 5, 6, 7, 8, 2],
+            "token_type_ids": [0, 0, 0, 1, 1, 1],
+            "labels": [0, 5, 6, 7, 8, 2],
+        },
+        3,
+    )
+
+    # The split point coincides with the 0 -> 1 change in token_type_ids.
+    expected_head = {"input_ids": [0, 5, 6], "token_type_ids": [0, 0, 0], "labels": [0, 5, 6]}
+    expected_tail = {"input_ids": [7, 8, 2], "token_type_ids": [1, 1, 1], "labels": [7, 8, 2]}
+
+    for field, expected in expected_head.items():
+        assert head[field] == expected
+    for field, expected in expected_tail.items():
+        assert tail[field] == expected
+
+
+def test_split_sample_by_num_tokens_with_token_type():
+    """Check that the alternative key name `token_type` is split like the other sequence fields."""
+    head, tail = split_sample_by_num_tokens(
+        {
+            "input_ids": [0, 5, 6, 7, 8, 2],
+            "token_type": [0, 0, 0, 1, 1, 1],
+            "labels": [0, 5, 6, 7, 8, 2],
+        },
+        3,
+    )
+
+    expected_head = {"input_ids": [0, 5, 6], "token_type": [0, 0, 0], "labels": [0, 5, 6]}
+    expected_tail = {"input_ids": [7, 8, 2], "token_type": [1, 1, 1], "labels": [7, 8, 2]}
+
+    for field, expected in expected_head.items():
+        assert head[field] == expected
+    for field, expected in expected_tail.items():
+        assert tail[field] == expected
+
+
+def test_split_sample_by_num_tokens_with_tensors():
+    """Check that tensor-valued fields are split the same way as list-valued ones."""
+    token_ids = [0, 5, 6, 7, 8, 2]
+    head, tail = split_sample_by_num_tokens(
+        {
+            "input_ids": torch.tensor(token_ids),
+            "attention_mask": torch.tensor([1, 1, 1, 1, 1, 1]),
+            "labels": torch.tensor(token_ids),
+        },
+        3,
+    )
+
+    expected_head = {"input_ids": [0, 5, 6], "attention_mask": [1, 1, 1], "labels": [0, 5, 6]}
+    expected_tail = {"input_ids": [7, 8, 2], "attention_mask": [1, 1, 1], "labels": [7, 8, 2]}
+
+    # torch.equal compares both shape and values of each half.
+    for field, expected in expected_head.items():
+        assert torch.equal(head[field], torch.tensor(expected))
+    for field, expected in expected_tail.items():
+        assert torch.equal(tail[field], torch.tensor(expected))
+
+
+def test_split_sample_by_num_tokens_with_metadata():
+    """Check that a non-sequence field is carried over unchanged into both halves."""
+    token_ids = [0, 5, 6, 7, 8, 2]
+    expected_metadata = {"id": 123, "source": "test"}
+    head, tail = split_sample_by_num_tokens(
+        {
+            "input_ids": token_ids,
+            "labels": list(token_ids),
+            "metadata": {"id": 123, "source": "test"},
+        },
+        3,
+    )
+
+    # The token field is cut at the split point...
+    assert head["input_ids"] == [0, 5, 6]
+    assert tail["input_ids"] == [7, 8, 2]
+
+    # ...while the metadata dictionary appears in full on each side.
+    assert head["metadata"] == expected_metadata
+    assert tail["metadata"] == expected_metadata
+
+
+def test_split_sample_by_num_tokens_errors():
+    """Check that out-of-range split points are rejected with a ValueError."""
+    sample = {"input_ids": [0, 5, 6, 7, 2]}
+    too_large = "num_tokens.*must be less than sample length"
+    not_positive = "num_tokens.*must be positive"
+
+    # (split point, expected message pattern): at/above the sample length first, then zero and negative.
+    invalid_cases = [(5, too_large), (10, too_large), (0, not_positive), (-1, not_positive)]
+
+    for num_tokens, pattern in invalid_cases:
+        with pytest.raises(ValueError, match=pattern):
+            split_sample_by_num_tokens(sample, num_tokens)
+
+
+def test_token_packing_dataset_with_split_samples():
+    """Test TokenPackingDataset with split_samples=True ensures exact batch sizes.
+
+    Samples of 40, 50 and 30 tokens are packed with a 100-token budget: the third sample is split so
+    that its first 10 tokens complete the first batch and its last 20 tokens form the trailing batch.
+    """
+
+    class MockDataset(torch.utils.data.IterableDataset):
+        def __iter__(self):
+            for length in (40, 50, 30):
+                yield {"input_ids": torch.arange(length)}
+
+    dataset = MockDataset()
+    token_packing_dataset = TokenPackingDataset(dataset, max_tokens_per_batch=100, split_samples=True, drop_last=False)
+    batches = list(token_packing_dataset)
+
+    # Asserted unconditionally so that a missing remainder batch fails the test instead of being skipped.
+    # The trailing 20-token batch is expected because drop_last=False keeps the final partial batch.
+    assert [sum(len(sample["input_ids"]) for sample in batch) for batch in batches] == [100, 20]
+
+    # The first batch is the two whole samples plus the first 10 tokens of the 30-token sample.
+    assert [len(sample["input_ids"]) for sample in batches[0]] == [40, 50, 10]
+    assert torch.equal(batches[0][2]["input_ids"], torch.arange(10))
+
+    # The second batch starts with the remaining 20 tokens of the split sample, in their original order.
+    assert torch.equal(batches[1][0]["input_ids"], torch.arange(10, 30))
+
+
+def test_token_packing_dataset_with_split_samples_exact_fit():
+    """Check that samples which add up to exactly the token budget form one batch with split_samples=True."""
+
+    class MockDataset(torch.utils.data.IterableDataset):
+        def __iter__(self):
+            # Two 50-token samples: together they hit the 100-token budget exactly.
+            for _ in range(2):
+                yield {"input_ids": torch.arange(50)}
+
+    packed = TokenPackingDataset(MockDataset(), max_tokens_per_batch=100, split_samples=True, drop_last=False)
+    batches = list(packed)
+
+    # A single batch holding all 100 tokens is expected.
+    assert len(batches) == 1
+    token_total = sum(len(sample["input_ids"]) for sample in batches[0])
+    assert token_total == 100
+
+
+def test_token_packing_dataset_with_split_samples_multiple_fields():
+    """Test TokenPackingDataset with split_samples=True handles multiple fields correctly.
+
+    Every sample carries input_ids, attention_mask and labels of equal length; after packing (and
+    splitting the third sample) all three fields must still be present and equally long in every
+    batch, including the remainder batch produced by the split.
+    """
+    fields = ("input_ids", "attention_mask", "labels")
+
+    class MockDataset(torch.utils.data.IterableDataset):
+        def __iter__(self):
+            for length in (40, 50, 30):
+                yield {
+                    "input_ids": torch.arange(length),
+                    "attention_mask": torch.ones(length),
+                    "labels": torch.arange(length),
+                }
+
+    dataset = MockDataset()
+    token_packing_dataset = TokenPackingDataset(dataset, max_tokens_per_batch=100, split_samples=True, drop_last=False)
+    batches = list(token_packing_dataset)
+
+    # Exactly two batches: a full 100-token batch and the 20-token remainder kept by drop_last=False.
+    # Checking the count up front turns a missing batch into an assertion failure, not an IndexError.
+    assert [sum(len(sample["input_ids"]) for sample in batch) for batch in batches] == [100, 20]
+
+    # Verify all fields are present and consistent in every batch, not only the first one; the
+    # remainder of the split sample lives in the second batch.
+    for batch in batches:
+        for sample in batch:
+            for field in fields:
+                assert field in sample
+            assert len(sample["input_ids"]) == len(sample["attention_mask"])
+            assert len(sample["input_ids"]) == len(sample["labels"])
+
+    # labels mirror input_ids in the mock data, so they must still match after the split.
+    assert torch.equal(batches[0][-1]["input_ids"], batches[0][-1]["labels"])
+    assert torch.equal(batches[1][0]["input_ids"], batches[1][0]["labels"])