|
15 | 15 |
|
16 | 16 | from unittest.mock import MagicMock |
17 | 17 |
|
| 18 | +import pytest |
18 | 19 | import torch |
19 | 20 | from transformers import DataCollatorForLanguageModeling |
20 | 21 |
|
21 | | -from esm.collator import DataCollatorWithFlattening, MLMDataCollatorWithFlattening, TokenPackingDataset |
| 22 | +from esm.collator import ( |
| 23 | + DataCollatorWithFlattening, |
| 24 | + MLMDataCollatorWithFlattening, |
| 25 | + TokenPackingDataset, |
| 26 | + split_sample_by_num_tokens, |
| 27 | +) |
22 | 28 |
|
23 | 29 |
|
24 | 30 | def test_data_collator_with_flattening_basic(): |
@@ -486,3 +492,212 @@ def __iter__(self): |
486 | 492 | assert len(batches) == 1 |
487 | 493 | assert len(batches[0]) == 3 |
488 | 494 | assert sum(len(sample["input_ids"]) for sample in batches[0]) == 90 |
| 495 | + |
| 496 | + |
def test_split_sample_by_num_tokens_basic():
    """Split a sample that only carries input_ids into a 3-token head and a 4-token tail."""
    tokens = [0, 5, 6, 7, 8, 9, 2]
    head, tail = split_sample_by_num_tokens({"input_ids": tokens}, 3)

    # Contents first, then lengths, so a failure points at the most specific mismatch.
    assert head["input_ids"] == [0, 5, 6]
    assert tail["input_ids"] == [7, 8, 9, 2]
    assert len(head["input_ids"]) == 3
    assert len(tail["input_ids"]) == 4
| 506 | + |
| 507 | + |
def test_split_sample_by_num_tokens_with_labels():
    """Check that input_ids and labels are both cut at the requested token count."""
    head, tail = split_sample_by_num_tokens(
        {"input_ids": [0, 5, 6, 7, 8, 2], "labels": [0, 5, 6, 7, 8, 2]},
        3,
    )

    expected_head = {"input_ids": [0, 5, 6], "labels": [0, 5, 6]}
    expected_tail = {"input_ids": [7, 8, 2], "labels": [7, 8, 2]}

    # Compare per key rather than whole dicts so extra keys in the result are tolerated.
    for key, value in expected_head.items():
        assert head[key] == value
    for key, value in expected_tail.items():
        assert tail[key] == value
| 517 | + |
| 518 | + |
def test_split_sample_by_num_tokens_with_attention_mask():
    """Check that input_ids, attention_mask and labels are all split after four tokens."""
    head, tail = split_sample_by_num_tokens(
        {
            "input_ids": [0, 5, 6, 7, 8, 2],
            "attention_mask": [1, 1, 1, 1, 1, 1],
            "labels": [0, 5, 6, 7, 8, 2],
        },
        4,
    )

    expected_head = {
        "input_ids": [0, 5, 6, 7],
        "attention_mask": [1, 1, 1, 1],
        "labels": [0, 5, 6, 7],
    }
    expected_tail = {
        "input_ids": [8, 2],
        "attention_mask": [1, 1],
        "labels": [8, 2],
    }

    for key, value in expected_head.items():
        assert head[key] == value
    for key, value in expected_tail.items():
        assert tail[key] == value
| 534 | + |
| 535 | + |
def test_split_sample_by_num_tokens_with_token_type_ids():
    """Check that a token_type_ids field is split alongside input_ids and labels."""
    head, tail = split_sample_by_num_tokens(
        {
            "input_ids": [0, 5, 6, 7, 8, 2],
            "token_type_ids": [0, 0, 0, 1, 1, 1],
            "labels": [0, 5, 6, 7, 8, 2],
        },
        3,
    )

    expected_head = {
        "input_ids": [0, 5, 6],
        "token_type_ids": [0, 0, 0],
        "labels": [0, 5, 6],
    }
    expected_tail = {
        "input_ids": [7, 8, 2],
        "token_type_ids": [1, 1, 1],
        "labels": [7, 8, 2],
    }

    for key, value in expected_head.items():
        assert head[key] == value
    for key, value in expected_tail.items():
        assert tail[key] == value
| 551 | + |
| 552 | + |
def test_split_sample_by_num_tokens_with_token_type():
    """Check that the alternative field name token_type is split like the other sequences."""
    head, tail = split_sample_by_num_tokens(
        {
            "input_ids": [0, 5, 6, 7, 8, 2],
            "token_type": [0, 0, 0, 1, 1, 1],
            "labels": [0, 5, 6, 7, 8, 2],
        },
        3,
    )

    expected_head = {
        "input_ids": [0, 5, 6],
        "token_type": [0, 0, 0],
        "labels": [0, 5, 6],
    }
    expected_tail = {
        "input_ids": [7, 8, 2],
        "token_type": [1, 1, 1],
        "labels": [7, 8, 2],
    }

    for key, value in expected_head.items():
        assert head[key] == value
    for key, value in expected_tail.items():
        assert tail[key] == value
| 568 | + |
| 569 | + |
def test_split_sample_by_num_tokens_with_tensors():
    """Check that torch tensor fields are split at the requested token count."""
    head, tail = split_sample_by_num_tokens(
        {
            "input_ids": torch.tensor([0, 5, 6, 7, 8, 2]),
            "attention_mask": torch.tensor([1, 1, 1, 1, 1, 1]),
            "labels": torch.tensor([0, 5, 6, 7, 8, 2]),
        },
        3,
    )

    expected_head = {
        "input_ids": [0, 5, 6],
        "attention_mask": [1, 1, 1],
        "labels": [0, 5, 6],
    }
    expected_tail = {
        "input_ids": [7, 8, 2],
        "attention_mask": [1, 1, 1],
        "labels": [7, 8, 2],
    }

    # torch.equal compares shape and values; plain == on tensors would be elementwise.
    for key, values in expected_head.items():
        assert torch.equal(head[key], torch.tensor(values))
    for key, values in expected_tail.items():
        assert torch.equal(tail[key], torch.tensor(values))
| 585 | + |
| 586 | + |
def test_split_sample_by_num_tokens_with_metadata():
    """Check that sequence fields are split while a non-sequence field is kept on both parts."""
    sample = {
        "input_ids": [0, 5, 6, 7, 8, 2],
        "labels": [0, 5, 6, 7, 8, 2],
        "metadata": {"id": 123, "source": "test"},
    }
    first, remaining = split_sample_by_num_tokens(sample, 3)

    # Every sequence field should be split, not just input_ids.
    assert first["input_ids"] == [0, 5, 6]
    assert remaining["input_ids"] == [7, 8, 2]
    assert first["labels"] == [0, 5, 6]
    assert remaining["labels"] == [7, 8, 2]

    # Metadata should be present, with equal content, on both parts.
    assert first["metadata"] == {"id": 123, "source": "test"}
    assert remaining["metadata"] == {"id": 123, "source": "test"}
| 603 | + |
| 604 | + |
def test_split_sample_by_num_tokens_errors():
    """Check that out-of-range num_tokens values are rejected with ValueError."""
    sample = {"input_ids": [0, 5, 6, 7, 2]}

    # The sample holds 5 tokens, so 5 and anything larger leaves nothing for the remainder.
    too_large_pattern = "num_tokens.*must be less than sample length"
    for num_tokens in (5, 10):
        with pytest.raises(ValueError, match=too_large_pattern):
            split_sample_by_num_tokens(sample, num_tokens)

    # Zero and negative counts are rejected as well.
    not_positive_pattern = "num_tokens.*must be positive"
    for num_tokens in (0, -1):
        with pytest.raises(ValueError, match=not_positive_pattern):
            split_sample_by_num_tokens(sample, num_tokens)
| 622 | + |
| 623 | + |
def test_token_packing_dataset_with_split_samples():
    """Test TokenPackingDataset with split_samples=True ensures exact batch sizes.

    Three samples of 40, 50 and 30 tokens are packed with a 100-token budget.
    The batch count and both batch sizes are asserted unconditionally, so a
    missing remainder batch fails the test instead of being skipped.
    """

    class MockDataset(torch.utils.data.IterableDataset):
        def __iter__(self):
            yield {"input_ids": torch.arange(40)}  # 40 tokens
            yield {"input_ids": torch.arange(50)}  # 50 tokens
            yield {"input_ids": torch.arange(30)}  # 30 tokens

    dataset = MockDataset()
    token_packing_dataset = TokenPackingDataset(dataset, max_tokens_per_batch=100, split_samples=True, drop_last=False)
    batches = list(token_packing_dataset)

    tokens_per_batch = [sum(len(sample["input_ids"]) for sample in batch) for batch in batches]

    # First batch: 40 + 50 + 10 tokens taken from the 30-token sample.
    # Second batch: the remaining 20 tokens, kept because drop_last=False.
    assert tokens_per_batch == [100, 20]
| 644 | + |
| 645 | + |
def test_token_packing_dataset_with_split_samples_exact_fit():
    """Check split_samples=True when two 50-token samples fill the 100-token budget exactly."""

    class MockDataset(torch.utils.data.IterableDataset):
        def __iter__(self):
            # Two samples of 50 tokens each: 100 in total, exactly the batch budget.
            for length in (50, 50):
                yield {"input_ids": torch.arange(length)}

    packed = TokenPackingDataset(MockDataset(), max_tokens_per_batch=100, split_samples=True, drop_last=False)
    batches = list(packed)

    # Expect a single batch holding all 100 tokens.
    assert len(batches) == 1
    only_batch = batches[0]
    token_count = sum(len(sample["input_ids"]) for sample in only_batch)
    assert token_count == 100
| 661 | + |
| 662 | + |
def test_token_packing_dataset_with_split_samples_multiple_fields():
    """Test TokenPackingDataset with split_samples=True handles multiple fields correctly.

    Three samples of 40, 50 and 30 tokens, each carrying input_ids,
    attention_mask and labels, are packed with a 100-token budget. The batch
    count is asserted before indexing, and field consistency is checked in
    every batch, including the one that holds the split remainder.
    """

    class MockDataset(torch.utils.data.IterableDataset):
        def __iter__(self):
            for length in (40, 50, 30):
                yield {
                    "input_ids": torch.arange(length),
                    "attention_mask": torch.ones(length),
                    "labels": torch.arange(length),
                }

    dataset = MockDataset()
    token_packing_dataset = TokenPackingDataset(dataset, max_tokens_per_batch=100, split_samples=True, drop_last=False)
    batches = list(token_packing_dataset)

    # Assert the batch count first so a missing batch is an assertion failure, not an IndexError.
    assert len(batches) == 2

    # First batch should have exactly 100 tokens
    first_batch_total = sum(len(sample["input_ids"]) for sample in batches[0])
    assert first_batch_total == 100

    # Second batch should have exactly 20 tokens
    second_batch_total = sum(len(sample["input_ids"]) for sample in batches[1])
    assert second_batch_total == 20

    # Verify all fields are present and consistent in every batch.
    for batch in batches:
        for sample in batch:
            assert "input_ids" in sample
            assert "attention_mask" in sample
            assert "labels" in sample
            assert len(sample["input_ids"]) == len(sample["attention_mask"])
            assert len(sample["input_ids"]) == len(sample["labels"])
0 commit comments