feat: add consecutive batch shard sampler for pytorch (#3886)

Jay-ju · web-flow · commit b7fb848278c0 · 2025-06-20T07:35:37.000+08:00
Signed-off-by: jukejian <jukejian@bytedance.com>
diff --git a/python/python/lance/sampler.py b/python/python/lance/sampler.py
@@ -356,6 +356,10 @@ def __init__(
         self._world_size = world_size
         self._randomize = randomize
         self._seed = seed
+        self._epoch = 0
+
+    def set_epoch(self, epoch: int):
+        """Store ``epoch`` on the sampler as ``self._epoch``.
+
+        NOTE(review): how this class consumes ``_epoch`` is not visible in this
+        hunk -- confirm it is folded into the seed the same way the batch
+        samplers do.
+        """
+        self._epoch = epoch
 
     @staticmethod
     def from_torch(randomize: bool = False, seed: int = 0) -> ShardedFragmentSampler:
@@ -399,6 +403,7 @@ class ShardedBatchSampler(Sampler):
     not assigned to it.  The resulting stream is then randomized via a reservoir
     sampler.  This does not perfectly randomize the stream but it should generate
     a stream that is random enough for many use cases.
+
     """
 
     def __init__(
@@ -408,6 +413,13 @@ def __init__(
         self._world_size = world_size
         self._randomize = randomize
         self._seed = seed
+        self._epoch = 0
+
+    def __len__(self):
+        """Return the value stored in ``self._len``.
+
+        NOTE(review): ``_len`` is not assigned in the part of ``__init__`` that
+        is visible in this hunk; ``ShardedFixedBatchSampler`` assigns it in its
+        own ``__init__``.  Confirm whether ``len()`` on a plain
+        ``ShardedBatchSampler`` is expected to work.
+        """
+        return self._len
+
+    def set_epoch(self, epoch: int):
+        """Store the epoch number that is added to the seed before shuffling.
+
+        ``_sample_filtered`` seeds the RNG with ``self._seed + self._epoch``,
+        so setting a different epoch gives the shuffle a different seed.
+        """
+        self._epoch = epoch
 
     @staticmethod
     def from_torch(randomize: bool = False, seed: int = 0) -> ShardedBatchSampler:
@@ -488,7 +500,7 @@ def _sample_filtered(
         if not self._randomize:
             yield from shard_scan
 
-        random.seed(self._seed)
+        random.seed(self._seed + self._epoch)
         heap = []
         # We want to randomize the incoming sequence.  The normal approach
         # is to pull the whole thing in memory and run fisher-yates.  We
@@ -563,3 +575,96 @@ def __call__(
             return self._sample_filtered(
                 dataset, batch_size, columns, batch_readahead, filter
             )
+
+
+class ShardedFixedBatchSampler(ShardedBatchSampler):
+    """
+    Sharded fixed batch sampler for distributed index-based batching.
+
+    This sampler is designed for static datasets with a known total number of rows.
+    The rows are split into ``world_size`` contiguous shards of
+    ``ceil(total_num_rows / world_size)`` rows (trailing shards may be shorter or
+    empty), and each shard is cut into consecutive batches of ``batch_size`` row
+    indices.  Iterating the sampler yields the batches owned by ``rank``.
+
+    Features:
+    - Requires `total_num_rows` and `batch_size` to be specified for index
+      iteration; when both are left at 0, iteration yields nothing and
+      ``len()`` is 0.
+    - Each rank receives consecutive, non-overlapping index ranges.
+    - Optionally randomizes the order of batches per epoch if `randomize=True`.
+    - Suitable for integration with PyTorch DataLoader or similar frameworks.
+
+    Example (total_num_rows=1000, world_size=4, batch_size=100):
+    - Rank 0: [0-99], [100-199], [200-249]
+    - Rank 1: [250-349], [350-449], [450-499]
+    - Rank 2: [500-599], [600-699], [700-749]
+    - Rank 3: [750-849], [850-949], [950-999]
+
+    Parameters
+    ----------
+    rank : int
+        The rank (process index) in the distributed cluster.
+    world_size : int
+        The total number of processes in the distributed cluster.
+    randomize : bool, default False
+        Whether to randomize the order of batches for each epoch.
+    seed : int, default 0
+        Random seed for reproducibility when randomize is enabled.
+    batch_size : int, default 0
+        The number of rows per batch.
+    total_num_rows : int, default 0
+        The total number of rows in the dataset.
+
+    Raises
+    ------
+    ValueError
+        If ``batch_size`` or ``total_num_rows`` is negative, or if
+        ``total_num_rows`` is positive while ``batch_size`` is 0.
+    """
+
+    def __init__(
+        self,
+        rank: int,
+        world_size: int,
+        randomize: bool = False,
+        seed: int = 0,
+        batch_size: int = 0,
+        total_num_rows: int = 0,
+    ):
+        super().__init__(rank, world_size, randomize, seed)
+        # Reject configurations that would otherwise surface as a bare
+        # ZeroDivisionError or as a never-ending loop in ``__iter__``.
+        if batch_size < 0 or total_num_rows < 0:
+            raise ValueError(
+                "batch_size and total_num_rows must be non-negative, got "
+                f"batch_size={batch_size}, total_num_rows={total_num_rows}"
+            )
+        if total_num_rows > 0 and batch_size == 0:
+            raise ValueError("batch_size must be positive when total_num_rows > 0")
+        self._total_num_rows = total_num_rows
+        self._batch_size = batch_size
+        self._len = self._compute_length()
+
+    def _rank_bounds(self):
+        """Return the half-open ``(start, end)`` row range owned by this rank."""
+        total = self._total_num_rows
+        if total == 0:
+            return 0, 0
+        # Integer ceil-division keeps the result exact for very large counts.
+        per_rank = (total + self._world_size - 1) // self._world_size
+        # Clamp so that ranks past the end of the data get an empty range.
+        start = min(self._rank * per_rank, total)
+        end = min(start + per_rank, total)
+        return start, end
+
+    # The sampler here is mainly implemented with the hope that
+    # the data of batch_size are all adjacent, so we don't want
+    # to use filter to break this adjacent feature.
+    def _compute_length(self):
+        """Return the number of batches ``__iter__`` yields for this rank."""
+        start, end = self._rank_bounds()
+        if end <= start:
+            return 0
+        return (end - start + self._batch_size - 1) // self._batch_size
+
+    def __len__(self):
+        """Return the number of index batches assigned to this rank."""
+        return self._len
+
+    def __iter__(self) -> Generator[List[int], None, None]:
+        """Yield this rank's batches as lists of consecutive row indices.
+
+        The last batch may be shorter than ``batch_size``.  When ``randomize``
+        is set, the batch order (not the rows inside a batch) is shuffled with
+        a seed of ``seed + epoch``.
+        """
+        start, end = self._rank_bounds()
+        if end <= start:
+            return
+
+        batches = [
+            list(range(first, min(first + self._batch_size, end)))
+            for first in range(start, end, self._batch_size)
+        ]
+
+        if self._randomize:
+            # A private generator yields the same permutation as seeding the
+            # module-level one, without disturbing global random state.
+            random.Random(self._seed + self._epoch).shuffle(batches)
+
+        yield from batches
+
+    @staticmethod
+    def from_torch(
+        total_num_rows: int, batch_size: int, randomize: bool = False, seed: int = 0
+    ) -> ShardedFixedBatchSampler:
+        """Build a sampler using the rank and world size of ``torch.distributed``."""
+        import torch
+
+        rank = torch.distributed.get_rank()
+        world_size = torch.distributed.get_world_size()
+        # Keyword arguments: the constructor's positional order differs from
+        # this function's parameter order.
+        return ShardedFixedBatchSampler(
+            rank,
+            world_size,
+            randomize=randomize,
+            seed=seed,
+            batch_size=batch_size,
+            total_num_rows=total_num_rows,
+        )
diff --git a/python/python/tests/test_sampler.py b/python/python/tests/test_sampler.py
@@ -7,7 +7,249 @@
 import numpy as np
 import pyarrow as pa
 import pytest
-from lance.sampler import maybe_sample
+from lance.sampler import ShardedBatchSampler, ShardedFixedBatchSampler, maybe_sample
+
+# Shared constants for the sampler tests below.
+# NOTE(review): "vec_dim", "test_shard_ratio" and "max_takes_factor" are not
+# referenced by the tests visible in this hunk -- confirm they are still needed.
+TEST_CONFIG = {
+    "total_rows": 1000,
+    "batch_size": 250,
+    "world_size": 4,
+    "vec_dim": 32,
+    "test_port": "29501",
+    "master_addr": "127.0.0.1",
+    "seed": 42,
+    "test_shard_ratio": 0.5,
+    "max_takes_factor": 0.1,
+}
+
+
+@pytest.fixture
+def sample_dataset_path(tmp_path):
+    """Write a three-column Lance dataset under ``tmp_path`` and return its path."""
+    num_rows = TEST_CONFIG["total_rows"]
+    id_column = pa.array(range(num_rows))
+    value_column = pa.array(np.random.rand(num_rows))
+    text_column = pa.array([f"text_{i}" for i in range(num_rows)])
+
+    table = pa.Table.from_arrays(
+        [id_column, value_column, text_column],
+        names=["id", "value", "text"],
+    )
+
+    target = tmp_path / "test_dataset.lance"
+    lance.write_dataset(table, target)
+    return target
+
+
+@pytest.fixture
+def sample_dataset(sample_dataset_path) -> lance.LanceDataset:
+    """Open the dataset written by the ``sample_dataset_path`` fixture."""
+    opened = lance.dataset(sample_dataset_path)
+    return opened
+
+
+def test_consecutive_index_blocks():
+    """Rank 0 yields the expected number of batches, starting at row 0."""
+    world_size = TEST_CONFIG["world_size"]
+    batch_size = TEST_CONFIG["batch_size"]
+    total_rows = TEST_CONFIG["total_rows"]
+
+    produced = list(
+        ShardedFixedBatchSampler(
+            rank=0,
+            world_size=world_size,
+            total_num_rows=total_rows,
+            batch_size=batch_size,
+        )
+    )
+
+    assert len(produced) == total_rows // (world_size * batch_size)
+    assert produced[0] == list(range(batch_size))
+
+
+def _distributed_test_worker(rank, world_size, dataset_path):
+    """Per-process body spawned by ``test_pytorch_integration``.
+
+    Initialises a ``torch.distributed`` process group, builds a
+    ``ShardedFixedBatchSampler`` for this rank, feeds it to a ``DataLoader`` as
+    ``batch_sampler`` and checks that every loaded batch holds exactly the row
+    ids the sampler produced.
+    """
+    import os
+
+    import torch
+
+    os.environ.update(
+        {
+            "MASTER_ADDR": TEST_CONFIG["master_addr"],
+            "MASTER_PORT": TEST_CONFIG["test_port"],
+            "CUDA_VISIBLE_DEVICES": ",".join(
+                map(str, range(torch.cuda.device_count()))
+            ),
+        }
+    )
+
+    try:
+        if torch.cuda.is_available():
+            torch.cuda.set_device(rank % torch.cuda.device_count())
+
+        backend = "nccl" if torch.cuda.is_available() else "gloo"
+        torch.distributed.init_process_group(
+            backend=backend, world_size=world_size, rank=rank
+        )
+
+        dataset = lance.dataset(dataset_path)
+        assert len(dataset) == TEST_CONFIG["total_rows"]
+
+        # Index-based batching (``total_num_rows`` / ``batch_size`` and
+        # ``__iter__``) is provided by the fixed sampler, not by its base class.
+        sampler = ShardedFixedBatchSampler(
+            rank=rank,
+            world_size=world_size,
+            total_num_rows=TEST_CONFIG["total_rows"],
+            batch_size=TEST_CONFIG["batch_size"],
+        )
+
+        class DatasetAdapter(torch.utils.data.Dataset):
+            def __init__(self, dataset):
+                self.dataset = dataset
+
+            def __getitem__(self, index):
+                return self.dataset.take([index], ["id", "value"]).to_pylist()[0]
+
+            def __len__(self):
+                return len(self.dataset)
+
+        def collate_fn(batch):
+            return {
+                "ids": torch.tensor([x["id"] for x in batch], dtype=torch.long),
+                "values": torch.tensor(
+                    [x["value"] for x in batch], dtype=torch.float32
+                ),
+            }
+
+        dataloader = torch.utils.data.DataLoader(
+            DatasetAdapter(dataset),
+            batch_sampler=sampler,
+            collate_fn=collate_fn,
+            num_workers=0,
+        )
+
+        total = 0
+        # The sampler is not randomized, so iterating it a second time gives
+        # the same batches the DataLoader consumes.
+        for batch_indices, batch_data in zip(sampler, dataloader):
+            current_size = batch_data["ids"].size(0)
+            assert current_size == TEST_CONFIG["batch_size"]
+            assert batch_data["ids"].tolist() == list(batch_indices)
+            total += current_size
+
+        expected_total = TEST_CONFIG["total_rows"] // world_size
+        assert total == expected_total
+
+    finally:
+        if torch.distributed.is_initialized():
+            torch.distributed.destroy_process_group()
+
+
+@pytest.mark.cuda
+def test_pytorch_integration(sample_dataset_path):
+    """Spawn the distributed worker once per supported world size."""
+    import torch
+
+    # A two-process run is only attempted when at least two GPUs are present.
+    world_sizes = [1]
+    if torch.cuda.device_count() >= 2:
+        world_sizes.append(2)
+
+    path_arg = str(sample_dataset_path)
+    for world_size in world_sizes:
+        torch.multiprocessing.spawn(
+            _distributed_test_worker,
+            args=(world_size, path_arg),
+            nprocs=world_size,
+            join=True,
+        )
+
+
+def test_data_stream_without_filter(sample_dataset):
+    """Validate direct data loading without filters."""
+    rank_zero_sampler = ShardedFixedBatchSampler(0, 4)
+    stream = rank_zero_sampler(
+        sample_dataset, batch_size=250, columns=["id", "value"]
+    )
+    first_batch = list(stream)[0]
+
+    # Data integrity checks
+    assert first_batch.num_rows == 250, "Batch should contain 250 records"
+    assert first_batch.column_names == [
+        "id",
+        "value",
+    ], "Should load specified columns"
+
+    # Consecutive ID validation
+    expected_ids = np.arange(0, 250)
+    actual_ids = first_batch["id"].to_numpy()
+    assert np.array_equal(actual_ids, expected_ids), "IDs should be sequential 0-249"
+
+
+def test_filtered_data_handling(sample_dataset):
+    """Test filtered data processing with sharding."""
+    # Apply ID filter and load data
+    rank_zero_sampler = ShardedFixedBatchSampler(0, 4)
+    stream = rank_zero_sampler(
+        sample_dataset, batch_size=100, filter="id < 500", columns=["id"]
+    )
+    loaded = list(stream)
+
+    # Flatten the id column of every batch into one list
+    seen_ids = [
+        row_id for chunk in loaded for row_id in chunk["id"].to_numpy().tolist()
+    ]
+
+    # Filter and sharding assertions
+    assert all(row_id < 500 for row_id in seen_ids), "Should respect ID filter"
+    assert all(row_id % 4 == 0 for row_id in seen_ids), "Should keep rank 0 shard"
+
+
+def test_randomization_effect():
+    """Verify epoch-based randomization behavior.
+
+    The shard holds 20 batches so that two different seeds producing the same
+    order is practically impossible; with only two batches the comparison
+    would hinge on a 50/50 coincidence of the chosen seeds.
+    """
+    # Initialize randomized sampler: 20000 rows / 4 ranks / 250 rows per batch
+    sampler = ShardedFixedBatchSampler(
+        rank=0,
+        world_size=4,
+        total_num_rows=20000,
+        batch_size=250,
+        randomize=True,
+        seed=42,
+    )
+
+    assert len(list(sampler)) > 1
+
+    # Same epoch must be reproducible
+    sampler.set_epoch(1)
+    epoch1 = list(sampler)
+    assert list(sampler) == epoch1, "Same epoch should reproduce the same order"
+
+    # Cross-epoch comparison
+    sampler.set_epoch(2)
+    epoch2 = list(sampler)
+
+    assert epoch1 != epoch2, "Different epochs should produce different orders"
+    # Shuffling reorders batches; it must not add, drop or alter any of them.
+    assert sorted(epoch1) == sorted(epoch2), "Epochs should cover the same batches"
+
+
+def test_edge_cases():
+    """Validate handling of partial batches and data boundaries."""
+
+    def batches_for(**sampler_kwargs):
+        # Materialize every batch a sampler with these settings yields.
+        return list(ShardedFixedBatchSampler(**sampler_kwargs))
+
+    last_rank = batches_for(rank=3, world_size=4, batch_size=250, total_num_rows=1000)
+    assert len(last_rank) == 1, "Should handle partial batch"
+    assert last_rank[0] == list(range(750, 1000)), "Last rank should get 750-999"
+
+    # rank 0: 0~249, rank 1: 250~499
+    # rank 0: [0-127], [128-249]
+    uneven = batches_for(rank=0, world_size=2, batch_size=128, total_num_rows=500)
+    assert uneven[0] == list(range(0, 128))
+    assert uneven[1] == list(range(128, 250))
+
+    # total_num_rows < batch_size
+    short = batches_for(rank=0, world_size=1, batch_size=250, total_num_rows=100)
+    assert len(short) == 1
+    assert short[0] == list(range(0, 100))
+
+    # total_num_rows < world_size
+    starved = batches_for(rank=2, world_size=4, batch_size=10, total_num_rows=2)
+    assert len(starved) == 0, "No data for this rank"
+
+    # batch_size=1
+    singles = batches_for(rank=0, world_size=2, batch_size=1, total_num_rows=4)
+    assert singles == [[0], [1]]
+
+    # world_size=1
+    solo = batches_for(rank=0, world_size=1, batch_size=3, total_num_rows=5)
+    assert solo == [list(range(0, 3)), list(range(3, 5))]
 
 
 # We use + 97 to test case where num_rows and chunk_size aren't exactly aligned.