Skip to content

Commit 6d70585

Browse files
author
Donglai Wei
committed
Fix multi-GPU test sharding
1 parent c086af2 commit 6d70585

9 files changed

Lines changed: 232 additions & 5 deletions

File tree

connectomics/inference/tta.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -604,6 +604,10 @@ def _apply_distributed_reduction(
604604
) -> Optional[torch.Tensor]:
605605
"""Reduce ensemble results across DDP ranks. Returns None on non-zero ranks."""
606606
_is_dist, rank, _world_size = self._distributed_context()
607+
self._validate_distributed_reduction_shape(
608+
ensemble_result,
609+
reduction_device=reduction_device,
610+
)
607611

608612
if ensemble_mode == "mean":
609613
reduced_sum = self._reduce_cpu_tensor_to_rank_zero(
@@ -640,6 +644,53 @@ def _apply_distributed_reduction(
640644

641645
return None # non-zero ranks
642646

647+
def _validate_distributed_reduction_shape(
    self,
    ensemble_result: torch.Tensor,
    *,
    reduction_device: torch.device,
) -> None:
    """Fail fast when DDP ranks try to reduce different TTA prediction shapes."""
    is_dist, _rank, world_size = self._distributed_context()
    if not (is_dist and world_size > 1):
        # A single process cannot disagree with itself; nothing to validate.
        return

    limit = 6  # fixed-width encoding supports at most 6 dimensions
    local_ndim = int(ensemble_result.ndim)
    if local_ndim > limit:
        raise RuntimeError(
            "Distributed TTA shape validation only supports tensors with up to "
            f"{limit} dimensions, got shape {tuple(ensemble_result.shape)}."
        )

    # Encode [ndim, d0, ..., d{ndim-1}, 0-padding] into one int64 vector so all
    # ranks can all_gather it regardless of their local dimensionality.
    encoded = torch.zeros(limit + 1, device=reduction_device, dtype=torch.int64)
    encoded[0] = local_ndim
    for axis, size in enumerate(ensemble_result.shape):
        encoded[axis + 1] = int(size)

    per_rank = [torch.empty_like(encoded) for _ in range(world_size)]
    torch.distributed.all_gather(per_rank, encoded)

    # Decode each rank's vector back into a shape tuple of plain ints.
    shapes: list[tuple[int, ...]] = [
        tuple(int(size) for size in vec[1 : 1 + int(vec[0].item())].tolist())
        for vec in per_rank
    ]

    reference = shapes[0]
    if any(shape != reference for shape in shapes[1:]):
        shape_summary = ", ".join(
            f"rank {rank_idx}: {shape}" for rank_idx, shape in enumerate(shapes)
        )
        raise RuntimeError(
            "Distributed TTA sharding requires every DDP rank to reduce predictions "
            f"with the same shape, got {shape_summary}. This usually means multiple "
            "test volumes were sharded across ranks; disable "
            "`inference.test_time_augmentation.distributed_sharding` for multi-volume "
            "tests."
        )
643694
def _apply_mask_to_result(
644695
self,
645696
ensemble_result: torch.Tensor,

connectomics/training/lightning/data.py

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import torch
1515
from monai.data import CacheDataset, Dataset
1616
from monai.transforms import Compose
17-
from torch.utils.data import DataLoader
17+
from torch.utils.data import DataLoader, Sampler
1818

1919

2020
class ConnectomicsDataModule(pl.LightningDataModule):
@@ -125,21 +125,26 @@ def val_dataloader(self):
125125
return self._create_dataloader(self.val_dataset, shuffle=False)
126126

127127
def test_dataloader(self):
    """Build the test DataLoader, sharding samples across DDP ranks when active."""
    dataset = self.test_dataset
    # Only attach a sharding sampler when a distributed process group is live;
    # otherwise the plain sequential loader is used.
    shard = dataset is not None and _is_distributed_evaluation_active()
    eval_sampler = DistributedEvaluationSampler(dataset) if shard else None
    return self._create_dataloader(
        dataset,
        shuffle=False,
        collate_fn=collate_dict_list,
        sampler=eval_sampler,
    )
133137

134-
def _create_dataloader(self, dataset, shuffle, collate_fn=None):
138+
def _create_dataloader(self, dataset, shuffle, collate_fn=None, sampler=None):
135139
if dataset is None:
136140
return None
137141
if collate_fn is None:
138142
collate_fn = collate_dict
139143
return DataLoader(
140144
dataset=dataset,
141145
batch_size=self.batch_size,
142-
shuffle=shuffle,
146+
shuffle=shuffle if sampler is None else False,
147+
sampler=sampler,
143148
num_workers=self.num_workers,
144149
pin_memory=self.pin_memory,
145150
persistent_workers=(self.persistent_workers and self.num_workers > 0),
@@ -189,6 +194,45 @@ def __getitem__(self, index):
189194
return self.dataset[index % len(self.dataset)]
190195

191196

197+
def _is_distributed_evaluation_active() -> bool:
198+
return torch.distributed.is_available() and torch.distributed.is_initialized()
199+
200+
201+
class DistributedEvaluationSampler(Sampler[int]):
    """Shard evaluation samples across DDP ranks without padding or duplication.

    Rank ``r`` owns indices ``r, r + world_size, r + 2 * world_size, ...`` so
    every sample is visited exactly once across all ranks; ranks may therefore
    receive shard lengths that differ by one.
    """

    def __init__(
        self,
        dataset,
        *,
        rank: Optional[int] = None,
        world_size: Optional[int] = None,
    ):
        """Create a round-robin shard of ``dataset`` for one rank.

        Args:
            dataset: Sized dataset; only ``len(dataset)`` is consulted.
            rank: This process's rank. When either ``rank`` or ``world_size``
                is omitted, both are read from the active process group.
            world_size: Total number of ranks.

        Raises:
            RuntimeError: If rank/world_size are omitted and no distributed
                process group is initialized.
            ValueError: If ``world_size`` is not positive or ``rank`` is out
                of range.
        """
        if rank is None or world_size is None:
            # NOTE: when only one of the two is given, both are taken from the
            # process group, matching the original behavior.
            if not _is_distributed_evaluation_active():
                raise RuntimeError(
                    "DistributedEvaluationSampler requires an initialized distributed process "
                    "group or explicit rank/world_size."
                )
            rank = torch.distributed.get_rank()
            world_size = torch.distributed.get_world_size()

        if world_size <= 0:
            raise ValueError(f"world_size must be positive, got {world_size}.")
        if rank < 0 or rank >= world_size:
            raise ValueError(f"rank must satisfy 0 <= rank < world_size, got {rank}/{world_size}.")

        self.rank = int(rank)
        self.world_size = int(world_size)
        # Build the strided range directly instead of materializing the full
        # index list and slicing it (`list(range(n))[rank::world_size]`) —
        # same indices, without the O(n) temporary list.
        self.indices = list(range(self.rank, len(dataset), self.world_size))

    def __iter__(self):
        """Iterate this rank's dataset indices in ascending order."""
        return iter(self.indices)

    def __len__(self):
        """Number of samples assigned to this rank."""
        return len(self.indices)
234+
235+
192236
def collate_dict(
193237
batch: List[Dict[str, Any]],
194238
) -> Dict[str, Any]:
@@ -226,6 +270,7 @@ def collate_dict_list(
226270

227271
__all__ = [
228272
"ConnectomicsDataModule",
273+
"DistributedEvaluationSampler",
229274
"SimpleDataModule",
230275
"collate_dict",
231276
"collate_dict_list",

connectomics/training/lightning/trainer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,7 @@ def create_trainer(
300300
detect_anomaly=detect_anomaly,
301301
enable_progress_bar=True,
302302
plugins=plugins,
303+
use_distributed_sampler=mode not in ("test", "tune-test"),
303304
)
304305

305306
_log.info(f" Training mode: {training_mode}")

scripts/main.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,15 @@ def maybe_limit_test_devices(cfg: Config, datamodule) -> bool:
399399

400400
tta_cfg = getattr(getattr(cfg, "inference", None), "test_time_augmentation", None)
401401
distributed_tta_sharding = bool(getattr(tta_cfg, "distributed_sharding", False))
402+
if distributed_tta_sharding and test_volume_count != 1:
403+
print(
404+
" WARNING: Disabling distributed TTA sharding for multi-volume test datasets. "
405+
"DDP ranks would otherwise reduce predictions from different volumes, which can "
406+
"mix samples or hang when shapes differ."
407+
)
408+
tta_cfg.distributed_sharding = False
409+
distributed_tta_sharding = False
410+
402411
if distributed_tta_sharding and test_volume_count == 1:
403412
safe_devices = max(1, min(requested_devices, _estimate_tta_total_passes(cfg)))
404413
if safe_devices < requested_devices:

tests/unit/test_connectomics_module.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from typing import List, Optional
22

3+
import numpy as np
34
import torch
45
import torch.nn as nn
56
import torch.nn.functional as F
@@ -173,3 +174,25 @@ def test_save_metrics_to_file_uses_runtime_inference_output_path(tmp_path):
173174
)
174175

175176
assert (tmp_path / "evaluation_metrics_vol0.txt").exists()
177+
178+
179+
def test_load_cached_predictions_reads_existing_prediction_files(tmp_path, monkeypatch):
    """Existing cached predictions should load without falling back to inference."""
    cfg = _base_config()
    lit_module = ConnectomicsModule(cfg, model=SimpleModel())

    # The cached file only needs to exist on disk; its content is never parsed
    # because read_volume is stubbed out below.
    (tmp_path / "sample_prediction.h5").write_text("stub")

    stub_volume = np.ones((1, 4, 4, 4), dtype=np.float32)
    monkeypatch.setattr("connectomics.training.lightning.model.read_volume", lambda *_args, **_kwargs: stub_volume)

    predictions, loaded, suffix = lit_module._load_cached_predictions(
        str(tmp_path),
        ["sample"],
        "_prediction.h5",
        "test",
    )

    assert loaded is True
    assert suffix == "_prediction.h5"
    assert predictions.shape == (1, 4, 4, 4)

tests/unit/test_inference_tta_masking.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,3 +151,22 @@ def test_tta_channel_activations_follow_python_slice_semantics():
151151
expected[:, 0:2, ...] = torch.sigmoid(expected[:, 0:2, ...])
152152
expected[:, 2:3, ...] = torch.tanh(expected[:, 2:3, ...])
153153
assert torch.allclose(pred, expected)
154+
155+
156+
def test_distributed_tta_reduction_raises_on_mismatched_rank_shapes(monkeypatch):
    """Mismatched per-rank prediction shapes must abort with a clear error."""
    predictor = TTAPredictor(cfg=Config(), sliding_inferer=None, forward_fn=_forward_constant)

    # Pretend to be rank 0 of a 2-rank process group.
    monkeypatch.setattr(predictor, "_distributed_context", lambda: (True, 0, 2))

    # Encoded as [ndim, d0..d5]: rank 0 sees (1, 2, 4, 4, 4), rank 1 (1, 2, 6, 4, 4).
    encoded_rank_shapes = [
        [5, 1, 2, 4, 4, 4, 0],
        [5, 1, 2, 6, 4, 4, 0],
    ]

    def _stub_all_gather(output_tensors, _input_tensor):
        for slot, encoded in zip(output_tensors, encoded_rank_shapes):
            slot.copy_(torch.tensor(encoded, dtype=torch.int64))

    monkeypatch.setattr(torch.distributed, "all_gather", _stub_all_gather)

    with pytest.raises(RuntimeError, match="same shape"):
        predictor._validate_distributed_reduction_shape(
            torch.zeros((1, 2, 4, 4, 4), dtype=torch.float32),
            reduction_device=torch.device("cpu"),
        )

tests/unit/test_lightning_data_collate.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@
44

55
from connectomics.config import Config
66
from connectomics.inference.output import resolve_output_filenames
7-
from connectomics.training.lightning.data import ConnectomicsDataModule, collate_dict
7+
from connectomics.training.lightning.data import (
8+
ConnectomicsDataModule,
9+
DistributedEvaluationSampler,
10+
collate_dict,
11+
)
812

913

1014
def test_test_dataloader_preserves_variable_shape_tensors():
@@ -53,3 +57,17 @@ def test_resolve_output_filenames_supports_list_collated_images():
5357
}
5458

5559
assert resolve_output_filenames(cfg, batch, global_step=11) == ["input_a", "input_b"]
60+
61+
62+
def test_distributed_evaluation_sampler_partitions_without_duplicates():
    """All ranks together must cover the dataset exactly once, with no overlap."""
    samples = list(range(10))

    shards = [
        list(DistributedEvaluationSampler(samples, rank=r, world_size=4))
        for r in range(4)
    ]
    merged = [idx for shard in shards for idx in shard]

    assert sorted(merged) == list(range(10))
    assert len(set(merged)) == 10

tests/unit/test_main_runtime_stage_switch.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@
44
from connectomics.config import Config, save_config
55
from connectomics.config.schema.inference import EvaluationConfig
66
from connectomics.training.lightning.utils import setup_config
7-
from scripts.main import _is_test_evaluation_enabled, resolve_test_stage_runtime
7+
from scripts.main import (
8+
_is_test_evaluation_enabled,
9+
maybe_limit_test_devices,
10+
resolve_test_stage_runtime,
11+
)
812

913

1014
def _make_args(config_path: Path, mode: str = "test"):
@@ -69,3 +73,36 @@ def test_is_test_evaluation_enabled_supports_mapping_or_dataclass_config():
6973

7074
cfg.inference.evaluation.enabled = True
7175
assert _is_test_evaluation_enabled(cfg) is True
76+
77+
78+
class _DummyTestDataModule:
79+
def __init__(self, volume_count: int):
80+
self.test_data_dicts = [{} for _ in range(volume_count)]
81+
82+
83+
def test_maybe_limit_test_devices_disables_distributed_tta_sharding_for_multi_volume_tests():
    """With more than one test volume, sharding is disabled and devices capped."""
    cfg = Config()
    cfg.system.num_gpus = 4
    tta = cfg.inference.test_time_augmentation
    tta.enabled = True
    tta.distributed_sharding = True

    was_limited = maybe_limit_test_devices(cfg, _DummyTestDataModule(volume_count=2))

    assert was_limited is True
    assert cfg.system.num_gpus == 2
    assert cfg.inference.test_time_augmentation.distributed_sharding is False
94+
95+
96+
def test_maybe_limit_test_devices_keeps_distributed_tta_sharding_for_single_volume_tests():
    """A single test volume keeps sharding enabled and all requested GPUs."""
    cfg = Config()
    cfg.system.num_gpus = 4
    tta = cfg.inference.test_time_augmentation
    tta.enabled = True
    tta.distributed_sharding = True
    tta.flip_axes = [1, 2]
    tta.rotation90_axes = [[1, 2]]

    was_limited = maybe_limit_test_devices(cfg, _DummyTestDataModule(volume_count=1))

    assert was_limited is False
    assert cfg.system.num_gpus == 4
    assert cfg.inference.test_time_augmentation.distributed_sharding is True

tests/unit/test_trainer_logging.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,27 @@ def test_create_trainer_disables_logger_for_non_train_modes(tmp_path: Path, mode
1919

2020
assert trainer.logger is None
2121
assert not (tmp_path / "logs").exists()
22+
23+
24+
def test_create_trainer_disables_lightning_distributed_sampler_replacement_for_test(
    tmp_path: Path, monkeypatch
):
    """In test mode, create_trainer must pass use_distributed_sampler=False."""
    cfg = from_dict(
        {
            "system": {"num_gpus": 0},
            "optimization": {"max_epochs": 1},
        }
    )

    seen_kwargs = {}

    class _RecordingTrainer:
        """Captures every kwarg that create_trainer forwards to pl.Trainer."""

        def __init__(self, **kwargs):
            seen_kwargs.update(kwargs)
            self.logger = kwargs.get("logger")

    monkeypatch.setattr("connectomics.training.lightning.trainer.pl.Trainer", _RecordingTrainer)

    trainer = create_trainer(cfg, run_dir=tmp_path, mode="test")

    assert isinstance(trainer, _RecordingTrainer)
    assert seen_kwargs["use_distributed_sampler"] is False

0 commit comments

Comments
 (0)