
Commit 5470b07

Author: Donglai Wei

Use independent shards for multi-GPU test

1 parent f6a1a66 · commit 5470b07

2 files changed: 182 additions & 1 deletion

scripts/main.py

Lines changed: 110 additions & 1 deletion
@@ -437,6 +437,99 @@ def maybe_limit_test_devices(cfg: Config, datamodule) -> bool:
     return True


+def resolve_test_rank_shard_from_env() -> tuple[int | None, int | None]:
+    """Return rank/world_size for externally launched multi-process test jobs."""
+    for rank_key, world_key in (("RANK", "WORLD_SIZE"), ("SLURM_PROCID", "SLURM_NTASKS")):
+        rank_raw = os.environ.get(rank_key)
+        world_raw = os.environ.get(world_key)
+        if rank_raw is None or world_raw is None:
+            continue
+        try:
+            rank = int(rank_raw)
+            world_size = int(world_raw)
+        except ValueError:
+            continue
+        if world_size > 1:
+            return rank, world_size
+
+    return None, None
+
+
+def resolve_test_image_paths(cfg: Config) -> list[str]:
+    """Resolve test image paths from config for shard planning."""
+    data_cfg = getattr(cfg, "data", None)
+    test_image = getattr(getattr(data_cfg, "test", None), "image", None)
+    if not test_image:
+        return []
+
+    from connectomics.training.lightning.path_utils import expand_file_paths
+
+    try:
+        return expand_file_paths(test_image)
+    except Exception as exc:
+        print(f" WARNING: Failed to resolve test_image paths for sharding: {exc}")
+        return []
+
+
+def maybe_enable_independent_test_sharding(args, cfg: Config) -> bool:
+    """Run test as independent single-GPU shards instead of DDP when rank info is available."""
+    requested_devices = int(getattr(cfg.system, "num_gpus", 0) or 0)
+    if requested_devices <= 1:
+        return False
+
+    shard_id = getattr(args, "shard_id", None)
+    num_shards = getattr(args, "num_shards", None)
+    source = None
+
+    if shard_id is not None and num_shards is not None and int(num_shards) > 1:
+        source = "explicit shard arguments"
+    else:
+        test_image_paths = resolve_test_image_paths(cfg)
+        if len(test_image_paths) <= 1:
+            return False
+
+        shard_id, num_shards = resolve_test_rank_shard_from_env()
+        if shard_id is None or num_shards is None:
+            return False
+
+        args.shard_id = shard_id
+        args.num_shards = num_shards
+        source = "distributed launcher environment"
+
+    tta_cfg = getattr(getattr(cfg, "inference", None), "test_time_augmentation", None)
+    if tta_cfg is not None and bool(getattr(tta_cfg, "distributed_sharding", False)):
+        print(
+            " WARNING: Disabling distributed TTA sharding for independent per-rank test sharding."
+        )
+        tta_cfg.distributed_sharding = False
+
+    cfg.system.num_gpus = 1 if torch.cuda.is_available() else 0
+    print(
+        " INFO: Independent multi-GPU test sharding enabled "
+        f"({source}); each process will handle its own shard with no DDP communication."
+    )
+    return True
+
+
+def has_assigned_test_shard(cfg: Config, args) -> bool:
+    """Return True if the current shard has at least one test volume to process."""
+    shard_id = getattr(args, "shard_id", None)
+    num_shards = getattr(args, "num_shards", None)
+    if shard_id is None or num_shards is None:
+        return True
+
+    test_image_paths = resolve_test_image_paths(cfg)
+    if not test_image_paths:
+        return True
+
+    if test_image_paths[shard_id::num_shards]:
+        return True
+
+    print(f" Shard {shard_id}/{num_shards} is empty, nothing to do.")
+    print("[OK] Test completed successfully (empty shard).")
+    return False
+
+
 def shard_test_datamodule(datamodule, shard_id: int, num_shards: int):
     """Shard test volumes across machines.
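A quick usage sketch of the environment resolver above (illustrative only, not part of the commit; it assumes scripts.main is importable, as in the unit tests below, and that no SLURM variables are set):

import os

from scripts.main import resolve_test_rank_shard_from_env

# torchrun-style launcher environment: the RANK/WORLD_SIZE pair is checked first.
os.environ["RANK"], os.environ["WORLD_SIZE"] = "1", "4"
assert resolve_test_rank_shard_from_env() == (1, 4)

# A world size of 1 is ignored, so plain single-process runs are unaffected
# (assuming SLURM_PROCID/SLURM_NTASKS are also unset).
os.environ["WORLD_SIZE"] = "1"
assert resolve_test_rank_shard_from_env() == (None, None)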
@@ -766,6 +859,11 @@ def main():
     print(f"Random seed set to: {cfg.system.seed}")
     seed_everything(cfg.system.seed, workers=True)

+    if args.mode == "test":
+        maybe_enable_independent_test_sharding(args, cfg)
+        if not has_assigned_test_shard(cfg, args):
+            return
+
     # Cache-only preflight path for test mode (can skip model/trainer/dataloader entirely).
     if try_cache_only_test_execution(cfg, args.mode, args.shard_id, args.num_shards):
         return
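The early return above is what makes over-provisioned launches safe: has_assigned_test_shard slices the volume list round-robin, and any rank whose slice is empty exits immediately. Illustrative arithmetic only (the volume names are made up):

# Five volumes across four shards: shard 0 owns two, the rest one each.
# With fewer volumes than shards, trailing shards get an empty slice.
volumes = ["vol0", "vol1", "vol2", "vol3", "vol4"]
for shard_id in range(4):
    print(shard_id, volumes[shard_id::4])
# 0 ['vol0', 'vol4']
# 1 ['vol1']
# 2 ['vol2']
# 3 ['vol3']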
@@ -839,6 +937,17 @@
     # Re-resolve test-stage runtime overrides after tuning, including sentinels.
     cfg = resolve_test_stage_runtime(cfg)

+    if maybe_enable_independent_test_sharding(args, cfg):
+        trainer = create_trainer(
+            cfg,
+            run_dir=run_dir,
+            fast_dev_run=args.fast_dev_run,
+            ckpt_path=ckpt_path,
+            mode="test",
+        )
+    if not has_assigned_test_shard(cfg, args):
+        return
+
     # Create datamodule
     datamodule = create_datamodule(cfg, mode="test")
@@ -882,7 +991,7 @@

     trainer.test(
         model,
-        datamodule=datamodule,
+        datamodule,
         ckpt_path=test_ckpt_path,
     )
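One way to drive the explicit-shard branch end to end is a per-GPU launcher like the sketch below. The flag names --mode, --shard-id, and --num-shards are assumptions (this diff never shows the argument parser), so check scripts/main.py for the real spellings; under srun or torchrun the shard flags can be omitted entirely, because each rank inherits SLURM_PROCID/SLURM_NTASKS or RANK/WORLD_SIZE from the launcher environment instead.

# Hypothetical launcher: one independent single-GPU test process per shard,
# with no DDP communication between them.
import os
import subprocess
import sys

NUM_SHARDS = 4
procs = []
for shard_id in range(NUM_SHARDS):
    env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(shard_id))  # pin one GPU per shard
    procs.append(subprocess.Popen(
        [sys.executable, "scripts/main.py", "--mode", "test",
         "--shard-id", str(shard_id), "--num-shards", str(NUM_SHARDS)],
        env=env,
    ))
assert all(p.wait() == 0 for p in procs)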

tests/unit/test_main_runtime_stage_switch.py

Lines changed: 72 additions & 0 deletions
@@ -1,12 +1,16 @@
 import argparse
 from pathlib import Path

+import torch
+
 from connectomics.config import Config, save_config
 from connectomics.config.schema.inference import EvaluationConfig
 from connectomics.training.lightning.utils import setup_config
 from scripts.main import (
     _is_test_evaluation_enabled,
+    has_assigned_test_shard,
     maybe_limit_test_devices,
+    maybe_enable_independent_test_sharding,
     resolve_test_stage_runtime,
 )
@@ -30,6 +34,8 @@ def _make_args(config_path: Path, mode: str = "test"):
         tune_trials=None,
         nnunet_preprocess=False,
         overrides=[],
+        shard_id=None,
+        num_shards=None,
     )

@@ -106,3 +112,69 @@ def test_maybe_limit_test_devices_keeps_distributed_tta_sharding_for_single_volu
     assert changed is False
     assert cfg.system.num_gpus == 4
     assert cfg.inference.test_time_augmentation.distributed_sharding is True
+
+
+def test_maybe_enable_independent_test_sharding_uses_rank_env_for_multi_volume_tests(
+    tmp_path, monkeypatch
+):
+    cfg = Config()
+    cfg.system.num_gpus = 4
+    cfg.inference.test_time_augmentation.distributed_sharding = True
+    args = _make_args(tmp_path / "config.yaml")
+
+    monkeypatch.setenv("SLURM_PROCID", "2")
+    monkeypatch.setenv("SLURM_NTASKS", "4")
+    monkeypatch.setattr("scripts.main.resolve_test_image_paths", lambda _cfg: ["a", "b", "c", "d"])
+
+    changed = maybe_enable_independent_test_sharding(args, cfg)
+
+    assert changed is True
+    assert args.shard_id == 2
+    assert args.num_shards == 4
+    assert cfg.system.num_gpus == (1 if torch.cuda.is_available() else 0)
+    assert cfg.inference.test_time_augmentation.distributed_sharding is False
+
+
+def test_maybe_enable_independent_test_sharding_uses_explicit_shard_args(tmp_path):
+    cfg = Config()
+    cfg.system.num_gpus = 4
+    args = _make_args(tmp_path / "config.yaml")
+    args.shard_id = 1
+    args.num_shards = 4
+
+    changed = maybe_enable_independent_test_sharding(args, cfg)
+
+    assert changed is True
+    assert cfg.system.num_gpus == (1 if torch.cuda.is_available() else 0)
+
+
+def test_maybe_enable_independent_test_sharding_skips_single_volume_tests(
+    tmp_path, monkeypatch
+):
+    cfg = Config()
+    cfg.system.num_gpus = 4
+    args = _make_args(tmp_path / "config.yaml")
+
+    monkeypatch.setenv("SLURM_PROCID", "0")
+    monkeypatch.setenv("SLURM_NTASKS", "4")
+    monkeypatch.setattr("scripts.main.resolve_test_image_paths", lambda _cfg: ["only_one"])
+
+    changed = maybe_enable_independent_test_sharding(args, cfg)
+
+    assert changed is False
+    assert args.shard_id is None
+    assert args.num_shards is None
+    assert cfg.system.num_gpus == 4
+
+
+def test_has_assigned_test_shard_returns_false_for_empty_slice(
+    tmp_path, monkeypatch
+):
+    args = _make_args(tmp_path / "config.yaml")
+    cfg = Config()
+    args.shard_id = 3
+    args.num_shards = 4
+
+    monkeypatch.setattr("scripts.main.resolve_test_image_paths", lambda _cfg: ["vol0"])
+
+    assert has_assigned_test_shard(cfg, args) is False
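A detail the first test leans on (sketch, same import assumption as above): RANK/WORLD_SIZE take precedence over the SLURM pair, because they come first in the tuple that resolve_test_rank_shard_from_env scans.

import os

from scripts.main import resolve_test_rank_shard_from_env

# Both pairs set: the RANK/WORLD_SIZE values win.
os.environ.update({"RANK": "0", "WORLD_SIZE": "2",
                   "SLURM_PROCID": "5", "SLURM_NTASKS": "8"})
assert resolve_test_rank_shard_from_env() == (0, 2)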
