ROCm
diff --git a/tests/ccl/test_all_gather_gluon.py b/tests/ccl/test_all_gather_gluon.py
Lines changed: 114 additions & 66 deletions
@@ -1,17 +1,27 @@
 # SPDX-License-Identifier: MIT
 # Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
 
+"""All-gather correctness: eager and under HIP-graph capture, identical and
+varying inputs, with and without the trailing cross-rank barrier.
+
+The eager + barrier path (the original coverage) hides a cross-rank
+write-visibility issue that only surfaces when the trailing barrier is dropped
+AND the input changes between back-to-back ops — the regime cudagraph capture
+forces on vLLM (the host barrier is illegal under capture, so async_op=True, and
+the captured step replays every token with fresh activations). `mode` separates
+the cause from the trigger:
+  eager_barrier   : eager, async_op=False — trailing ctx.barrier() (correct baseline)
+  eager_nobarrier : eager, async_op=True  — no barrier, no graph (isolates the barrier)
+  graph           : capture + replay, async_op=True (the vLLM regime)
+`vary=False` replays identical input (a stale read looks correct); `vary=True`
+feeds fresh input each step. impl: torch (known-good control via torch.distributed),
+triton and gluon (the two iris backends, selected by config.use_gluon).
 """
-Test suite for all-gather collective operation using Gluon.
-"""
-
-import os
 
 import pytest
 import torch
 import torch.distributed as dist
 
-# Try to import Gluon, skip tests if not available
 try:
     import iris
     from iris.ccl import Config
@@ -22,84 +32,122 @@
     GLUON_AVAILABLE = False
 
 
+NUM_REPLAYS = 200  # iterations of the fill -> all-gather -> check loop in each test cell
+
+
+def _all_gather(impl, src, stage_buf, result, shmem, config, async_op):
+    """Stage src into the input buffer, then all-gather. Module-level (no closure
+    over shmem) so the test can ``del shmem`` for IPC cleanup."""
+    stage_buf.copy_(src)  # staged inside the step, so under graph capture the copy is captured too
+    if impl == "torch":
+        dist.all_gather_into_tensor(result, stage_buf)  # control path: shmem, config, async_op unused
+    else:
+        shmem.ccl.all_gather(result, stage_buf, config=config, async_op=async_op)  # triton or gluon per config
+
+
+def _make_buffers(impl, shmem, rank, world_size, M, N, dtype, block_size_m, block_size_n):
+    """Resolve impl -> (stage_buf, result, config) in one place: torch uses plain
+    device tensors and no config; the iris backends use symmetric-heap buffers and
+    a use_gluon config. Output is (world_size * M, N) — block r holds rank r's input."""
+    if impl == "torch":
+        stage = torch.empty((M, N), dtype=dtype, device=f"cuda:{rank}")  # uninitialized until staged
+        result = torch.empty((world_size * M, N), dtype=dtype, device=f"cuda:{rank}")
+        return stage, result, None  # shmem and the block sizes are unused on the torch path
+    stage = shmem.zeros((M, N), dtype=dtype)  # allocated through shmem rather than torch
+    result = shmem.zeros((world_size * M, N), dtype=dtype)
+    config = Config(use_gluon=(impl == "gluon"), block_size_m=block_size_m, block_size_n=block_size_n)
+    return stage, result, config
+
+
 @pytest.mark.skipif(not GLUON_AVAILABLE, reason="Gluon not available")
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.float16,
-        torch.float32,
-        torch.bfloat16,
-    ],
-)
+@pytest.mark.parametrize("impl", ["torch", "triton", "gluon"])
+@pytest.mark.parametrize("mode", ["eager_barrier", "eager_nobarrier", "graph"])
+@pytest.mark.parametrize("vary", [False, True])
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16], ids=["bf16", "fp16"])
 @pytest.mark.parametrize(
     "M, N, block_size_m, block_size_n",
-    [
-        # block_size_n must be a multiple of (threads_per_warp * num_warps).
-        # With defaults (threads_per_warp=64, num_warps=4), minimum is 256.
-        # elems_per_thread = block_size_n / 256: higher = wider vector loads.
-        (256, 256, 32, 256),  # Small: elems_per_thread=1 (scalar loads)
-        (1024, 512, 32, 512),  # Medium: elems_per_thread=2 (dword loads)
-        (8192, 8192, 32, 1024),  # Large: elems_per_thread=4 (dwordx4, optimal)
-    ],
+    [(64, 8192, 32, 1024), (256, 8192, 32, 1024)],
 )
-def test_all_gather_gluon(dtype, M, N, block_size_m, block_size_n):
-    """Test all-gather functionality using Gluon by comparing against PyTorch's implementation."""
-    # Ensure torch.distributed is initialized (should be done by test runner)
+def test_all_gather_gluon(impl, mode, vary, dtype, M, N, block_size_m, block_size_n):
+    """Rank r fills its whole input with 1 + r + replay%16 (exact integers), so
+    output block r must equal 1 + r + replay%16 — any >=1 mismatch is a real drop.
+    Controls: torch and eager_barrier must pass every cell (correct when synced),
+    so eager_nobarrier failing isolates the missing cross-rank barrier (no
+    cudagraph involved) and graph failing is the vLLM regime. Per-peer-slice fail
+    tallies show which peer slices dropped (structured vs scattered)."""
     if not dist.is_initialized():
         pytest.skip("torch.distributed not initialized")
+    if impl == "torch" and mode == "eager_nobarrier":
+        pytest.skip("torch has no barrier knob; eager_barrier already covers eager torch")
 
-    # Size heap to fit input (M*N) + output (max_ranks*M*N) with headroom
-    max_ranks = int(os.environ.get("WORLD_SIZE", 8))
-    elem_size = torch.tensor([], dtype=dtype).element_size()
-    needed = (1 + max_ranks) * M * N * elem_size
-    heap_size = max(2**30, int(needed * 2))  # 2x headroom, minimum 1GB
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
-    world_size = shmem.get_num_ranks()
-
-    # Each rank has an M x N input tensor
-    # Output is (world_size * M, N) - concatenated along dimension 0
-    pytorch_input_tensor = torch.randn(M, N, dtype=dtype, device=f"cuda:{rank}")
-    # Fill with deterministic values for easier debugging
-    pytorch_input_tensor.fill_(float(rank + 1))
-
-    # Create output tensor for PyTorch: (world_size * M, N)
-    pytorch_output_tensor = torch.zeros(world_size * M, N, dtype=dtype, device=f"cuda:{rank}")
-
-    # Run PyTorch's all_gather_into_tensor to get reference output
-    shmem.barrier()
-    dist.all_gather_into_tensor(pytorch_output_tensor, pytorch_input_tensor)
-    torch.cuda.synchronize()
+    # Resolve (impl, mode) up front; the body runs straight-line off these.
+    async_op = mode != "eager_barrier"  # True for both eager_nobarrier and graph
+    capture = mode == "graph"  # only this mode records and replays a CUDAGraph
 
-    # Now set up Iris Gluon all_gather
-    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
-    iris_input_tensor.copy_(pytorch_input_tensor)
+    shmem = iris.iris(2**33)  # 8 GiB (2**33 bytes)
+    rank, world_size = shmem.get_rank(), shmem.get_num_ranks()
+    src = torch.empty((M, N), dtype=dtype, device=f"cuda:{rank}")
+    stage_buf, result, config = _make_buffers(impl, shmem, rank, world_size, M, N, dtype, block_size_m, block_size_n)
+    shmem.barrier()
 
-    iris_output_tensor = shmem.zeros((world_size * M, N), dtype=dtype)
+    def fill_src(replay):
+        src.fill_(float(1 + rank + (replay % 16)))
 
-    # Run Iris Gluon all_gather
-    shmem.barrier()
-    config = Config(use_gluon=True, block_size_m=block_size_m, block_size_n=block_size_n)
-    shmem.ccl.all_gather(iris_output_tensor, iris_input_tensor, config=config)
+    # Warmup (runs lazy JIT/setup), then capture the step once if in graph mode.
+    fill_src(0)
+    _all_gather(impl, src, stage_buf, result, shmem, config, async_op)
     torch.cuda.synchronize()
+    shmem.barrier()
 
-    # Compare results
-    atol = 1e-3 if dtype == torch.float16 else 1e-5
-    max_diff = torch.abs(iris_output_tensor - pytorch_output_tensor).max().item()
-
+    graph = None  # NOTE(review): warmup and capture sit outside the try/finally cleanup
+    if capture:
+        stream = torch.cuda.Stream()
+        stream.wait_stream(torch.cuda.current_stream())  # order capture after the warmup work
+        with torch.cuda.stream(stream):
+            graph = torch.cuda.CUDAGraph()
+            graph.capture_begin()  # NOTE(review): capture_end is skipped if the step raises
+            _all_gather(impl, src, stage_buf, result, shmem, config, async_op)
+            graph.capture_end()
+        torch.cuda.current_stream().wait_stream(stream)
+
+    atol = 0.5  # exact integer inputs; >=1 mismatch is a real drop
+    failures = []  # (step, max|diff|, bad_slices)
+    block_fail = [0] * world_size  # steps each peer slice dropped
     try:
-        assert torch.allclose(iris_output_tensor, pytorch_output_tensor, atol=atol), (
-            f"Max difference: {max_diff}, expected < {atol}\n"
-            f"Rank {rank}: Iris Gluon output doesn't match PyTorch's all_gather_into_tensor"
+        for i in range(NUM_REPLAYS):
+            replay = i if vary else 0  # vary=False reuses the replay-0 value every step
+            fill_src(replay)
+            if capture:
+                graph.replay()
+            else:
+                _all_gather(impl, src, stage_buf, result, shmem, config, async_op)
+            torch.cuda.synchronize()  # step must finish before result is read on the host
+            diffs = [
+                torch.abs(result[r * M : (r + 1) * M] - float(1 + r + (replay % 16))).max().item()
+                for r in range(world_size)
+            ]
+            bad = [r for r in range(world_size) if diffs[r] > atol]  # NOTE(review): a NaN diff is not > atol
+            for r in bad:
+                block_fail[r] += 1
+            if bad:
+                failures.append((i, round(max(diffs[r] for r in bad), 4), bad))
+        print(
+            f"[rank {rank}] all_gather impl={impl} mode={mode} vary={vary} dtype={dtype} "
+            f"{M}x{N}: {NUM_REPLAYS - len(failures)}/{NUM_REPLAYS} ok; "
+            f"per-peer-slice fail counts={block_fail}" + (f"; first FAIL={failures[0]}" if failures else ""),
+            flush=True,
+        )
+        assert not failures, (
+            f"impl={impl} mode={mode} vary={vary} dtype={dtype} {M}x{N}: "
+            f"{len(failures)}/{NUM_REPLAYS} steps wrong (first {failures[0]}; per-peer-slice "
+            f"fail counts={block_fail}). torch and eager_barrier must pass; eager_nobarrier "
+            f"failing isolates the missing cross-rank barrier (no cudagraph); graph is the vLLM regime."
         )
     finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
+        if graph is not None:
+            del graph  # drop the captured graph before the final barrier and shmem teardown
         shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
         del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
         import gc
 
         gc.collect()