data: Replace NBX consensus with a portable sparse exchange

mloubout · mloubout · commit f17919a85395 · 2026-06-26T10:34:17.000-04:00
The point-to-point router used the NBX nonblocking-consensus algorithm
(Issend + Iprobe(ANY_SOURCE) + Ibarrier). It works under IntelMPI and
mpirun, but deadlocks under OpenMPI inside the ipyparallel example
notebooks -- whereas ordinary collective/halo MPI (e.g. op.apply) runs
fine there. So the NBX pattern itself is the fragile part.

Replace it with a reduce-scatter-based sparse exchange: each rank learns
how many peers will send to it via one Reduce_scatter_block over a
length-nprocs 0/1 indicator, then Isend / Probe / Recv / Waitall. These
are the same standard, portable calls that work elsewhere; payloads still
move strictly point-to-point (no data all-to-all). Rename nbx_exchange ->
sparse_exchange and nbx_push -> sparse_push accordingly, and revert the
ineffective OpenMPI yield CI env var.

Correctness verified under mpirun: TestDataDistributed, TestDataGather and
the sparse advanced-indexing tests pass (modes 4 and 6).
diff --git a/.github/workflows/examples-mpi.yaml b/.github/workflows/examples-mpi.yaml
@@ -44,11 +44,6 @@ jobs:
       DEVITO_ARCH: "gcc"
       CC: "gcc"
       CXX: "g++"
-      # Make OpenMPI yield the CPU while waiting instead of busy-spinning. With
-      # 4 ipyparallel engines contending for cores, the point-to-point routing
-      # in the data notebooks otherwise livelocks under OpenMPI (IntelMPI yields
-      # by default, so it is unaffected and ignores this OMPI_* setting).
-      OMPI_MCA_mpi_yield_when_idle: "1"
 
     steps:
     - name: Checkout devito
diff --git a/devito/data/distributed/__init__.py b/devito/data/distributed/__init__.py
@@ -19,4 +19,4 @@
 from devito.data.distributed.plan import ExchangePlan  # noqa
 from devito.data.distributed.redistribution import redistribute_set  # noqa
 from devito.data.distributed.selection import Selection  # noqa
-from devito.data.distributed.transport import nbx_exchange  # noqa
+from devito.data.distributed.transport import sparse_exchange  # noqa
diff --git a/devito/data/distributed/exchange.py b/devito/data/distributed/exchange.py
@@ -62,19 +62,20 @@ class _ExchangeKey:
     """
     Hashable, content-addressed key wrapping `(data, idx)` for plan caching.
 
-    NumPy index arrays are unhashable, so the key digests their content (together
-    with the decomposition and shape the plan depends on) and lets
-    `functools.lru_cache` own eviction. Content addressing means a freshly
-    built index array with the same values still hits, and an in-place mutation
-    correctly misses.
+    Generates a signature from the `data` metadata (shape, decomposition,
+    distributor) and the `idx` content, so that the same plan is reused
+    across calls with the same index expression, even if the `data` object
+    is a different view of the same underlying buffer.
+    The signature does not include the `data` values; it depends only on
+    this metadata and the `idx` content.
     """
 
     __slots__ = ('data', 'idx', '_sig')
 
     def __init__(self, data, idx):
         self.data = data
         self.idx = idx
-        self._sig = _signature(data, idx)
+        self._sig = _signature(self.data, self.idx)
 
     def __hash__(self):
         return hash(self._sig)
diff --git a/devito/data/distributed/plan.py b/devito/data/distributed/plan.py
@@ -23,11 +23,11 @@
 import numpy as np
 
 from devito.data.distributed.selection import Affine, IndexScalar
-from devito.data.distributed.transport import nbx_exchange
+from devito.data.distributed.transport import sparse_exchange
 from devito.mpi import MPI
 from devito.tools import prod
 
-__all__ = ['ExchangePlan', 'nbx_push']
+__all__ = ['ExchangePlan', 'sparse_push']
 
 
 class ExchangePlan:
@@ -206,7 +206,7 @@ def get(self, local):
         # Send each owner the offsets of the elements we want from it...
         headers = {r: _encode(ps, self._block_offsets, dist_lin)
                    for r, (_, dist_lin) in self._peers.items()}
-        requests = nbx_exchange(comm, headers, np.int64, tag=41)
+        requests = sparse_exchange(comm, headers, np.int64, tag=41)
 
         # ...and reply to whoever asked us with the requested values
         moved = self._moved(local)
@@ -215,7 +215,7 @@ def get(self, local):
             block_offsets, dist_lin = _decode(buf)
             midx = self._owner_apply(moved, dist_lin, block_offsets)
             replies[src] = np.ascontiguousarray(moved[midx]).reshape(-1)
-        payloads = nbx_exchange(comm, replies, dtype, tag=42)
+        payloads = sparse_exchange(comm, replies, dtype, tag=42)
 
         # Scatter the received values back into result-row order
         rows_flat = np.zeros((self._nrows(), ps), dtype=dtype)
@@ -239,9 +239,9 @@ def put(self, local, value):
         """
         self._raise_on_error(check_dup=True)
         rows_flat = self._value_to_rows(value, local.dtype)
-        nbx_push(self.comm, self.layout.distributed_axes, self._repl_total,
-                 self._peers, self._block_offsets, self.payload_size, rows_flat,
-                 local)
+        sparse_push(self.comm, self.layout.distributed_axes, self._repl_total,
+                    self._peers, self._block_offsets, self.payload_size,
+                    rows_flat, local)
 
     # ------------------------------------------------------- result <-> rows
 
@@ -422,8 +422,8 @@ def _group_peers(layout, owners, dist_local, sub, gcoords):
     return peers, oob_error, dup_error
 
 
-def nbx_push(comm, distributed_axes, repl_total, peers, block_offsets,
-             payload_size, rows_flat, local):
+def sparse_push(comm, distributed_axes, repl_total, peers, block_offsets,
+                payload_size, rows_flat, local):
     """
-    Route `rows_flat` to the owner ranks (NBX) and scatter each received
+    Route `rows_flat` to the owner ranks (sparse exchange) and scatter each received
     payload into `local` at its owner-local position.
@@ -456,8 +456,8 @@ def nbx_push(comm, distributed_axes, repl_total, peers, block_offsets,
                for r, (_, dist_lin) in peers.items()}
     payloads = {r: rows_flat[rows].reshape(-1)
                 for r, (rows, _) in peers.items() if rows.size}
-    requests = nbx_exchange(comm, headers, np.int64, tag=43)
-    values = nbx_exchange(comm, payloads, rows_flat.dtype, tag=44)
+    requests = sparse_exchange(comm, headers, np.int64, tag=43)
+    values = sparse_exchange(comm, payloads, rows_flat.dtype, tag=44)
 
     # ...then scatter whatever we received into our own local array
     moved = np.moveaxis(local, distributed_axes, range(len(distributed_axes)))
diff --git a/devito/data/distributed/redistribution.py b/devito/data/distributed/redistribution.py
@@ -19,7 +19,7 @@
 import numpy as np
 
 from devito.data.distributed.layout import Layout
-from devito.data.distributed.plan import _group_peers, _resolve_owners, nbx_push
+from devito.data.distributed.plan import _group_peers, _resolve_owners, sparse_push
 from devito.data.distributed.selection import Affine, Selection
 from devito.mpi import MPI
 
@@ -121,12 +121,12 @@ def _push(layout, gcoords, values, local):
     Push `values` (one per global coordinate in `gcoords`) to their owners.
 
     A structured assignment has exactly one value per distributed point and no
-    replicated payload, so it is `nbx_push` with `payload_size == 1`
+    replicated payload, so it is `sparse_push` with `payload_size == 1`
     (`block_offsets == [0]`, `repl_total == 1`).
     """
     owners, dist_local, sub = _resolve_owners(None, layout, gcoords)
     peers, _, _ = _group_peers(layout, owners, dist_local, sub, gcoords)
 
     block_offsets = np.zeros(1, dtype=np.int64)   # no replicated payload
-    nbx_push(layout.distributor.comm, layout.distributed_axes, 1, peers,
-             block_offsets, 1, values.reshape(-1, 1), local)
+    sparse_push(layout.distributor.comm, layout.distributed_axes, 1, peers,
+                block_offsets, 1, values.reshape(-1, 1), local)
diff --git a/devito/data/distributed/transport.py b/devito/data/distributed/transport.py
@@ -2,29 +2,35 @@
 Transport layer for distributed data redistribution.
 
 This module knows nothing about indexing or `Data`; it only moves contiguous
-buffers between MPI ranks. The single primitive, `nbx_exchange`, performs
-a sparse "all-to-some" exchange in which only the ranks that actually share data
-ever communicate. It can be swapped for neighbor collectives or a persistent
-graph communicator without affecting the layers above.
+buffers between MPI ranks. The single primitive, `sparse_exchange`, performs a
+sparse "all-to-some" exchange in which only the ranks that actually share data
+exchange payloads.
+
+Each rank first learns *how many* peers will send to it via a single small
+`Reduce_scatter_block` over one integer per rank, then posts the point-to-point
+messages and receives exactly that many. This relies only on standard, widely
+portable MPI calls (no synchronous-send / nonblocking-barrier consensus), so it
+behaves uniformly across MPI implementations; the payloads themselves still move
+strictly point-to-point, so no data all-to-all takes place.
 """
 
 import numpy as np
 
 from devito.mpi import MPI
 from devito.tools import dtype_to_mpidtype
 
-__all__ = ['nbx_exchange']
+__all__ = ['sparse_exchange']
 
 
-def nbx_exchange(comm, sendbufs, dtype, tag=0):
+def sparse_exchange(comm, sendbufs, dtype, tag=0):
     """
-    Sparse "all-to-some" exchange via nonblocking consensus (NBX).
+    Sparse "all-to-some" exchange of contiguous buffers.
 
-    Each rank sends a buffer to each peer listed in `sendbufs` and receives
-    from whichever ranks send to it, without any rank needing global knowledge
-    of the communication pattern and without any dense collective. Only ranks
-    that actually exchange data communicate; global termination is detected with
-    a single nonblocking barrier (log-depth).
+    Each rank sends a buffer to each peer listed in `sendbufs` and receives from
+    whichever ranks send to it. The number of incoming messages is discovered
+    with one `Reduce_scatter_block` over a length-`nprocs` 0/1 indicator (a few
+    bytes per rank); only ranks that share data then exchange payloads, strictly
+    point-to-point.
 
     Parameters
     ----------
@@ -44,16 +50,9 @@ def nbx_exchange(comm, sendbufs, dtype, tag=0):
     dict
         Maps each source rank to the 1D buffer received from it. The caller
         reshapes using its known payload shape.
-
-    Notes
-    -----
-    Implements the NBX algorithm (Hoefler et al., "Scalable Communication
-    Protocols for Dynamic Sparse Data Exchange"). Synchronous sends (`Issend`)
-    complete only once matched by a receive, so a rank can enter the nonblocking
-    barrier only after every message it sent has been taken. Once all ranks are
-    in the barrier no message is in flight, so probing can safely stop.
     """
     rank = comm.Get_rank()
+    nprocs = comm.Get_size()
     mpitype = dtype_to_mpidtype(dtype)
 
     recvd = {}
@@ -63,32 +62,36 @@ def nbx_exchange(comm, sendbufs, dtype, tag=0):
     if local is not None and local.size:
         recvd[rank] = np.ravel(np.ascontiguousarray(local))
 
-    # Post synchronous sends to every other peer. The buffers must stay alive
-    # until the matching requests complete, hence `live_bufs`.
+    # Discover how many peers will send to this rank: the column sum of a 0/1
+    # "rank r sends to rank c" matrix, scattered so each rank gets its own count.
+    indicator = np.zeros(nprocs, dtype=np.int32)
+    for peer, buf in sendbufs.items():
+        if peer != rank and buf.size:
+            indicator[peer] = 1
+    incoming = np.zeros(1, dtype=np.int32)
+    comm.Reduce_scatter_block([indicator, MPI.INT], [incoming, MPI.INT],
+                              op=MPI.SUM)
+
+    # Post the point-to-point sends. The buffers must stay alive until the
+    # matching requests complete, hence `live_bufs`.
     sends = []
     live_bufs = []
     for peer, buf in sendbufs.items():
         if peer == rank or buf.size == 0:
             continue
         buf = np.ascontiguousarray(buf)
         live_bufs.append(buf)
-        sends.append(comm.Issend([buf, mpitype], dest=peer, tag=tag))
+        sends.append(comm.Isend([buf, mpitype], dest=peer, tag=tag))
 
-    barrier = None
+    # Receive exactly the expected number of messages, sizing each from its probe
     status = MPI.Status()
-    while True:
-        if comm.Iprobe(source=MPI.ANY_SOURCE, tag=tag, status=status):
-            src = status.Get_source()
-            count = status.Get_count(mpitype)
-            buf = np.empty(count, dtype=dtype)
-            comm.Recv([buf, mpitype], source=src, tag=tag)
-            recvd[src] = buf
-        elif barrier is None:
-            if MPI.Request.Testall(sends):
-                # All my sends were matched -> announce I am done sending
-                barrier = comm.Ibarrier()
-        elif barrier.Test():
-            # Everyone is done sending and nothing is in flight
-            break
+    for _ in range(int(incoming[0])):
+        comm.Probe(source=MPI.ANY_SOURCE, tag=tag, status=status)
+        src = status.Get_source()
+        count = status.Get_count(mpitype)
+        buf = np.empty(count, dtype=dtype)
+        comm.Recv([buf, mpitype], source=src, tag=tag)
+        recvd[src] = buf
 
+    MPI.Request.Waitall(sends)
     return recvd
diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py
@@ -103,6 +103,12 @@ def test_visible_devices(self, env_variables):
 
         eq = Eq(u, u+1)
 
+        # Prevent already-defined visible-device environment variables from
+        # interfering with the test, e.g. ROCR_VISIBLE_DEVICES being set when
+        # running on the NVIDIA CI
+        env_variables = {**{i: None for i in self.visible_device_envs},
+                         **env_variables}
+
         with switchenv(env_variables):
             op1 = Operator(eq)
             argmap1 = op1.arguments()