@@ -47,13 +47,13 @@ def mpi_allgather(base_comm: MPI.Comm,
4747 send_shapes = base_comm .allgather (send_buf .shape )
4848 (padded_send , padded_recv ) = _prepare_allgather_inputs (send_buf , send_shapes , engine = engine )
4949 recv_buffer_to_use = recv_buf if recv_buf else padded_recv
50- base_comm . Allgather ( padded_send , recv_buffer_to_use )
50+ _mpi_calls ( base_comm , " Allgather" , padded_send , recv_buffer_to_use , engine = engine )
5151 return _unroll_allgather_recv (recv_buffer_to_use , padded_send .shape , send_shapes )
5252 else :
5353 # CuPy with non-CUDA-aware MPI
5454 if recv_buf is None :
55- return base_comm . allgather ( send_buf )
56- base_comm . Allgather ( send_buf , recv_buf )
55+ return _mpi_calls ( base_comm , " allgather" , send_buf )
56+ _mpi_calls ( base_comm , " Allgather" , send_buf , recv_buf )
5757 return recv_buf
5858
5959
@@ -92,14 +92,14 @@ def mpi_allreduce(base_comm: MPI.Comm,
9292 if deps .cuda_aware_mpi_enabled or engine == "numpy" :
9393 ncp = get_module (engine )
9494 recv_buf = ncp .zeros (send_buf .size , dtype = send_buf .dtype )
95- base_comm . Allreduce ( send_buf , recv_buf , op )
95+ _mpi_calls ( base_comm , " Allreduce" , send_buf , recv_buf , op , engine = engine )
9696 return recv_buf
9797 else :
9898 # CuPy with non-CUDA-aware MPI
9999 if recv_buf is None :
100- return base_comm . allreduce ( send_buf , op )
100+ return _mpi_calls ( base_comm , " allreduce" , send_buf , op , engine = engine )
101101 # For MIN and MAX which require recv_buf
102- base_comm . Allreduce ( send_buf , recv_buf , op )
102+ _mpi_calls ( base_comm , " Allreduce" , send_buf , recv_buf , op , engine = engine )
103103 return recv_buf
104104
105105
@@ -131,11 +131,11 @@ def mpi_bcast(base_comm: MPI.Comm,
131131
132132 """
133133 if deps .cuda_aware_mpi_enabled or engine == "numpy" :
134- base_comm . Bcast ( send_buf , root = root )
134+ _mpi_calls ( base_comm , " Bcast" , send_buf , engine = engine , root = root )
135135 return send_buf
136136 # CuPy with non-CUDA-aware MPI: use object broadcast
137137 value = send_buf if base_comm .Get_rank () == root else None
138- return base_comm . bcast ( value , root = root )
138+ return _mpi_calls ( base_comm , " bcast" , value , engine = engine , root = root )
139139
140140
141141def mpi_send (base_comm : MPI .Comm ,
@@ -171,10 +171,10 @@ def mpi_send(base_comm: MPI.Comm,
171171 mpi_type = MPI ._typedict [send_buf .dtype .char ]
172172 if count is None :
173173 count = send_buf .size
174- base_comm . Send ( [send_buf , count , mpi_type ], dest = dest , tag = tag )
174+ _mpi_calls ( base_comm , " Send" , [send_buf , count , mpi_type ], engine = engine , dest = dest , tag = tag )
175175 else :
176176 # Uses CuPy without CUDA-aware MPI
177- base_comm . send ( send_buf , dest , tag )
177+ _mpi_calls ( base_comm , " send" , send_buf , dest , tag , engine = engine )
178178
179179
180180def mpi_recv (base_comm : MPI .Comm ,
@@ -219,8 +219,36 @@ def mpi_recv(base_comm: MPI.Comm,
219219 # dimension or shape-related integers are send/recv
220220 recv_buf = ncp .zeros (count , dtype = ncp .int32 )
221221 mpi_type = MPI ._typedict [recv_buf .dtype .char ]
222- base_comm . Recv ( [recv_buf , recv_buf .size , mpi_type ], source = source , tag = tag )
222+ _mpi_calls ( base_comm , " Recv" , [recv_buf , recv_buf .size , mpi_type ], engine = engine , source = source , tag = tag )
223223 else :
224224 # Uses CuPy without CUDA-aware MPI
225- recv_buf = base_comm . recv ( source = source , tag = tag )
225+ recv_buf = _mpi_calls ( base_comm , " recv" , engine = engine , source = source , tag = tag )
226226 return recv_buf
227+
228+
def _mpi_calls(comm: MPI.Comm, func: str, *args, engine: str = "numpy", **kwargs):
    """Dispatch an MPI communicator method by name, syncing the GPU first when needed.

    Thin wrapper around ``comm.<func>(*args, **kwargs)``. When the arrays live
    on a CuPy device and CUDA-aware MPI is enabled, the current CUDA device is
    synchronized before the call so that any kernels still writing the send
    buffer have completed before MPI reads the device pointer.

    Parameters
    ----------
    comm : :obj:`MPI.Comm`
        MPI Communicator.
    func : :obj:`str`
        Name of the MPI method to call (e.g. ``"Allreduce"`` or ``"bcast"``).
    args
        Positional arguments forwarded to the MPI call.
    engine : :obj:`str`, optional
        Engine used to store the array (``"numpy"`` or ``"cupy"``).
    kwargs
        Keyword arguments forwarded to the MPI call.

    Returns
    -------
    Result of the underlying MPI call.
    """
    if engine == "cupy" and deps.cuda_aware_mpi_enabled:
        # CUDA-aware MPI reads device memory directly; make sure pending
        # kernels that produced the buffers have finished first.
        ncp = get_module(engine)
        ncp.cuda.Device().synchronize()
    mpi_func = getattr(comm, func)
    return mpi_func(*args, **kwargs)
0 commit comments