Add type hints and docstrings to MPIHalo; clarify allgather helper docs

astroC86 · astroC86 · commit 5805f4f5eb75 · 2026-06-02T22:37:33.000+02:00
diff --git a/pylops_mpi/basicoperators/Halo.py b/pylops_mpi/basicoperators/Halo.py
@@ -1,4 +1,5 @@
 import math
+from typing import Any, Dict, Optional, Tuple, Union
 
 import numpy as np
 from mpi4py import MPI
@@ -8,7 +9,11 @@
 from pylops_mpi.Distributed import DistributedMixIn
 
 
-def halo_block_split(global_shape: tuple, comm, grid_shape: tuple = None) -> tuple:
+def halo_block_split(
+    global_shape: tuple,
+    comm: MPI.Comm,
+    grid_shape: Optional[tuple] = None,
+) -> tuple:
     r"""Split a global array over a Cartesian process grid.
 
     Compute the local slice owned by the calling rank when ``global_shape`` is
@@ -133,11 +138,11 @@ class MPIHalo(DistributedMixIn, MPILinearOperator):
     def __init__(
         self,
         dims: tuple,
-        halo,
-        proc_grid_shape: tuple = None,
+        halo: Union[int, tuple],
+        proc_grid_shape: Optional[tuple] = None,
         comm: MPI.Comm = MPI.COMM_WORLD,
-        dtype=np.float64,
-    ):
+        dtype: Any = np.float64,
+    ) -> None:
         self.global_dims = tuple(dims)
         self.ndim = len(dims)
 
@@ -163,7 +168,8 @@ def __init__(
         )
         super().__init__(shape=self.shape, dtype=np.dtype(dtype), base_comm=comm)
 
-    def _parse_halo(self, h):
+    def _parse_halo(self, h: Union[int, tuple]) -> tuple:
+        """Normalize halo input and trim halos at global boundaries."""
         if isinstance(h, (int, np.int64, np.int32)):
             halo = (h,) * (2 * self.ndim)
             trimmed = list(halo)
@@ -185,7 +191,8 @@ def _parse_halo(self, h):
             raise ValueError(f"Invalid halo length {len(h)} for ndim={self.ndim}")
         return halo
 
-    def _build_topo(self):
+    def _build_topo(self) -> Tuple[MPI.Comm, Dict[Tuple[str, int], int]]:
+        """Create the Cartesian communicator and map neighboring ranks on the distribution axis."""
         cart_comm = self.comm.Create_cart(
             self.proc_grid_shape,
             periods=[False] * self.ndim,
@@ -198,7 +205,8 @@ def _build_topo(self):
             neigh[("+", ax)] = after
         return cart_comm, neigh
 
-    def _calc_local_dims(self):
+    def _calc_local_dims(self) -> tuple:
+        """Compute this rank's local block shape before halo padding."""
         rank = self.cart_comm.Get_rank()
         coords = self.cart_comm.Get_coords(rank)
         local = []
@@ -211,14 +219,16 @@ def _calc_local_dims(self):
             local.append(end - start)
         return tuple(local)
 
-    def _calc_local_extent(self):
+    def _calc_local_extent(self) -> tuple:
+        """Compute this rank's local block shape after halo padding."""
         ext = []
         for ax in range(self.ndim):
             minus_halo, plus_halo = self.halo[2 * ax], self.halo[2 * ax + 1]
             ext.append(self.local_dims[ax] + minus_halo + plus_halo)
         return tuple(ext)
 
-    def _exchange_along_axis(self, ncp, arr, axis, before, after, engine):
+    def _exchange_along_axis(self, ncp: Any, arr: Any, axis: int, before: int, after: int, engine: str) -> None:
+        """Exchange boundary/halo slices with neighboring ranks along one axis."""
         minus_nbr, plus_nbr = self.neigh[("-", axis)], self.neigh[("+", axis)]
         # slice definitions
         slicer = [slice(None)] * self.ndim
@@ -259,7 +269,7 @@ def _exchange_along_axis(self, ncp, arr, axis, before, after, engine):
             )
             arr[tuple(rcv_s)] = rcv
 
-    def _matvec(self, x):
+    def _matvec(self, x: DistributedArray) -> DistributedArray:
         ncp = get_module(x.engine)
         if x.partition != Partition.SCATTER:
             raise ValueError(
@@ -295,7 +305,7 @@ def _matvec(self, x):
         y[:] = halo_arr.ravel()
         return y
 
-    def _rmatvec(self, x):
+    def _rmatvec(self, x: DistributedArray) -> DistributedArray:
         if x.partition != Partition.SCATTER:
             raise ValueError(
                 f"x should have partition={Partition.SCATTER} Got {x.partition} instead..."
diff --git a/pylops_mpi/utils/_common.py b/pylops_mpi/utils/_common.py
@@ -17,30 +17,32 @@ def _float_scalar(value) -> float:
 
 # TODO: return type annotation for both cupy and numpy
 def _prepare_allgather_inputs(send_buf, send_buf_shapes, engine):
-    r""" Prepare send_buf and recv_buf for NCCL allgather (nccl_allgather)
+    r"""Prepare send_buf and recv_buf for buffered allgather
 
-    Buffered Allgather (MPI and NCCL) requires the sending buffer to have the same size for every device.
+    Buffered Allgather (MPI and NCCL) requires the sending buffer to have the
+    same size for every rank/device.
     Therefore, padding is required when the array is not evenly partitioned across
-    all the ranks. The padding is applied such that the each dimension of the sending buffers
+    all the ranks. The padding is applied such that each dimension of the sending buffers
     is equal to the max size of that dimension across all ranks.
 
-    Similarly, each receiver buffer (recv_buf) is created with size equal to :math:n_rank \cdot send_buf.size
+    Similarly, each receiver buffer (recv_buf) is created with size equal to
+    :math:`n_{rank} \cdot \text{send\_buf.size}`
 
     Parameters
     ----------
-    send_buf : :obj: `numpy.ndarray` or `cupy.ndarray` or array-like
-        The data buffer from the local GPU to be sent for allgather.
-    send_buf_shapes: :obj:`list`
-        A list of shapes for each GPU send_buf (used to calculate padding size)
+    send_buf : :obj:`numpy.ndarray` or :obj:`cupy.ndarray` or array-like
+        The data buffer from the local rank/device to be sent for allgather.
+    send_buf_shapes : :obj:`list`
+        A list of shapes for each rank/device send_buf (used to calculate padding size)
     engine : :obj:`str`
         Engine used to store array (``numpy`` or ``cupy``)
 
     Returns
     -------
-    send_buf: :obj:`cupy.ndarray`
+    send_buf : :obj:`numpy.ndarray` or :obj:`cupy.ndarray`
         A buffer containing the data and padded elements to be sent by this rank.
-    recv_buf : :obj:`cupy.ndarray`
-        An empty, padded buffer to gather data from all GPUs.
+    recv_buf : :obj:`numpy.ndarray` or :obj:`cupy.ndarray`
+        An empty, padded buffer to gather data from all ranks.
     """
     ncp = get_module(engine)
     sizes_each_dim = list(zip(*send_buf_shapes))
@@ -60,25 +62,35 @@ def _prepare_allgather_inputs(send_buf, send_buf_shapes, engine):
 
 
 def _unroll_allgather_recv(recv_buf, padded_send_buf_shape, send_buf_shapes, displs=None) -> list:
-    r"""Unrolll recv_buf after Buffered Allgather (MPI and NCCL)
+    r"""Unroll recv_buf after Buffered Allgather (MPI and NCCL)
+
+    Depending on the provided parameters, the function either uses ``displs`` and
+    element counts to extract variable-sized chunks, or removes padding and
+    reshapes each chunk using ``padded_send_buf_shape``.
 
-    Remove the padded elements in recv_buff, extract an individual array from each device and return them as a list of arrays
-    Each GPU may send array with a different shape, so the return type has to be a list of array
-    instead of the concatenated array.
+    Each rank may send an array with a different shape, so the return type is a list of arrays
+    instead of a concatenated array.
 
     Parameters
     ----------
     recv_buf: :obj:`cupy.ndarray` or array-like
-        The data buffer returned from nccl_allgather call
-    padded_send_buf_shape: :obj:`tuple`:int
-        The size of send_buf after padding used in nccl_allgather
+        The data buffer returned from the allgather call
     send_buf_shapes: :obj:`list`
-        A list of original shapes for each GPU send_buf prior to padding
+        A list of original shapes of each rank's send_buf before any padding.
+    padded_send_buf_shape : :obj:`tuple`
+        Shape of each rank's data as stored in ``recv_buf``. This should match
+        the layout used during allgather: use the padded send buffer shape when
+        padding is applied (e.g., NCCL), or the original send buffer shape when
+        no padding is used.
+    displs : :obj:`list`, optional
+        Starting offsets in recv_buf for each rank's data, used when chunks have
+        variable sizes (e.g., mpi_allgather with displacements).
 
     Returns
     -------
-    chunks: :obj:`list`
-        A list of `cupy.ndarray` from each GPU with the padded element removed
+    chunks : :obj:`list`
+        List of arrays (NumPy or CuPy, depending on the type of ``recv_buf``), one per rank,
+        reconstructed to their original shapes with any padding removed.
     """
     ndev = len(send_buf_shapes)
     if displs is not None: