From 2f7371897a64fcd151a93fe6805598b681334205 Mon Sep 17 00:00:00 2001
From: mrava87 <matteoravasi@gmail.com>
Date: Wed, 11 Jun 2025 22:21:04 +0100
Subject: [PATCH 1/5] feat: add cuda backend to Kirchhoff

---
 pylops/waveeqprocessing/_kirchhoff_cuda.py | 488 +++++++++++++++++++++
 pylops/waveeqprocessing/kirchhoff.py       |  46 +-
 2 files changed, 522 insertions(+), 12 deletions(-)
 create mode 100644 pylops/waveeqprocessing/_kirchhoff_cuda.py

diff --git a/pylops/waveeqprocessing/_kirchhoff_cuda.py b/pylops/waveeqprocessing/_kirchhoff_cuda.py
new file mode 100644
index 000000000..4a73347e1
--- /dev/null
+++ b/pylops/waveeqprocessing/_kirchhoff_cuda.py
@@ -0,0 +1,488 @@
+from math import cos, fabs
+
+import numpy as np
+from numba import cuda
+
+from pylops.utils.backend import to_cupy
+
+
+class _KirchhoffCudaHelper:
+    """A helper class to perform Kirchhoff demigration/migration using Numba CUDA.
+
+    This class provides methods to compute the forward and adjoint operations for the
+    Kirchhoff operator, utilizing GPU acceleration through Numba's CUDA capabilities.
+
+    Parameters
+    ----------
+    ns : :obj:`int`
+        Number of sources.
+    nr : :obj:`int`
+        Number of receivers.
+    nt : :obj:`int`
+        Number of time samples.
+    ni : :obj:`int`
+        Number of image points.
+    dynamic : :obj:`bool`, optional
+        Flag indicating whether to use dynamic computation
+        (default is ``False``).
+
+    """
+
+    def __init__(self, ns, nr, nt, ni, dynamic=False):
+        self.ns, self.nr, self.nt, self.ni = ns, nr, nt, ni
+        self.dynamic = dynamic
+        self._grid_setup()
+
+    def _grid_setup(self):
+        """Set up CUDA grid and block dimensions.
+
+        This method configures the number of blocks and threads per block for
+        CUDA kernels, depending on the number of sources and receivers.
+        """
+        # use warp size as number of threads per block
+        current_device = cuda.get_current_device()
+        warp = current_device.WARP_SIZE // 2
+        self.num_threads_per_blocks = (warp, warp)
+        # configure number of blocks
+        self.num_blocks = (
+            (self.ns + self.num_threads_per_blocks[0] - 1)
+            // self.num_threads_per_blocks[0],
+            (self.nr + self.num_threads_per_blocks[1] - 1)
+            // self.num_threads_per_blocks[1],
+        )
+
+    @staticmethod
+    @cuda.jit
+    def _travsrcrec_kirch_matvec_cuda(x, y, ns, nr, nt, ni, dt, trav_srcs, trav_recs):
+        isrc, irec = cuda.grid(2)
+        if ns > isrc and nr > irec:
+            for ii in range(ni):
+                travisrc = trav_srcs[ii, isrc]
+                travirec = trav_recs[ii, irec]
+                trav = travisrc + travirec
+                itravii = int(trav / dt)
+                travdii = trav / dt - itravii
+                if 0 <= itravii < nt - 1:
+                    ind1 = (isrc * nr + irec, itravii)
+                    val1 = x[ii] * (1 - travdii)
+                    ind2 = (isrc * nr + irec, itravii + 1)
+                    val2 = x[ii] * travdii
+                    cuda.atomic.add(y, ind1, val1)
+                    cuda.atomic.add(y, ind2, val2)
+
+    @staticmethod
+    @cuda.jit
+    def _travsrcrec_kirch_rmatvec_cuda(x, y, ns, nr, nt, ni, dt, trav_srcs, trav_recs):
+        isrc, irec = cuda.grid(2)
+        if ns > isrc and nr > irec:
+            for ii in range(ni):
+                travisrc = trav_srcs[ii, isrc]
+                travirec = trav_recs[ii, irec]
+                trav = travisrc + travirec
+                itravii = int(trav / dt)
+                travdii = trav / dt - itravii
+                if 0 <= itravii < nt - 1:
+                    vii = (
+                        x[isrc * nr + irec, itravii] * (1 - travdii)
+                        + x[isrc * nr + irec, itravii + 1] * travdii
+                    )
+                    cuda.atomic.add(y, ii, vii)
+
+    @staticmethod
+    @cuda.jit
+    def _ampsrcrec_kirch_matvec_cuda(
+        x,
+        y,
+        ns,
+        nr,
+        nt,
+        ni,
+        dt,
+        vel,
+        trav_srcs,
+        trav_recs,
+        amp_srcs,
+        amp_recs,
+        aperturemin,
+        aperturemax,
+        aperturetap,
+        nz,
+        six,
+        rix,
+        angleaperturemin,
+        angleaperturemax,
+        angles_srcs,
+        angles_recs,
+    ):
+        daperture = aperturemax - aperturemin
+        dangleaperture = angleaperturemax - angleaperturemin
+
+        isrc, irec = cuda.grid(2)
+        if ns > isrc and nr > irec:
+            for ii in range(ni):
+                sixisrcrec = six[isrc * nr + irec]
+                rixisrcrec = rix[isrc * nr + irec]
+                travisrc = trav_srcs[ii, isrc]
+                travirec = trav_recs[ii, irec]
+                trav = travisrc + travirec
+                itravii = int(trav / dt)
+                travdii = trav / dt - itravii
+                ampisrc = amp_srcs[ii, isrc]
+                ampirec = amp_recs[ii, irec]
+                # extract source and receiver angle at given image point
+                angle_src = angles_srcs[ii, isrc]
+                angle_rec = angles_recs[ii, irec]
+                abs_angle_src = fabs(angle_src)
+                abs_angle_rec = fabs(angle_rec)
+                # compute cosine of half opening angle and total amplitude scaling
+                cosangle = cos((angle_src - angle_rec) / 2.0)
+                damp = 2.0 * cosangle * ampisrc * ampirec / vel[ii]
+                # identify z-index of image point
+                iz = ii % nz
+                # angle apertures checks
+                aptap = 1.0
+                if (
+                    abs_angle_src < angleaperturemax
+                    and abs_angle_rec < angleaperturemax
+                ):
+                    if abs_angle_src >= angleaperturemin:
+                        # extract source angle aperture taper value
+                        aptap = (
+                            aptap
+                            * aperturetap[
+                                int(
+                                    20
+                                    * (abs_angle_src - angleaperturemin)
+                                    // dangleaperture
+                                )
+                            ]
+                        )
+                    if abs_angle_rec >= angleaperturemin:
+                        # extract receiver angle aperture taper value
+                        aptap = (
+                            aptap
+                            * aperturetap[
+                                int(
+                                    20
+                                    * (abs_angle_rec - angleaperturemin)
+                                    // dangleaperture
+                                )
+                            ]
+                        )
+
+                    # aperture check
+                    aperture = abs(sixisrcrec - rixisrcrec) / (iz + 1)
+                    if aperture < aperturemax:
+                        if aperture >= aperturemin:
+                            # extract aperture taper value
+                            aptap = (
+                                aptap
+                                * aperturetap[
+                                    int(20 * ((aperture - aperturemin) // daperture))
+                                ]
+                            )
+                        # time limit check
+                        if 0 <= itravii < nt - 1:
+                            ind1 = (isrc * nr + irec, itravii)
+                            val1 = x[ii] * (1 - travdii) * damp * aptap
+                            ind2 = (isrc * nr + irec, itravii + 1)
+                            val2 = x[ii] * travdii * damp * aptap
+                            cuda.atomic.add(y, ind1, val1)
+                            cuda.atomic.add(y, ind2, val2)
+
+    @staticmethod
+    @cuda.jit
+    def _ampsrcrec_kirch_rmatvec_cuda(
+        x,
+        y,
+        ns,
+        nr,
+        nt,
+        ni,
+        dt,
+        vel,
+        trav_srcs,
+        trav_recs,
+        amp_srcs,
+        amp_recs,
+        aperturemin,
+        aperturemax,
+        aperturetap,
+        nz,
+        six,
+        rix,
+        angleaperturemin,
+        angleaperturemax,
+        angles_srcs,
+        angles_recs,
+    ):
+        daperture = aperturemax - aperturemin
+        dangleaperture = angleaperturemax - angleaperturemin
+
+        isrc, irec = cuda.grid(2)
+        if ns > isrc and nr > irec:
+            for ii in range(ni):
+                sixisrcrec = six[isrc * nr + irec]
+                rixisrcrec = rix[isrc * nr + irec]
+                travisrc = trav_srcs[ii, isrc]
+                travirec = trav_recs[ii, irec]
+                trav = travisrc + travirec
+                itravii = int(trav / dt)
+                travdii = trav / dt - itravii
+                ampisrc = amp_srcs[ii, isrc]
+                ampirec = amp_recs[ii, irec]
+                # extract source and receiver angle at given image point
+                angle_src = angles_srcs[ii, isrc]
+                angle_rec = angles_recs[ii, irec]
+                abs_angle_src = fabs(angle_src)
+                abs_angle_rec = fabs(angle_rec)
+                # compute cosine of half opening angle and total amplitude scaling
+                cosangle = cos((angle_src - angle_rec) / 2.0)
+                damp = 2.0 * cosangle * ampisrc * ampirec / vel[ii]
+                # identify z-index of image point
+                iz = ii % nz
+                # angle apertures checks
+                aptap = 1.0
+                if (
+                    abs_angle_src < angleaperturemax
+                    and abs_angle_rec < angleaperturemax
+                ):
+                    if abs_angle_src >= angleaperturemin:
+                        # extract source angle aperture taper value
+                        aptap = (
+                            aptap
+                            * aperturetap[
+                                int(
+                                    20
+                                    * (abs_angle_src - angleaperturemin)
+                                    // dangleaperture
+                                )
+                            ]
+                        )
+                    if abs_angle_rec >= angleaperturemin:
+                        # extract receiver angle aperture taper value
+                        aptap = (
+                            aptap
+                            * aperturetap[
+                                int(
+                                    20
+                                    * (abs_angle_rec - angleaperturemin)
+                                    // dangleaperture
+                                )
+                            ]
+                        )
+
+                    # aperture check
+                    aperture = abs(sixisrcrec - rixisrcrec) / (iz + 1)
+                    if aperture < aperturemax:
+                        if aperture >= aperturemin:
+                            # extract aperture taper value
+                            aptap = (
+                                aptap
+                                * aperturetap[
+                                    int(20 * ((aperture - aperturemin) // daperture))
+                                ]
+                            )
+                        # time limit check
+                        if 0 <= itravii < nt - 1:
+                            ind1 = ii
+                            val1 = (
+                                (
+                                    x[isrc * nr + irec, itravii] * (1 - travdii)
+                                    + x[isrc * nr + irec, itravii + 1] * travdii
+                                )
+                                * damp
+                                * aptap
+                            )
+                            cuda.atomic.add(y, ind1, val1)
+
+    def _call_kinematic(self, opt, *inputs):
+        """Kinematic-only computations using CUDA.
+
+        This method handles data preparation and execution of CUDA kernels
+        for both forward and adjoint operations of kinematic-only operator.
+
+        Parameters
+        ----------
+        opt : :obj:`str`
+            Operation type, either '_matvec' for forward or '_rmatvec'
+            for adjoint.
+        *inputs : :obj:`list`
+            List of input parameters required by the kernels.
+
+        Returns
+        -------
+        y_d : :obj:`cupy.ndarray`
+            Output data.
+
+        """
+        x_d = inputs[0]
+        y_d = inputs[1]
+        ns_d = np.int32(inputs[2])
+        nr_d = np.int32(inputs[3])
+        nt_d = np.int32(inputs[4])
+        ni_d = np.int32(inputs[5])
+        dt_d = np.float32(inputs[6])
+        trav_srcs_d = to_cupy(inputs[7])
+        trav_recs_d = to_cupy(inputs[8])
+
+        if opt == "_matvec":
+            self._travsrcrec_kirch_matvec_cuda[
+                self.num_blocks, self.num_threads_per_blocks
+            ](x_d, y_d, ns_d, nr_d, nt_d, ni_d, dt_d, trav_srcs_d, trav_recs_d)
+        elif opt == "_rmatvec":
+            self._travsrcrec_kirch_rmatvec_cuda[
+                self.num_blocks, self.num_threads_per_blocks
+            ](x_d, y_d, ns_d, nr_d, nt_d, ni_d, dt_d, trav_srcs_d, trav_recs_d)
+        cuda.synchronize()
+
+        return y_d
+
+    def _call_dynamic(self, opt, *inputs):
+        """Synamic computations using CUDA.
+
+        This method handles data preparation and execution of CUDA kernels
+        for both forward and adjoint operations of dynamic operator.
+
+        Parameters
+        ----------
+        opt : :obj:`str`
+            Operation type, either '_matvec' for forward or '_rmatvec'
+            for adjoint.
+        *inputs : :obj:`list`
+            List of input parameters required by the kernels.
+
+        Returns
+        -------
+        y_d : :obj:`cupy.ndarray`
+            Output data.
+
+        """
+        x_d = inputs[0]
+        y_d = inputs[1]
+        ns_d = np.int32(inputs[2])
+        nr_d = np.int32(inputs[3])
+        nt_d = np.int32(inputs[4])
+        ni_d = np.int32(inputs[5])
+        dt_d = np.float32(inputs[6])
+        vel_d = to_cupy(inputs[7])
+        trav_srcs_d = to_cupy(inputs[8])
+        trav_recs_d = to_cupy(inputs[9])
+        amp_srcs_d = to_cupy(inputs[10])
+        amp_recs_d = to_cupy(inputs[11])
+        aperturemin_d = np.float32(inputs[12])
+        aperturemax_d = np.float32(inputs[13])
+        aperturetap_d = to_cupy(inputs[14])
+        nz_d = np.int32(inputs[15])
+        six_d = to_cupy(inputs[16])
+        rix_d = to_cupy(inputs[17])
+        angleaperturemin_d = np.float32(inputs[18])
+        angleaperturemax_d = np.float32(inputs[19])
+        angles_srcs_d = to_cupy(inputs[20])
+        angles_recs_d = to_cupy(inputs[21])
+
+        if opt == "_matvec":
+            self._ampsrcrec_kirch_matvec_cuda[
+                self.num_blocks, self.num_threads_per_blocks
+            ](
+                x_d,
+                y_d,
+                ns_d,
+                nr_d,
+                nt_d,
+                ni_d,
+                dt_d,
+                vel_d,
+                trav_srcs_d,
+                trav_recs_d,
+                amp_srcs_d,
+                amp_recs_d,
+                aperturemin_d,
+                aperturemax_d,
+                aperturetap_d,
+                nz_d,
+                six_d,
+                rix_d,
+                angleaperturemin_d,
+                angleaperturemax_d,
+                angles_srcs_d,
+                angles_recs_d,
+            )
+        elif opt == "_rmatvec":
+            self._ampsrcrec_kirch_rmatvec_cuda[
+                self.num_blocks, self.num_threads_per_blocks
+            ](
+                x_d,
+                y_d,
+                ns_d,
+                nr_d,
+                nt_d,
+                ni_d,
+                dt_d,
+                vel_d,
+                trav_srcs_d,
+                trav_recs_d,
+                amp_srcs_d,
+                amp_recs_d,
+                aperturemin_d,
+                aperturemax_d,
+                aperturetap_d,
+                nz_d,
+                six_d,
+                rix_d,
+                angleaperturemin_d,
+                angleaperturemax_d,
+                angles_srcs_d,
+                angles_recs_d,
+            )
+        cuda.synchronize()
+
+        return y_d
+
+    def _matvec_cuda(self, *inputs):
+        """Forward with dispatching to appropriate CUDA kernels.
+
+        This method selects the appropriate kernel to execute based on the
+        computation flags, and performs the forward operation.
+
+        Parameters
+        ----------
+        *inputs : :obj:`list`
+            List of input parameters required by the kernels.
+
+        Returns
+        -------
+        y_d : :obj:`cupy.ndarray`
+            Output (seismic data) of the forward operator.
+
+        """
+        if self.dynamic:
+            y_d = self._call_dynamic("_matvec", *inputs)
+        else:
+            y_d = self._call_kinematic("_matvec", *inputs)
+
+        return y_d
+
+    def _rmatvec_cuda(self, *inputs):
+        """Adjoint with dispatching to appropriate CUDA kernels.
+
+        This method selects the appropriate kernel to execute based on the
+        computation flags, and performs the adjoint operation.
+
+        Parameters
+        ----------
+        *inputs : :obj:`list`
+            List of input parameters required by the kernels.
+
+        Returns
+        -------
+        y_d : :obj:`cupy.ndarray`
+            Output data (image) of the adjoint operator.
+
+        """
+        if self.dynamic:
+            y_d = self._call_dynamic("_rmatvec", *inputs)
+        else:
+            y_d = self._call_kinematic("_rmatvec", *inputs)
+
+        return y_d
diff --git a/pylops/waveeqprocessing/kirchhoff.py b/pylops/waveeqprocessing/kirchhoff.py
index bdd5be878..9eed7dd6b 100644
--- a/pylops/waveeqprocessing/kirchhoff.py
+++ b/pylops/waveeqprocessing/kirchhoff.py
@@ -12,6 +12,7 @@
 from pylops.signalprocessing import Convolve1D
 from pylops.utils import deps
 from pylops.utils._internal import _value_or_sized_to_array
+from pylops.utils.backend import get_array_module
 from pylops.utils.decorators import reshaped
 from pylops.utils.tapers import taper
 from pylops.utils.typing import DTypeLike, NDArray
@@ -25,6 +26,8 @@
 if jit_message is None:
     from numba import jit, prange
 
+    from ._kirchhoff_cuda import _KirchhoffCudaHelper
+
     # detect whether to use parallel or not
     numba_threads = int(os.getenv("NUMBA_NUM_THREADS", "1"))
     parallel = True if numba_threads != 1 else False
@@ -82,8 +85,8 @@ class Kirchhoff(LinearOperator):
         :math:`\lbrack (n_y) n_x n_z \times n_s n_r \rbrack` or pair of traveltime tables
         of size :math:`\lbrack (n_y) n_x n_z \times n_s \rbrack` and :math:`\lbrack (n_y) n_x n_z \times n_r \rbrack`
         (to be provided if ``mode='byot'``). Note that the latter approach is recommended as less memory demanding
-        than the former. Moreover, only ``mode='dynamic'`` is only possible when traveltimes are provided in
-        the latter form.
+        than the former. Moreover, ``mode='dynamic'`` and ``engine='cuda'`` are only possible when traveltimes are
+        provided in the latter form.
     amp : :obj:`numpy.ndarray`, optional
         .. versionadded:: 2.0.0
 
@@ -108,7 +111,8 @@ class Kirchhoff(LinearOperator):
         Deprecated, will be removed in v3.0.0. Simply kept for back-compatibility with previous implementation,
         but effectively not affecting the behaviour of the operator.
     engine : :obj:`str`, optional
-        Engine used for computations (``numpy`` or ``numba``).
+        Engine used for computations (``numpy``, ``numba`` or ``cuda``). Note that the ``cuda`` engine
+        currently supports only ``dynamic=False``.
     dtype : :obj:`str`, optional
         Type of elements in input array.
     name : :obj:`str`, optional
@@ -127,7 +131,10 @@ class Kirchhoff(LinearOperator):
     Raises
     ------
     NotImplementedError
-        If ``mode`` is neither ``analytic``, ``eikonal``, or ``byot``
+        If ``mode`` is neither ``analytic``, ``eikonal``, or ``byot``.
+
+    NotImplementedError
+        If ``dynamic=True`` and ``engine="cuda"``
 
     Notes
     -----
@@ -857,7 +864,7 @@ def _ampsrcrec_kirch_matvec(
                                 ]
                             )
 
-                        # identify x-index of image point
+                        # identify z-index of image point
                         iz = ii % nz
                         # aperture check
                         aperture = abs(sixisrcrec - rixisrcrec) / (iz + 1)
@@ -917,7 +924,7 @@ def _ampsrcrec_kirch_rmatvec(
             velii = vel[ii]
             angle_srcsii = angles_srcs[ii]
             angle_recsii = angles_recs[ii]
-            # identify x-index of image point
+            # identify z-index of image point
             iz = ii % nz
             for isrc in range(ns):
                 trav_srcii = trav_srcsii[isrc]
@@ -995,10 +1002,9 @@ def _ampsrcrec_kirch_rmatvec(
         return y
 
     def _register_multiplications(self, engine: str) -> None:
-        if engine not in ["numpy", "numba"]:
-            raise KeyError("engine must be numpy or numba")
+        if engine not in ["numpy", "numba", "cuda"]:
+            raise KeyError("engine must be numpy or numba or cuda")
         if engine == "numba" and jit_message is None:
-            # numba
             numba_opts = dict(
                 nopython=True, nogil=True, parallel=parallel
             )  # fastmath=True,
@@ -1011,7 +1017,21 @@ def _register_multiplications(self, engine: str) -> None:
             elif not self.travsrcrec:
                 self._kirch_matvec = jit(**numba_opts)(self._trav_kirch_matvec)
                 self._kirch_rmatvec = jit(**numba_opts)(self._trav_kirch_rmatvec)
-
+        elif engine == "cuda":
+            if self.dynamic and self.travsrcrec:
+                self.cuda_helper = _KirchhoffCudaHelper(
+                    self.ns, self.nr, self.nt, self.ni, True
+                )
+            elif self.travsrcrec:
+                self.cuda_helper = _KirchhoffCudaHelper(
+                    self.ns, self.nr, self.nt, self.ni, False
+                )
+            elif not self.travsrcrec:
+                raise NotImplementedError(
+                    "cuda not implemented for traveltimes " "provided in one table"
+                )
+            self._kirch_matvec = self.cuda_helper._matvec_cuda
+            self._kirch_rmatvec = self.cuda_helper._rmatvec_cuda
         else:
             if engine == "numba" and jit_message is not None:
                 logging.warning(jit_message)
@@ -1027,7 +1047,8 @@ def _register_multiplications(self, engine: str) -> None:
 
     @reshaped
     def _matvec(self, x: NDArray) -> NDArray:
-        y = np.zeros((self.nsnr, self.nt), dtype=self.dtype)
+        ncp = get_array_module(x)
+        y = ncp.zeros((self.nsnr, self.nt), dtype=self.dtype)
         if self.dynamic and self.travsrcrec:
             inputs = (
                 x.ravel(),
@@ -1074,9 +1095,10 @@ def _matvec(self, x: NDArray) -> NDArray:
 
     @reshaped
     def _rmatvec(self, x: NDArray) -> NDArray:
+        ncp = get_array_module(x)
         x = self.cop._rmatvec(x.ravel())
         x = x.reshape(self.nsnr, self.nt)
-        y = np.zeros(self.ni, dtype=self.dtype)
+        y = ncp.zeros(self.ni, dtype=self.dtype)
         if self.dynamic and self.travsrcrec:
             inputs = (
                 x,

From b944dcb92d1831f7ba3180cede83c2f471ab44e3 Mon Sep 17 00:00:00 2001
From: mrava87 <matteoravasi@gmail.com>
Date: Wed, 11 Jun 2025 23:11:28 +0100
Subject: [PATCH 2/5] minor: fix error message

---
 pylops/waveeqprocessing/kirchhoff.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pylops/waveeqprocessing/kirchhoff.py b/pylops/waveeqprocessing/kirchhoff.py
index 9eed7dd6b..3f8e0a10f 100644
--- a/pylops/waveeqprocessing/kirchhoff.py
+++ b/pylops/waveeqprocessing/kirchhoff.py
@@ -1028,7 +1028,8 @@ def _register_multiplications(self, engine: str) -> None:
                 )
             elif not self.travsrcrec:
                 raise NotImplementedError(
-                    "cuda not implemented for traveltimes " "provided in one table"
+                    "engine='cuda' not implemented for traveltimes "
+                    "provided in one table"
                 )
             self._kirch_matvec = self.cuda_helper._matvec_cuda
             self._kirch_rmatvec = self.cuda_helper._rmatvec_cuda

From b7cdbaba7b591e001cd6b4374f68b7ab6a9f85bb Mon Sep 17 00:00:00 2001
From: mrava87 <matteoravasi@gmail.com>
Date: Wed, 18 Jun 2025 23:12:24 +0100
Subject: [PATCH 3/5] test: added cuda tests for Kirchhoff

---
 pytests/test_kirchhoff.py | 128 ++++++++++++++++++++++++--------------
 1 file changed, 82 insertions(+), 46 deletions(-)

diff --git a/pytests/test_kirchhoff.py b/pytests/test_kirchhoff.py
index 50de034bd..4a37008b6 100644
--- a/pytests/test_kirchhoff.py
+++ b/pytests/test_kirchhoff.py
@@ -1,8 +1,19 @@
 import os
 
 import numpy as np
+
+if int(os.environ.get("TEST_CUPY_PYLOPS", 0)):
+    import cupy as np
+    from cupy.testing import assert_array_almost_equal
+
+    backend = "cuda"
+else:
+    import numpy as np
+    from numpy.testing import assert_array_almost_equal
+
+    backend = "numpy"
+import numpy as npp
 import pytest
-from numpy.testing import assert_array_almost_equal
 
 from pylops.utils import dottest
 from pylops.utils.wavelets import ricker
@@ -34,22 +45,22 @@
     skfmm_enabled = False
 
 v0 = 500
-y = np.arange(PAR["ny"]) * PAR["dy"]
-x = np.arange(PAR["nx"]) * PAR["dx"]
-z = np.arange(PAR["nz"]) * PAR["dz"]
-t = np.arange(PAR["nt"]) * PAR["dt"]
-
-sy = np.linspace(y.min(), y.max(), PAR["nsy"])
-sx = np.linspace(x.min(), x.max(), PAR["nsx"])
-syy, sxx = np.meshgrid(sy, sx, indexing="ij")
-s2d = np.vstack((sx, 2 * np.ones(PAR["nsx"])))
-s3d = np.vstack((syy.ravel(), sxx.ravel(), 2 * np.ones(PAR["nsx"] * PAR["nsy"])))
-
-ry = np.linspace(y.min(), y.max(), PAR["nry"])
-rx = np.linspace(x.min(), x.max(), PAR["nrx"])
-ryy, rxx = np.meshgrid(ry, rx, indexing="ij")
-r2d = np.vstack((rx, 2 * np.ones(PAR["nrx"])))
-r3d = np.vstack((ryy.ravel(), rxx.ravel(), 2 * np.ones(PAR["nrx"] * PAR["nry"])))
+y = npp.arange(PAR["ny"]) * PAR["dy"]
+x = npp.arange(PAR["nx"]) * PAR["dx"]
+z = npp.arange(PAR["nz"]) * PAR["dz"]
+t = npp.arange(PAR["nt"]) * PAR["dt"]
+
+sy = npp.linspace(y.min(), y.max(), PAR["nsy"])
+sx = npp.linspace(x.min(), x.max(), PAR["nsx"])
+syy, sxx = npp.meshgrid(sy, sx, indexing="ij")
+s2d = npp.vstack((sx, 2 * npp.ones(PAR["nsx"])))
+s3d = npp.vstack((syy.ravel(), sxx.ravel(), 2 * npp.ones(PAR["nsx"] * PAR["nsy"])))
+
+ry = npp.linspace(y.min(), y.max(), PAR["nry"])
+rx = npp.linspace(x.min(), x.max(), PAR["nrx"])
+ryy, rxx = npp.meshgrid(ry, rx, indexing="ij")
+r2d = npp.vstack((rx, 2 * npp.ones(PAR["nrx"])))
+r3d = npp.vstack((ryy.ravel(), rxx.ravel(), 2 * npp.ones(PAR["nrx"] * PAR["nry"])))
 
 wav, _, wavc = ricker(t[:21], f0=40)
 
@@ -157,7 +168,7 @@ def test_traveltime_table():
             trav_ana,
             trav_srcs_ana,
             trav_recs_ana,
-            dist_ana,
+            _,
             _,
             _,
         ) = Kirchhoff._traveltime_table(z, x, s2d, r2d, v0, mode="analytic")
@@ -166,7 +177,7 @@ def test_traveltime_table():
             trav_eik,
             trav_srcs_eik,
             trav_recs_eik,
-            dist_eik,
+            _,
             _,
             _,
         ) = Kirchhoff._traveltime_table(
@@ -181,20 +192,13 @@ def test_traveltime_table():
         (
             trav_srcs_ana,
             trav_recs_ana,
-            dist_srcs_ana,
-            dist_recs_ana,
             _,
             _,
-        ) = Kirchhoff._traveltime_table(z, x, s3d, r3d, v0, y=y, mode="analytic")
-
-        (
-            trav_srcs_eik,
-            trav_recs_eik,
-            dist_srcs_eik,
-            dist_recs_eik,
             _,
             _,
-        ) = Kirchhoff._traveltime_table(
+        ) = Kirchhoff._traveltime_table(z, x, s3d, r3d, v0, y=y, mode="analytic")
+
+        (trav_srcs_eik, trav_recs_eik, _, _, _, _,) = Kirchhoff._traveltime_table(
             z,
             x,
             s3d,
@@ -206,12 +210,8 @@ def test_traveltime_table():
 
         assert_array_almost_equal(trav_srcs_ana, trav_srcs_eik, decimal=2)
         assert_array_almost_equal(trav_recs_ana, trav_recs_eik, decimal=2)
-        assert_array_almost_equal(trav_ana, trav_eik, decimal=2)
 
 
-@pytest.mark.skipif(
-    int(os.environ.get("TEST_CUPY_PYLOPS", 0)) == 1, reason="Not CuPy enabled"
-)
 @pytest.mark.parametrize("par", [(par1), (par2), (par3), (par1d), (par2d), (par3d)])
 def test_kirchhoff2d(par):
     """Dot-test for Kirchhoff operator"""
@@ -226,8 +226,6 @@ def test_kirchhoff2d(par):
         ) + trav_recs.reshape(PAR["nx"] * PAR["nz"], 1, PAR["nrx"])
         trav = trav.reshape(PAR["nx"] * PAR["nz"], PAR["nsx"] * PAR["nrx"])
         amp = None
-        # amp = 1 / (dist + 1e-2 * dist.max())
-
     else:
         trav = None
         amp = None
@@ -240,19 +238,31 @@ def test_kirchhoff2d(par):
             s2d,
             r2d,
             vel if par["mode"] == "eikonal" else v0,
-            wav,
+            np.asarray(wav),
             wavc,
             y=None,
             trav=trav,
             amp=amp,
             mode=par["mode"],
+            engine=backend,
+        )
+        if par["mode"] == "byot":
+            Dop.trav = np.asarray(Dop.trav)
+        else:
+            Dop.trav_srcs = np.asarray(Dop.trav_srcs)
+            Dop.trav_recs = np.asarray(Dop.trav_recs)
+        if par["mode"] == "dynamic":
+            Dop.amp_srcs = np.asarray(Dop.amp_srcs)
+            Dop.amp_recs = np.asarray(Dop.amp_recs)
+
+        assert dottest(
+            Dop,
+            PAR["nsx"] * PAR["nrx"] * PAR["nt"],
+            PAR["nz"] * PAR["nx"],
+            backend=backend,
         )
-        assert dottest(Dop, PAR["nsx"] * PAR["nrx"] * PAR["nt"], PAR["nz"] * PAR["nx"])
 
 
-@pytest.mark.skipif(
-    int(os.environ.get("TEST_CUPY_PYLOPS", 0)) == 1, reason="Not CuPy enabled"
-)
 @pytest.mark.parametrize("par", [(par1), (par2), (par3)])
 def test_kirchhoff3d(par):
     """Dot-test for Kirchhoff operator"""
@@ -287,17 +297,22 @@ def test_kirchhoff3d(par):
             y=y,
             trav=trav,
             mode=par["mode"],
+            engine=backend,
         )
+        if par["mode"] == "byot":
+            Dop.trav = np.asarray(Dop.trav)
+        else:
+            Dop.trav_srcs = np.asarray(Dop.trav_srcs)
+            Dop.trav_recs = np.asarray(Dop.trav_recs)
+
         assert dottest(
             Dop,
             PAR["nsx"] * PAR["nrx"] * PAR["nsy"] * PAR["nry"] * PAR["nt"],
             PAR["nz"] * PAR["nx"] * PAR["ny"],
+            backend=backend,
         )
 
 
-@pytest.mark.skipif(
-    int(os.environ.get("TEST_CUPY_PYLOPS", 0)) == 1, reason="Not CuPy enabled"
-)
 @pytest.mark.parametrize(
     "par",
     [
@@ -323,7 +338,13 @@ def test_kirchhoff2d_trav_vs_travsrcrec(par):
         mode=par["mode"],
         dynamic=par["dynamic"],
         angleaperture=None,
+        engine=backend,
     )
+    Dop.trav_srcs = np.asarray(Dop.trav_srcs)
+    Dop.trav_recs = np.asarray(Dop.trav_recs)
+    if par["dynamic"]:
+        Dop.amp_srcs = np.asarray(Dop.amp_srcs)
+        Dop.amp_recs = np.asarray(Dop.amp_recs)
 
     # old behaviour
     trav = Dop.trav_srcs.reshape(
@@ -348,7 +369,13 @@ def test_kirchhoff2d_trav_vs_travsrcrec(par):
         mode=par["mode"],
         dynamic=par["dynamic"],
         angleaperture=None,
+        engine=backend,
     )
+    D1op.trav_srcs = np.asarray(D1op.trav_srcs)
+    D1op.trav_recs = np.asarray(D1op.trav_recs)
+    if par["dynamic"]:
+        D1op.amp_srcs = np.asarray(D1op.amp_srcs)
+        D1op.amp_recs = np.asarray(D1op.amp_recs)
 
     # forward
     xx = np.random.normal(0, 1, PAR["nx"] * PAR["nz"])
@@ -359,9 +386,6 @@ def test_kirchhoff2d_trav_vs_travsrcrec(par):
     assert_array_almost_equal(Dop.H @ yy, D1op.H @ yy, decimal=2)
 
 
-@pytest.mark.skipif(
-    int(os.environ.get("TEST_CUPY_PYLOPS", 0)) == 1, reason="Not CuPy enabled"
-)
 @pytest.mark.parametrize(
     "par",
     [
@@ -384,7 +408,13 @@ def test_kirchhoff3d_trav_vs_travsrcrec(par):
         wavc,
         y=y,
         mode=par["mode"],
+        engine=backend,
     )
+    Dop.trav_srcs = np.asarray(Dop.trav_srcs)
+    Dop.trav_recs = np.asarray(Dop.trav_recs)
+    if par["dynamic"]:
+        Dop.amp_srcs = np.asarray(Dop.amp_srcs)
+        Dop.amp_recs = np.asarray(Dop.amp_recs)
 
     # old behaviour
     trav = Dop.trav_srcs.reshape(
@@ -409,7 +439,13 @@ def test_kirchhoff3d_trav_vs_travsrcrec(par):
         y=y,
         trav=trav,
         mode=par["mode"],
+        engine=backend,
     )
+    D1op.trav_srcs = np.asarray(D1op.trav_srcs)
+    D1op.trav_recs = np.asarray(D1op.trav_recs)
+    if par["dynamic"]:
+        D1op.amp_srcs = np.asarray(D1op.amp_srcs)
+        D1op.amp_recs = np.asarray(D1op.amp_recs)
 
     # forward
     xx = np.random.normal(0, 1, PAR["ny"] * PAR["nx"] * PAR["nz"])

From b66656777ddd987633940d60dca5a00267f6c6ae Mon Sep 17 00:00:00 2001
From: mrava87 <matteoravasi@gmail.com>
Date: Wed, 18 Jun 2025 23:29:08 +0100
Subject: [PATCH 4/5] test: fix test_kirchhoff for cuda engine

---
 pytests/test_kirchhoff.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/pytests/test_kirchhoff.py b/pytests/test_kirchhoff.py
index 4a37008b6..da672b0dd 100644
--- a/pytests/test_kirchhoff.py
+++ b/pytests/test_kirchhoff.py
@@ -6,7 +6,7 @@
     import cupy as np
     from cupy.testing import assert_array_almost_equal
 
-    backend = "cuda"
+    backend = "cupy"
 else:
     import numpy as np
     from numpy.testing import assert_array_almost_equal
@@ -215,6 +215,8 @@ def test_traveltime_table():
 @pytest.mark.parametrize("par", [(par1), (par2), (par3), (par1d), (par2d), (par3d)])
 def test_kirchhoff2d(par):
     """Dot-test for Kirchhoff operator"""
+    if backend == "cupy" and par["mode"] == "byot":
+        pytest.skip("cuda engine not available for single trav table")
     vel = v0 * np.ones((PAR["nx"], PAR["nz"]))
 
     if par["mode"] == "byot":
@@ -244,7 +246,7 @@ def test_kirchhoff2d(par):
             trav=trav,
             amp=amp,
             mode=par["mode"],
-            engine=backend,
+            engine="numpy" if backend == "numpy" else "cuda",
         )
         if par["mode"] == "byot":
             Dop.trav = np.asarray(Dop.trav)
@@ -266,6 +268,8 @@ def test_kirchhoff2d(par):
 @pytest.mark.parametrize("par", [(par1), (par2), (par3)])
 def test_kirchhoff3d(par):
     """Dot-test for Kirchhoff operator"""
+    if backend == "cupy" and par["mode"] == "byot":
+        pytest.skip("cuda engine not available for single trav table")
     vel = v0 * np.ones((PAR["ny"], PAR["nx"], PAR["nz"]))
 
     if par["mode"] == "byot":
@@ -297,7 +301,7 @@ def test_kirchhoff3d(par):
             y=y,
             trav=trav,
             mode=par["mode"],
-            engine=backend,
+            engine="numpy" if backend == "numpy" else "cuda",
         )
         if par["mode"] == "byot":
             Dop.trav = np.asarray(Dop.trav)
@@ -338,7 +342,7 @@ def test_kirchhoff2d_trav_vs_travsrcrec(par):
         mode=par["mode"],
         dynamic=par["dynamic"],
         angleaperture=None,
-        engine=backend,
+        engine="numpy" if backend == "numpy" else "cuda",
     )
     Dop.trav_srcs = np.asarray(Dop.trav_srcs)
     Dop.trav_recs = np.asarray(Dop.trav_recs)
@@ -369,7 +373,7 @@ def test_kirchhoff2d_trav_vs_travsrcrec(par):
         mode=par["mode"],
         dynamic=par["dynamic"],
         angleaperture=None,
-        engine=backend,
+        engine="numpy" if backend == "numpy" else "cuda",
     )
     D1op.trav_srcs = np.asarray(D1op.trav_srcs)
     D1op.trav_recs = np.asarray(D1op.trav_recs)
@@ -408,7 +412,7 @@ def test_kirchhoff3d_trav_vs_travsrcrec(par):
         wavc,
         y=y,
         mode=par["mode"],
-        engine=backend,
+        engine="numpy" if backend == "numpy" else "cuda",
     )
     Dop.trav_srcs = np.asarray(Dop.trav_srcs)
     Dop.trav_recs = np.asarray(Dop.trav_recs)
@@ -439,7 +443,7 @@ def test_kirchhoff3d_trav_vs_travsrcrec(par):
         y=y,
         trav=trav,
         mode=par["mode"],
-        engine=backend,
+        engine="numpy" if backend == "numpy" else "cuda",
     )
     D1op.trav_srcs = np.asarray(D1op.trav_srcs)
     D1op.trav_recs = np.asarray(D1op.trav_recs)

From 768f9c55f1fe411d730818123d480f50cc639369 Mon Sep 17 00:00:00 2001
From: mrava87 <matteoravasi@gmail.com>
Date: Thu, 19 Jun 2025 22:34:24 +0100
Subject: [PATCH 5/5] doc: minor updates to docstring removing mention of
 dynamic not implemented with cuda

---
 pylops/waveeqprocessing/_kirchhoff_cuda.py | 11 ++++++-----
 pylops/waveeqprocessing/kirchhoff.py       |  5 ++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/pylops/waveeqprocessing/_kirchhoff_cuda.py b/pylops/waveeqprocessing/_kirchhoff_cuda.py
index 4a73347e1..f987960f5 100644
--- a/pylops/waveeqprocessing/_kirchhoff_cuda.py
+++ b/pylops/waveeqprocessing/_kirchhoff_cuda.py
@@ -7,10 +7,10 @@
 
 
 class _KirchhoffCudaHelper:
-    """A helper class to perform Kirchhoff demigration/migration using Numba CUDA.
+    """Helper class for Kirchhoff operator using Numba CUDA.
 
-    This class provides methods to compute the forward and adjoint operations for the
-    Kirchhoff operator, utilizing GPU acceleration through Numba's CUDA capabilities.
+    This class provides methods to compute the forward and adjoint operations of the
+    Kirchhoff operator utilizing GPU acceleration through Numba's CUDA capabilities.
 
     Parameters
     ----------
@@ -39,7 +39,8 @@ def _grid_setup(self):
         This method configures the number of blocks and threads per block for
         CUDA kernels, depending on the number of sources and receivers.
         """
-        # use warp size as number of threads per block
+        # use half of warp size as number of threads per block
+        # although this will be ideally lifted up to the user
         current_device = cuda.get_current_device()
         warp = current_device.WARP_SIZE // 2
         self.num_threads_per_blocks = (warp, warp)
@@ -339,7 +340,7 @@ def _call_kinematic(self, opt, *inputs):
         return y_d
 
     def _call_dynamic(self, opt, *inputs):
-        """Synamic computations using CUDA.
+        """Dynamic computations using CUDA.
 
         This method handles data preparation and execution of CUDA kernels
         for both forward and adjoint operations of dynamic operator.
diff --git a/pylops/waveeqprocessing/kirchhoff.py b/pylops/waveeqprocessing/kirchhoff.py
index 3f8e0a10f..a3aa9a22b 100644
--- a/pylops/waveeqprocessing/kirchhoff.py
+++ b/pylops/waveeqprocessing/kirchhoff.py
@@ -111,8 +111,7 @@ class Kirchhoff(LinearOperator):
         Deprecated, will be removed in v3.0.0. Simply kept for back-compatibility with previous implementation,
         but effectively not affecting the behaviour of the operator.
     engine : :obj:`str`, optional
-        Engine used for computations (``numpy``, ``numba`` or ``cuda``). Note that the ``cuda`` engine
-        currently supports only ``dynamic=False``.
+        Engine used for computations (``numpy``, ``numba`` or ``cuda``).
     dtype : :obj:`str`, optional
         Type of elements in input array.
     name : :obj:`str`, optional
@@ -134,7 +133,7 @@ class Kirchhoff(LinearOperator):
         If ``mode`` is neither ``analytic``, ``eikonal``, or ``byot``.
 
     NotImplementedError
-        If ``dynamic=True`` and ``engine="cuda"``
+        If ``engine="cuda"`` and ``trav`` is provided as a single table
 
     Notes
     -----