Skip to content

Commit 1cacf8b

Browse files
authored
Merge pull request #165 from tharittk/dual_test
Dual test for NumPy and CuPy in tests
2 parents fb4003b + af8977f commit 1cacf8b

14 files changed

Lines changed: 361 additions & 158 deletions

Makefile

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ dev-install:
2929

3030
dev-install_nccl:
3131
make pipcheck
32-
$(PIP) install -r requirements-dev.txt && $(PIP) install cupy-cuda12x nvidia-nccl-cu12 $(PIP) install -e .
32+
$(PIP) install -r requirements-dev.txt && $(PIP) install cupy-cuda12x nvidia-nccl-cu12 && $(PIP) install -e .
3333

3434
install_conda:
3535
conda env create -f environment.yml && conda activate pylops_mpi && pip install .
@@ -49,6 +49,10 @@ lint:
4949
tests:
5050
mpiexec -n $(NUM_PROCESSES) pytest tests/ --with-mpi
5151

52+
# assuming NUM_PROCESSES <= number of gpus available
53+
tests_gpu:
54+
export TEST_CUPY_PYLOPS=1 && mpiexec -n $(NUM_PROCESSES) pytest tests/ --with-mpi
55+
5256
# assuming NUM_PROCESSES <= number of gpus available
5357
tests_nccl:
5458
mpiexec -n $(NUM_PROCESSES) pytest tests_nccl/ --with-mpi

docs/source/contributing.rst

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,18 @@ that the both old and new tests pass successfully:
6969
7070
>> make tests
7171
72+
If you run PyLops-MPI with GPUs you may also do:
73+
74+
.. code-block:: bash
75+
76+
>> make tests_gpu
77+
78+
Additionally, if you have a NCCL-enabled environment, you may also check:
79+
80+
.. code-block:: bash
81+
82+
>> make tests_nccl
83+
7284
4. Make sure the ``examples`` python scripts are executed using 3 processes without any errors:
7385

7486
.. code-block:: bash
@@ -123,8 +135,11 @@ Project structure
123135
This repository is organized as follows:
124136

125137
* **pylops_mpi**: Python library containing various mpi linear operators.
126-
* **tests**: Set of tests using pytest-mpi.
138+
* **tests**: Set of tests using pytest-mpi for both CPU and GPU.
139+
* **tests_nccl** Set of tests for NCCL-enabled environment using pytest-mpi
127140
* **testdata**: Sample datasets used in tests and documentation.
128141
* **docs**: Sphinx documentation.
129142
* **examples**: Set of python script examples for each mpi linear operator to be embedded in documentation using sphinx-gallery.
130-
* **tutorials**: Set of python script tutorials to be embedded in documentation using sphinx-gallery.
143+
* **tutorials**: Set of python script tutorials (NumPy & MPI) to be embedded in documentation using sphinx-gallery.
144+
* **tutorials_cupy**: Same set of scripts as above but with CuPy & MPI
145+
* **tutorials_nccl**: Same set of scripts as above but with CuPy & NCCL

pylops_mpi/DistributedArray.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -694,14 +694,25 @@ def _compute_vector_norm(self, local_array: NDArray,
694694
recv_buf = self._allreduce_subcomm(ncp.count_nonzero(local_array, axis=axis).astype(ncp.float64))
695695
elif ord == ncp.inf:
696696
# Calculate max followed by max reduction
697-
recv_buf = self._allreduce_subcomm(ncp.max(ncp.abs(local_array), axis=axis).astype(ncp.float64),
698-
recv_buf, op=MPI.MAX)
699-
recv_buf = ncp.squeeze(recv_buf, axis=axis)
697+
# TODO (tharitt): currently CuPy + MPI does not work well with buffered communication, particularly
698+
# with MAX, MIN operator. Here we copy the array back to CPU, transfer, and copy them back to GPUs
699+
send_buf = ncp.max(ncp.abs(local_array), axis=axis).astype(ncp.float64)
700+
if self.engine == "cupy" and self.base_comm_nccl is None:
701+
recv_buf = self._allreduce_subcomm(send_buf.get(), recv_buf.get(), op=MPI.MAX)
702+
recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis))
703+
else:
704+
recv_buf = self._allreduce_subcomm(send_buf, recv_buf, op=MPI.MAX)
705+
recv_buf = ncp.squeeze(recv_buf, axis=axis)
700706
elif ord == -ncp.inf:
701707
# Calculate min followed by min reduction
702-
recv_buf = self._allreduce_subcomm(ncp.min(ncp.abs(local_array), axis=axis).astype(ncp.float64),
703-
recv_buf, op=MPI.MIN)
704-
recv_buf = ncp.squeeze(recv_buf, axis=axis)
708+
# TODO (tharitt): see the comment above in infinity norm
709+
send_buf = ncp.min(ncp.abs(local_array), axis=axis).astype(ncp.float64)
710+
if self.engine == "cupy" and self.base_comm_nccl is None:
711+
recv_buf = self._allreduce_subcomm(send_buf.get(), recv_buf.get(), op=MPI.MIN)
712+
recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis))
713+
else:
714+
recv_buf = self._allreduce_subcomm(send_buf, recv_buf, op=MPI.MIN)
715+
recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis))
705716

706717
else:
707718
recv_buf = self._allreduce_subcomm(ncp.sum(ncp.abs(ncp.float_power(local_array, ord)), axis=axis))

pylops_mpi/basicoperators/MatrixMult.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,8 @@ def _matvec(self, x: DistributedArray) -> DistributedArray:
232232
mask=x.mask,
233233
partition=Partition.SCATTER,
234234
dtype=self.dtype,
235-
base_comm=self.base_comm
235+
base_comm=self.base_comm,
236+
engine=x.engine
236237
)
237238

238239
my_own_cols = self._rank_col_lens[self.rank]
@@ -257,7 +258,8 @@ def _rmatvec(self, x: DistributedArray) -> DistributedArray:
257258
mask=x.mask,
258259
partition=Partition.SCATTER,
259260
dtype=self.dtype,
260-
base_comm=self.base_comm
261+
base_comm=self.base_comm,
262+
engine=x.engine
261263
)
262264

263265
x_arr = x.local_array.reshape((self.N, self._local_ncols)).astype(self.dtype)

tests/test_blockdiag.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,19 @@
22
Designed to run with n processes
33
$ mpiexec -n 10 pytest test_blockdiag.py --with-mpi
44
"""
5+
import os
6+
7+
if int(os.environ.get("TEST_CUPY_PYLOPS", 0)):
8+
import cupy as np
9+
from cupy.testing import assert_allclose
10+
11+
backend = "cupy"
12+
else:
13+
import numpy as np
14+
from numpy.testing import assert_allclose
15+
16+
backend = "numpy"
517
from mpi4py import MPI
6-
import numpy as np
7-
from numpy.testing import assert_allclose
818
import pytest
919

1020
import pylops
@@ -17,6 +27,10 @@
1727
par2j = {'ny': 301, 'nx': 101, 'dtype': np.complex128}
1828

1929
np.random.seed(42)
30+
rank = MPI.COMM_WORLD.Get_rank()
31+
if backend == "cupy":
32+
device_id = rank % np.cuda.runtime.getDeviceCount()
33+
np.cuda.Device(device_id).use()
2034

2135

2236
@pytest.mark.mpi(min_size=2)
@@ -27,11 +41,11 @@ def test_blockdiag(par):
2741
Op = pylops.MatrixMult(A=((rank + 1) * np.ones(shape=(par['ny'], par['nx']))).astype(par['dtype']))
2842
BDiag_MPI = pylops_mpi.MPIBlockDiag(ops=[Op, ])
2943

30-
x = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'])
44+
x = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend)
3145
x[:] = np.ones(shape=par['nx'], dtype=par['dtype'])
3246
x_global = x.asarray()
3347

34-
y = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'])
48+
y = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend)
3549
y[:] = np.ones(shape=par['ny'], dtype=par['dtype'])
3650
y_global = y.asarray()
3751

@@ -68,16 +82,16 @@ def test_stacked_blockdiag(par):
6882
FirstDeriv_MPI = pylops_mpi.MPIFirstDerivative(dims=(par['ny'], par['nx']), dtype=par['dtype'])
6983
StackedBDiag_MPI = pylops_mpi.MPIStackedBlockDiag(ops=[BDiag_MPI, FirstDeriv_MPI])
7084

71-
dist1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'])
85+
dist1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend)
7286
dist1[:] = np.ones(dist1.local_shape, dtype=par['dtype'])
73-
dist2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'])
87+
dist2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'], engine=backend)
7488
dist2[:] = np.ones(dist2.local_shape, dtype=par['dtype'])
7589
x = pylops_mpi.StackedDistributedArray(distarrays=[dist1, dist2])
7690
x_global = x.asarray()
7791

78-
dist1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'])
92+
dist1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend)
7993
dist1[:] = np.ones(dist1.local_shape, dtype=par['dtype'])
80-
dist2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'])
94+
dist2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'], engine=backend)
8195
dist2[:] = np.ones(dist2.local_shape, dtype=par['dtype'])
8296
y = pylops_mpi.StackedDistributedArray(distarrays=[dist1, dist2])
8397
y_global = y.asarray()

tests/test_derivative.py

Lines changed: 42 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,20 @@
22
Designed to run with n processes
33
$ mpiexec -n 10 pytest test_derivative.py --with-mpi
44
"""
5-
import numpy as np
5+
import os
6+
7+
if int(os.environ.get("TEST_CUPY_PYLOPS", 0)):
8+
import cupy as np
9+
from cupy.testing import assert_allclose
10+
11+
backend = "cupy"
12+
else:
13+
import numpy as np
14+
from numpy.testing import assert_allclose
15+
16+
backend = "numpy"
17+
import numpy as npp
618
from mpi4py import MPI
7-
from numpy.testing import assert_allclose
819
import pytest
920

1021
import pylops
@@ -14,6 +25,10 @@
1425
np.random.seed(42)
1526
rank = MPI.COMM_WORLD.Get_rank()
1627
size = MPI.COMM_WORLD.Get_size()
28+
if backend == "cupy":
29+
device_id = rank % np.cuda.runtime.getDeviceCount()
30+
np.cuda.Device(device_id).use()
31+
1732

1833
par1 = {
1934
"nz": 600,
@@ -189,8 +204,8 @@ def test_first_derivative_forward(par):
189204
Fop_MPI = pylops_mpi.MPIFirstDerivative(dims=par['nz'], sampling=par['dz'],
190205
kind="forward", edge=par['edge'],
191206
dtype=par['dtype'])
192-
x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'],
193-
partition=par['partition'])
207+
x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'],
208+
partition=par['partition'], engine=backend)
194209
x[:] = np.random.normal(rank, 10, x.local_shape)
195210
x_global = x.asarray()
196211
# Forward
@@ -200,7 +215,7 @@ def test_first_derivative_forward(par):
200215
y_adj_dist = Fop_MPI.H @ x
201216
y_adj = y_adj_dist.asarray()
202217
# Dot test
203-
dottest(Fop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz']))
218+
dottest(Fop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz']))
204219

205220
if rank == 0:
206221
Fop = pylops.FirstDerivative(dims=par['nz'], axis=0,
@@ -223,8 +238,8 @@ def test_first_derivative_backward(par):
223238
Fop_MPI = pylops_mpi.MPIFirstDerivative(dims=par['nz'], sampling=par['dz'],
224239
kind="backward", edge=par['edge'],
225240
dtype=par['dtype'])
226-
x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'],
227-
partition=par['partition'])
241+
x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'],
242+
partition=par['partition'], engine=backend)
228243
x[:] = np.random.normal(rank, 10, x.local_shape)
229244
x_global = x.asarray()
230245
# Forward
@@ -234,7 +249,7 @@ def test_first_derivative_backward(par):
234249
y_adj_dist = Fop_MPI.H @ x
235250
y_adj = y_adj_dist.asarray()
236251
# Dot test
237-
dottest(Fop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz']))
252+
dottest(Fop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz']))
238253

239254
if rank == 0:
240255
Fop = pylops.FirstDerivative(dims=par['nz'], axis=0,
@@ -258,8 +273,8 @@ def test_first_derivative_centered(par):
258273
Fop_MPI = pylops_mpi.MPIFirstDerivative(dims=par['nz'], sampling=par['dz'],
259274
kind="centered", edge=par['edge'],
260275
order=order, dtype=par['dtype'])
261-
x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'],
262-
partition=par['partition'])
276+
x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'],
277+
partition=par['partition'], engine=backend)
263278
x[:] = np.random.normal(rank, 10, x.local_shape)
264279
x_global = x.asarray()
265280
# Forward
@@ -269,7 +284,7 @@ def test_first_derivative_centered(par):
269284
y_adj_dist = Fop_MPI.H @ x
270285
y_adj = y_adj_dist.asarray()
271286
# Dot test
272-
dottest(Fop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz']))
287+
dottest(Fop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz']))
273288

274289
if rank == 0:
275290
Fop = pylops.FirstDerivative(dims=par['nz'], axis=0,
@@ -292,8 +307,8 @@ def test_second_derivative_forward(par):
292307
Sop_MPI = pylops_mpi.basicoperators.MPISecondDerivative(dims=par['nz'], sampling=par['dz'],
293308
kind="forward", edge=par['edge'],
294309
dtype=par['dtype'])
295-
x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'],
296-
partition=par['partition'])
310+
x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'],
311+
partition=par['partition'], engine=backend)
297312
x[:] = np.random.normal(rank, 10, x.local_shape)
298313
x_global = x.asarray()
299314
# Forward
@@ -303,7 +318,7 @@ def test_second_derivative_forward(par):
303318
y_adj_dist = Sop_MPI.H @ x
304319
y_adj = y_adj_dist.asarray()
305320
# Dot test
306-
dottest(Sop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz']))
321+
dottest(Sop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz']))
307322

308323
if rank == 0:
309324
Sop = pylops.SecondDerivative(dims=par['nz'], axis=0,
@@ -326,8 +341,8 @@ def test_second_derivative_backward(par):
326341
Sop_MPI = pylops_mpi.basicoperators.MPISecondDerivative(dims=par['nz'], sampling=par['dz'],
327342
kind="backward", edge=par['edge'],
328343
dtype=par['dtype'])
329-
x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'],
330-
partition=par['partition'])
344+
x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'],
345+
partition=par['partition'], engine=backend)
331346
x[:] = np.random.normal(rank, 10, x.local_shape)
332347
x_global = x.asarray()
333348
# Forward
@@ -337,7 +352,7 @@ def test_second_derivative_backward(par):
337352
y_adj_dist = Sop_MPI.H @ x
338353
y_adj = y_adj_dist.asarray()
339354
# Dot test
340-
dottest(Sop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz']))
355+
dottest(Sop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz']))
341356

342357
if rank == 0:
343358
Sop = pylops.SecondDerivative(dims=par['nz'], axis=0,
@@ -360,8 +375,8 @@ def test_second_derivative_centered(par):
360375
Sop_MPI = pylops_mpi.basicoperators.MPISecondDerivative(dims=par['nz'], sampling=par['dz'],
361376
kind="centered", edge=par['edge'],
362377
dtype=par['dtype'])
363-
x = pylops_mpi.DistributedArray(global_shape=np.prod(par['nz']), dtype=par['dtype'],
364-
partition=par['partition'])
378+
x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['nz']), dtype=par['dtype'],
379+
partition=par['partition'], engine=backend)
365380
x[:] = np.random.normal(rank, 10, x.local_shape)
366381
x_global = x.asarray()
367382
# Forward
@@ -371,7 +386,7 @@ def test_second_derivative_centered(par):
371386
y_adj_dist = Sop_MPI.H @ x
372387
y_adj = y_adj_dist.asarray()
373388
# Dot test
374-
dottest(Sop_MPI, x, y_dist, np.prod(par['nz']), np.prod(par['nz']))
389+
dottest(Sop_MPI, x, y_dist, npp.prod(par['nz']), npp.prod(par['nz']))
375390

376391
if rank == 0:
377392
Sop = pylops.SecondDerivative(dims=par['nz'], axis=0,
@@ -394,7 +409,7 @@ def test_laplacian(par):
394409
weights=par['weights'], sampling=par['sampling'],
395410
kind=kind, edge=par['edge'],
396411
dtype=par['dtype'])
397-
x = pylops_mpi.DistributedArray(global_shape=np.prod(par['n']), dtype=par['dtype'])
412+
x = pylops_mpi.DistributedArray(global_shape=npp.prod(par['n']), dtype=par['dtype'], engine=backend)
398413
x[:] = np.random.normal(rank, 10, x.local_shape)
399414
x_global = x.asarray()
400415
# Forward
@@ -404,7 +419,7 @@ def test_laplacian(par):
404419
y_adj_dist = Lop_MPI.H @ x
405420
y_adj = y_adj_dist.asarray()
406421
# Dot test
407-
dottest(Lop_MPI, x, y_dist, np.prod(par['n']), np.prod(par['n']))
422+
dottest(Lop_MPI, x, y_dist, npp.prod(par['n']), npp.prod(par['n']))
408423

409424
if rank == 0:
410425
Lop = pylops.Laplacian(dims=par['n'], axes=par['axes'],
@@ -426,7 +441,7 @@ def test_gradient(par):
426441
Gop_MPI = pylops_mpi.basicoperators.MPIGradient(dims=par['n'], sampling=par['sampling'],
427442
kind=kind, edge=par['edge'],
428443
dtype=par['dtype'])
429-
x_fwd = pylops_mpi.DistributedArray(global_shape=np.prod(par['n']), dtype=par['dtype'])
444+
x_fwd = pylops_mpi.DistributedArray(global_shape=npp.prod(par['n']), dtype=par['dtype'], engine=backend)
430445
x_fwd[:] = np.random.normal(rank, 10, x_fwd.local_shape)
431446
x_global = x_fwd.asarray()
432447

@@ -436,11 +451,11 @@ def test_gradient(par):
436451
y = y_dist.asarray()
437452

438453
# Adjoint
439-
x_adj_dist1 = pylops_mpi.DistributedArray(global_shape=int(np.prod(par['n'])), dtype=par['dtype'])
454+
x_adj_dist1 = pylops_mpi.DistributedArray(global_shape=int(npp.prod(par['n'])), dtype=par['dtype'], engine=backend)
440455
x_adj_dist1[:] = np.random.normal(rank, 10, x_adj_dist1.local_shape)
441-
x_adj_dist2 = pylops_mpi.DistributedArray(global_shape=int(np.prod(par['n'])), dtype=par['dtype'])
456+
x_adj_dist2 = pylops_mpi.DistributedArray(global_shape=int(npp.prod(par['n'])), dtype=par['dtype'], engine=backend)
442457
x_adj_dist2[:] = np.random.normal(rank, 20, x_adj_dist2.local_shape)
443-
x_adj_dist3 = pylops_mpi.DistributedArray(global_shape=int(np.prod(par['n'])), dtype=par['dtype'])
458+
x_adj_dist3 = pylops_mpi.DistributedArray(global_shape=int(npp.prod(par['n'])), dtype=par['dtype'], engine=backend)
444459
x_adj_dist3[:] = np.random.normal(rank, 30, x_adj_dist3.local_shape)
445460
x_adj = pylops_mpi.StackedDistributedArray(distarrays=[x_adj_dist1, x_adj_dist2, x_adj_dist3])
446461
x_adj_global = x_adj.asarray()
@@ -449,7 +464,7 @@ def test_gradient(par):
449464
y_adj = y_adj_dist.asarray()
450465

451466
# Dot test
452-
dottest(Gop_MPI, x_fwd, y_dist, len(par['n']) * np.prod(par['n']), np.prod(par['n']))
467+
dottest(Gop_MPI, x_fwd, y_dist, len(par['n']) * npp.prod(par['n']), npp.prod(par['n']))
453468

454469
if rank == 0:
455470
Gop = pylops.Gradient(dims=par['n'], sampling=par['sampling'],

0 commit comments

Comments
 (0)