cudapy/test_multigpu

acosmicflamingo · acosmicflamingo · commit b1d58f9845a4 · 2026-05-09T18:23:50.000-05:00
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py b/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py
@@ -1,145 +1,143 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
-from numba import cuda
-import numpy as np
-from numba.cuda.testing import skip_on_cudasim, CUDATestCase
 import threading
-import unittest
 
+import numpy as np
+import pytest
 
-class TestMultiGPUContext(CUDATestCase):
-    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
-    def test_multigpu_context(self):
-        @cuda.jit("void(float64[:], float64[:])")
-        def copy_plus_1(inp, out):
-            i = cuda.grid(1)
-            if i < out.size:
-                out[i] = inp[i] + 1
+from numba import cuda
+from numba.cuda.testing import skip_on_cudasim
 
-        def check(inp, out):
-            np.testing.assert_equal(inp + 1, out)
 
-        N = 32
-        A = np.arange(N, dtype=np.float64)
-        B = np.arange(N, dtype=np.float64)
+@pytest.mark.skipif(len(cuda.gpus) < 2, reason="need more than 1 gpus")
+def test_multigpu_context():
+    @cuda.jit("void(float64[:], float64[:])")
+    def copy_plus_1(inp, out):
+        i = cuda.grid(1)
+        if i < out.size:
+            out[i] = inp[i] + 1
 
-        with cuda.gpus[0]:
-            copy_plus_1[1, N](A, B)
+    def check(inp, out):
+        np.testing.assert_equal(inp + 1, out)
 
-        check(A, B)
+    N = 32
+    A = np.arange(N, dtype=np.float64)
+    B = np.arange(N, dtype=np.float64)
 
+    with cuda.gpus[0]:
         copy_plus_1[1, N](A, B)
-        check(A, B)
 
-        with cuda.gpus[0]:
-            A0 = np.arange(N, dtype=np.float64)
-            B0 = np.arange(N, dtype=np.float64)
-            copy_plus_1[1, N](A0, B0)
+    check(A, B)
 
-            with cuda.gpus[1]:
-                A1 = np.arange(N, dtype=np.float64)
-                B1 = np.arange(N, dtype=np.float64)
-                copy_plus_1[1, N](A1, B1)
+    copy_plus_1[1, N](A, B)
+    check(A, B)
 
-        check(A0, B0)
-        check(A1, B1)
-
-        A = np.arange(N, dtype=np.float64)
-        B = np.arange(N, dtype=np.float64)
-        copy_plus_1[1, N](A, B)
-        check(A, B)
-
-    @skip_on_cudasim("Simulator does not support multiple threads")
-    def test_multithreaded(self):
-        def work(gpu, dA, results, ridx):
-            try:
-                with gpu:
-                    arr = dA.copy_to_host()
-
-            except Exception as e:
-                results[ridx] = e
-
-            else:
-                results[ridx] = np.all(arr == np.arange(10))
-
-        dA = cuda.to_device(np.arange(10))
-
-        nthreads = 10
-        results = [None] * nthreads
-        threads = [
-            threading.Thread(
-                target=work, args=(cuda.gpus.current, dA, results, i)
-            )
-            for i in range(nthreads)
-        ]
-        for th in threads:
-            th.start()
-
-        for th in threads:
-            th.join()
-
-        for r in results:
-            if isinstance(r, BaseException):
-                raise r
-            else:
-                self.assertTrue(r)
-
-    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
-    def test_with_context(self):
-        @cuda.jit
-        def vector_add_scalar(arr, val):
-            i = cuda.grid(1)
-            if i < arr.size:
-                arr[i] += val
-
-        hostarr = np.arange(10, dtype=np.float32)
-        with cuda.gpus[0]:
-            arr1 = cuda.to_device(hostarr)
+    with cuda.gpus[0]:
+        A0 = np.arange(N, dtype=np.float64)
+        B0 = np.arange(N, dtype=np.float64)
+        copy_plus_1[1, N](A0, B0)
 
         with cuda.gpus[1]:
-            arr2 = cuda.to_device(hostarr)
+            A1 = np.arange(N, dtype=np.float64)
+            B1 = np.arange(N, dtype=np.float64)
+            copy_plus_1[1, N](A1, B1)
 
-        with cuda.gpus[0]:
-            vector_add_scalar[1, 10](arr1, 1)
+    check(A0, B0)
+    check(A1, B1)
 
-        with cuda.gpus[1]:
-            vector_add_scalar[1, 10](arr2, 2)
+    A = np.arange(N, dtype=np.float64)
+    B = np.arange(N, dtype=np.float64)
+    copy_plus_1[1, N](A, B)
+    check(A, B)
 
-        with cuda.gpus[0]:
-            np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 1))
 
-        with cuda.gpus[1]:
-            np.testing.assert_equal(arr2.copy_to_host(), (hostarr + 2))
-
-    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
-    def test_with_context_peer_copy(self):
-        # Peer access is not always possible - for example, with one GPU in TCC
-        # mode and one in WDDM - if that is the case, this test would fail so
-        # we need to skip it.
-        with cuda.gpus[0]:
-            ctx = cuda.current_context()
-            if not ctx.can_access_peer(1):
-                self.skipTest("Peer access between GPUs disabled")
-
-        # 1. Create a range in an array
-        hostarr = np.arange(10, dtype=np.float32)
-
-        # 2. Copy range array from host -> GPU 0
-        with cuda.gpus[0]:
-            arr1 = cuda.to_device(hostarr)
-
-        # 3. Initialize a zero-filled array on GPU 1
-        with cuda.gpus[1]:
-            arr2 = cuda.to_device(np.zeros_like(hostarr))
+@skip_on_cudasim("Simulator does not support multiple threads")
+def test_multithreaded():
+    def work(gpu, dA, results, ridx):
+        try:
+            with gpu:
+                arr = dA.copy_to_host()
+
+        except Exception as e:
+            results[ridx] = e
+
+        else:
+            results[ridx] = np.all(arr == np.arange(10))
+
+    dA = cuda.to_device(np.arange(10))
+
+    nthreads = 10
+    results = [None] * nthreads
+    threads = [
+        threading.Thread(target=work, args=(cuda.gpus.current, dA, results, i))
+        for i in range(nthreads)
+    ]
+    for th in threads:
+        th.start()
+
+    for th in threads:
+        th.join()
+
+    for r in results:
+        if isinstance(r, BaseException):
+            raise r
+        else:
+            assert r
+
+
+@pytest.mark.skipif(len(cuda.gpus) < 2, reason="need more than 1 gpus")
+def test_with_context():
+    @cuda.jit
+    def vector_add_scalar(arr, val):
+        i = cuda.grid(1)
+        if i < arr.size:
+            arr[i] += val
+
+    hostarr = np.arange(10, dtype=np.float32)
+    with cuda.gpus[0]:
+        arr1 = cuda.to_device(hostarr)
+
+    with cuda.gpus[1]:
+        arr2 = cuda.to_device(hostarr)
+
+    with cuda.gpus[0]:
+        vector_add_scalar[1, 10](arr1, 1)
+
+    with cuda.gpus[1]:
+        vector_add_scalar[1, 10](arr2, 2)
+
+    with cuda.gpus[0]:
+        np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 1))
+
+    with cuda.gpus[1]:
+        np.testing.assert_equal(arr2.copy_to_host(), (hostarr + 2))
+
+
+@pytest.mark.skipif(len(cuda.gpus) < 2, reason="need more than 1 gpus")
+def test_with_context_peer_copy():
+    # Peer access is not always possible - for example, with one GPU in TCC
+    # mode and one in WDDM. If that is the case, this test would fail, so
+    # we need to skip it.
+    with cuda.gpus[0]:
+        ctx = cuda.current_context()
+        if not ctx.can_access_peer(1):
+            pytest.skip("Peer access between GPUs disabled")
+
+    # 1. Create a range in an array
+    hostarr = np.arange(10, dtype=np.float32)
 
-        with cuda.gpus[0]:
-            # 4. Copy range from GPU 0 -> GPU 1
-            arr2.copy_to_device(arr1)
+    # 2. Copy range array from host -> GPU 0
+    with cuda.gpus[0]:
+        arr1 = cuda.to_device(hostarr)
 
-            # 5. Copy range from GPU 1 -> host and check contents
-            np.testing.assert_equal(arr2.copy_to_host(), hostarr)
+    # 3. Initialize a zero-filled array on GPU 1
+    with cuda.gpus[1]:
+        arr2 = cuda.to_device(np.zeros_like(hostarr))
 
+    with cuda.gpus[0]:
+        # 4. Copy range from GPU 0 -> GPU 1
+        arr2.copy_to_device(arr1)
 
-if __name__ == "__main__":
-    unittest.main()
+        # 5. Copy range from GPU 1 -> host and check contents
+        np.testing.assert_equal(arr2.copy_to_host(), hostarr)