|
1 | 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
2 | 2 | # SPDX-License-Identifier: BSD-2-Clause |
3 | 3 |
|
4 | | -from numba import cuda |
5 | | -import numpy as np |
6 | | -from numba.cuda.testing import skip_on_cudasim, CUDATestCase |
7 | 4 | import threading |
8 | | -import unittest |
9 | 5 |
|
| 6 | +import numpy as np |
| 7 | +import pytest |
10 | 8 |
|
11 | | -class TestMultiGPUContext(CUDATestCase): |
12 | | - @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus") |
13 | | - def test_multigpu_context(self): |
14 | | - @cuda.jit("void(float64[:], float64[:])") |
15 | | - def copy_plus_1(inp, out): |
16 | | - i = cuda.grid(1) |
17 | | - if i < out.size: |
18 | | - out[i] = inp[i] + 1 |
| 9 | +from numba import cuda |
| 10 | +from numba.cuda.testing import skip_on_cudasim |
19 | 11 |
|
20 | | - def check(inp, out): |
21 | | - np.testing.assert_equal(inp + 1, out) |
22 | 12 |
|
23 | | - N = 32 |
24 | | - A = np.arange(N, dtype=np.float64) |
25 | | - B = np.arange(N, dtype=np.float64) |
| 13 | +@pytest.mark.skipif(len(cuda.gpus) < 2, reason="need more than 1 gpus") |
| 14 | +def test_multigpu_context(): |
| 15 | + @cuda.jit("void(float64[:], float64[:])") |
| 16 | + def copy_plus_1(inp, out): |
| 17 | + i = cuda.grid(1) |
| 18 | + if i < out.size: |
| 19 | + out[i] = inp[i] + 1 |
26 | 20 |
|
27 | | - with cuda.gpus[0]: |
28 | | - copy_plus_1[1, N](A, B) |
| 21 | + def check(inp, out): |
| 22 | + np.testing.assert_equal(inp + 1, out) |
29 | 23 |
|
30 | | - check(A, B) |
| 24 | + N = 32 |
| 25 | + A = np.arange(N, dtype=np.float64) |
| 26 | + B = np.arange(N, dtype=np.float64) |
31 | 27 |
|
| 28 | + with cuda.gpus[0]: |
32 | 29 | copy_plus_1[1, N](A, B) |
33 | | - check(A, B) |
34 | 30 |
|
35 | | - with cuda.gpus[0]: |
36 | | - A0 = np.arange(N, dtype=np.float64) |
37 | | - B0 = np.arange(N, dtype=np.float64) |
38 | | - copy_plus_1[1, N](A0, B0) |
| 31 | + check(A, B) |
39 | 32 |
|
40 | | - with cuda.gpus[1]: |
41 | | - A1 = np.arange(N, dtype=np.float64) |
42 | | - B1 = np.arange(N, dtype=np.float64) |
43 | | - copy_plus_1[1, N](A1, B1) |
| 33 | + copy_plus_1[1, N](A, B) |
| 34 | + check(A, B) |
44 | 35 |
|
45 | | - check(A0, B0) |
46 | | - check(A1, B1) |
47 | | - |
48 | | - A = np.arange(N, dtype=np.float64) |
49 | | - B = np.arange(N, dtype=np.float64) |
50 | | - copy_plus_1[1, N](A, B) |
51 | | - check(A, B) |
52 | | - |
53 | | - @skip_on_cudasim("Simulator does not support multiple threads") |
54 | | - def test_multithreaded(self): |
55 | | - def work(gpu, dA, results, ridx): |
56 | | - try: |
57 | | - with gpu: |
58 | | - arr = dA.copy_to_host() |
59 | | - |
60 | | - except Exception as e: |
61 | | - results[ridx] = e |
62 | | - |
63 | | - else: |
64 | | - results[ridx] = np.all(arr == np.arange(10)) |
65 | | - |
66 | | - dA = cuda.to_device(np.arange(10)) |
67 | | - |
68 | | - nthreads = 10 |
69 | | - results = [None] * nthreads |
70 | | - threads = [ |
71 | | - threading.Thread( |
72 | | - target=work, args=(cuda.gpus.current, dA, results, i) |
73 | | - ) |
74 | | - for i in range(nthreads) |
75 | | - ] |
76 | | - for th in threads: |
77 | | - th.start() |
78 | | - |
79 | | - for th in threads: |
80 | | - th.join() |
81 | | - |
82 | | - for r in results: |
83 | | - if isinstance(r, BaseException): |
84 | | - raise r |
85 | | - else: |
86 | | - self.assertTrue(r) |
87 | | - |
88 | | - @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus") |
89 | | - def test_with_context(self): |
90 | | - @cuda.jit |
91 | | - def vector_add_scalar(arr, val): |
92 | | - i = cuda.grid(1) |
93 | | - if i < arr.size: |
94 | | - arr[i] += val |
95 | | - |
96 | | - hostarr = np.arange(10, dtype=np.float32) |
97 | | - with cuda.gpus[0]: |
98 | | - arr1 = cuda.to_device(hostarr) |
| 36 | + with cuda.gpus[0]: |
| 37 | + A0 = np.arange(N, dtype=np.float64) |
| 38 | + B0 = np.arange(N, dtype=np.float64) |
| 39 | + copy_plus_1[1, N](A0, B0) |
99 | 40 |
|
100 | 41 | with cuda.gpus[1]: |
101 | | - arr2 = cuda.to_device(hostarr) |
| 42 | + A1 = np.arange(N, dtype=np.float64) |
| 43 | + B1 = np.arange(N, dtype=np.float64) |
| 44 | + copy_plus_1[1, N](A1, B1) |
102 | 45 |
|
103 | | - with cuda.gpus[0]: |
104 | | - vector_add_scalar[1, 10](arr1, 1) |
| 46 | + check(A0, B0) |
| 47 | + check(A1, B1) |
105 | 48 |
|
106 | | - with cuda.gpus[1]: |
107 | | - vector_add_scalar[1, 10](arr2, 2) |
| 49 | + A = np.arange(N, dtype=np.float64) |
| 50 | + B = np.arange(N, dtype=np.float64) |
| 51 | + copy_plus_1[1, N](A, B) |
| 52 | + check(A, B) |
108 | 53 |
|
109 | | - with cuda.gpus[0]: |
110 | | - np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 1)) |
111 | 54 |
|
112 | | - with cuda.gpus[1]: |
113 | | - np.testing.assert_equal(arr2.copy_to_host(), (hostarr + 2)) |
114 | | - |
115 | | - @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus") |
116 | | - def test_with_context_peer_copy(self): |
117 | | - # Peer access is not always possible - for example, with one GPU in TCC |
118 | | - # mode and one in WDDM - if that is the case, this test would fail so |
119 | | - # we need to skip it. |
120 | | - with cuda.gpus[0]: |
121 | | - ctx = cuda.current_context() |
122 | | - if not ctx.can_access_peer(1): |
123 | | - self.skipTest("Peer access between GPUs disabled") |
124 | | - |
125 | | - # 1. Create a range in an array |
126 | | - hostarr = np.arange(10, dtype=np.float32) |
127 | | - |
128 | | - # 2. Copy range array from host -> GPU 0 |
129 | | - with cuda.gpus[0]: |
130 | | - arr1 = cuda.to_device(hostarr) |
131 | | - |
132 | | - # 3. Initialize a zero-filled array on GPU 1 |
133 | | - with cuda.gpus[1]: |
134 | | - arr2 = cuda.to_device(np.zeros_like(hostarr)) |
| 55 | +@skip_on_cudasim("Simulator does not support multiple threads") |
| 56 | +def test_multithreaded(): |
| 57 | + def work(gpu, dA, results, ridx): |
| 58 | + try: |
| 59 | + with gpu: |
| 60 | + arr = dA.copy_to_host() |
| 61 | + |
| 62 | + except Exception as e: |
| 63 | + results[ridx] = e |
| 64 | + |
| 65 | + else: |
| 66 | + results[ridx] = np.all(arr == np.arange(10)) |
| 67 | + |
| 68 | + dA = cuda.to_device(np.arange(10)) |
| 69 | + |
| 70 | + nthreads = 10 |
| 71 | + results = [None] * nthreads |
| 72 | + threads = [ |
| 73 | + threading.Thread(target=work, args=(cuda.gpus.current, dA, results, i)) |
| 74 | + for i in range(nthreads) |
| 75 | + ] |
| 76 | + for th in threads: |
| 77 | + th.start() |
| 78 | + |
| 79 | + for th in threads: |
| 80 | + th.join() |
| 81 | + |
| 82 | + for r in results: |
| 83 | + if isinstance(r, BaseException): |
| 84 | + raise r |
| 85 | + else: |
| 86 | + assert r |
| 87 | + |
| 88 | + |
| 89 | +@pytest.mark.skipif(len(cuda.gpus) < 2, reason="need more than 1 gpus") |
| 90 | +def test_with_context(): |
| 91 | + @cuda.jit |
| 92 | + def vector_add_scalar(arr, val): |
| 93 | + i = cuda.grid(1) |
| 94 | + if i < arr.size: |
| 95 | + arr[i] += val |
| 96 | + |
| 97 | + hostarr = np.arange(10, dtype=np.float32) |
| 98 | + with cuda.gpus[0]: |
| 99 | + arr1 = cuda.to_device(hostarr) |
| 100 | + |
| 101 | + with cuda.gpus[1]: |
| 102 | + arr2 = cuda.to_device(hostarr) |
| 103 | + |
| 104 | + with cuda.gpus[0]: |
| 105 | + vector_add_scalar[1, 10](arr1, 1) |
| 106 | + |
| 107 | + with cuda.gpus[1]: |
| 108 | + vector_add_scalar[1, 10](arr2, 2) |
| 109 | + |
| 110 | + with cuda.gpus[0]: |
| 111 | + np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 1)) |
| 112 | + |
| 113 | + with cuda.gpus[1]: |
| 114 | + np.testing.assert_equal(arr2.copy_to_host(), (hostarr + 2)) |
| 115 | + |
| 116 | + |
| 117 | +@pytest.mark.skipif(len(cuda.gpus) < 2, reason="need more than 1 gpus") |
| 118 | +def test_with_context_peer_copy(): |
| 119 | + # Peer access is not always possible - for example, with one GPU in TCC |
| 120 | + # mode and one in WDDM - if that is the case, this test would fail so |
| 121 | + # we need to skip it. |
| 122 | + with cuda.gpus[0]: |
| 123 | + ctx = cuda.current_context() |
| 124 | + if not ctx.can_access_peer(1): |
| 125 | + pytest.skip("Peer access between GPUs disabled") |
| 126 | + |
| 127 | + # 1. Create a range in an array |
| 128 | + hostarr = np.arange(10, dtype=np.float32) |
135 | 129 |
|
136 | | - with cuda.gpus[0]: |
137 | | - # 4. Copy range from GPU 0 -> GPU 1 |
138 | | - arr2.copy_to_device(arr1) |
| 130 | + # 2. Copy range array from host -> GPU 0 |
| 131 | + with cuda.gpus[0]: |
| 132 | + arr1 = cuda.to_device(hostarr) |
139 | 133 |
|
140 | | - # 5. Copy range from GPU 1 -> host and check contents |
141 | | - np.testing.assert_equal(arr2.copy_to_host(), hostarr) |
| 134 | + # 3. Initialize a zero-filled array on GPU 1 |
| 135 | + with cuda.gpus[1]: |
| 136 | + arr2 = cuda.to_device(np.zeros_like(hostarr)) |
142 | 137 |
|
| 138 | + with cuda.gpus[0]: |
| 139 | + # 4. Copy range from GPU 0 -> GPU 1 |
| 140 | + arr2.copy_to_device(arr1) |
143 | 141 |
|
144 | | -if __name__ == "__main__": |
145 | | - unittest.main() |
| 142 | + # 5. Copy range from GPU 1 -> host and check contents |
| 143 | + np.testing.assert_equal(arr2.copy_to_host(), hostarr) |
0 commit comments