Skip to content

Commit b1d58f9

Browse files
cudapy/test_multigpu
1 parent a3e686a commit b1d58f9

1 file changed

Lines changed: 118 additions & 120 deletions

File tree

Lines changed: 118 additions & 120 deletions
Original file line number | Diff line number | Diff line change
@@ -1,145 +1,143 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: BSD-2-Clause
33

4-
from numba import cuda
5-
import numpy as np
6-
from numba.cuda.testing import skip_on_cudasim, CUDATestCase
74
import threading
8-
import unittest
95

6+
import numpy as np
7+
import pytest
108

11-
class TestMultiGPUContext(CUDATestCase):
12-
@unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
13-
def test_multigpu_context(self):
14-
@cuda.jit("void(float64[:], float64[:])")
15-
def copy_plus_1(inp, out):
16-
i = cuda.grid(1)
17-
if i < out.size:
18-
out[i] = inp[i] + 1
9+
from numba import cuda
10+
from numba.cuda.testing import skip_on_cudasim
1911

20-
def check(inp, out):
21-
np.testing.assert_equal(inp + 1, out)
2212

23-
N = 32
24-
A = np.arange(N, dtype=np.float64)
25-
B = np.arange(N, dtype=np.float64)
13+
@pytest.mark.skipif(len(cuda.gpus) < 2, reason="need more than 1 gpus")
14+
def test_multigpu_context():
15+
@cuda.jit("void(float64[:], float64[:])")
16+
def copy_plus_1(inp, out):
17+
i = cuda.grid(1)
18+
if i < out.size:
19+
out[i] = inp[i] + 1
2620

27-
with cuda.gpus[0]:
28-
copy_plus_1[1, N](A, B)
21+
def check(inp, out):
22+
np.testing.assert_equal(inp + 1, out)
2923

30-
check(A, B)
24+
N = 32
25+
A = np.arange(N, dtype=np.float64)
26+
B = np.arange(N, dtype=np.float64)
3127

28+
with cuda.gpus[0]:
3229
copy_plus_1[1, N](A, B)
33-
check(A, B)
3430

35-
with cuda.gpus[0]:
36-
A0 = np.arange(N, dtype=np.float64)
37-
B0 = np.arange(N, dtype=np.float64)
38-
copy_plus_1[1, N](A0, B0)
31+
check(A, B)
3932

40-
with cuda.gpus[1]:
41-
A1 = np.arange(N, dtype=np.float64)
42-
B1 = np.arange(N, dtype=np.float64)
43-
copy_plus_1[1, N](A1, B1)
33+
copy_plus_1[1, N](A, B)
34+
check(A, B)
4435

45-
check(A0, B0)
46-
check(A1, B1)
47-
48-
A = np.arange(N, dtype=np.float64)
49-
B = np.arange(N, dtype=np.float64)
50-
copy_plus_1[1, N](A, B)
51-
check(A, B)
52-
53-
@skip_on_cudasim("Simulator does not support multiple threads")
54-
def test_multithreaded(self):
55-
def work(gpu, dA, results, ridx):
56-
try:
57-
with gpu:
58-
arr = dA.copy_to_host()
59-
60-
except Exception as e:
61-
results[ridx] = e
62-
63-
else:
64-
results[ridx] = np.all(arr == np.arange(10))
65-
66-
dA = cuda.to_device(np.arange(10))
67-
68-
nthreads = 10
69-
results = [None] * nthreads
70-
threads = [
71-
threading.Thread(
72-
target=work, args=(cuda.gpus.current, dA, results, i)
73-
)
74-
for i in range(nthreads)
75-
]
76-
for th in threads:
77-
th.start()
78-
79-
for th in threads:
80-
th.join()
81-
82-
for r in results:
83-
if isinstance(r, BaseException):
84-
raise r
85-
else:
86-
self.assertTrue(r)
87-
88-
@unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
89-
def test_with_context(self):
90-
@cuda.jit
91-
def vector_add_scalar(arr, val):
92-
i = cuda.grid(1)
93-
if i < arr.size:
94-
arr[i] += val
95-
96-
hostarr = np.arange(10, dtype=np.float32)
97-
with cuda.gpus[0]:
98-
arr1 = cuda.to_device(hostarr)
36+
with cuda.gpus[0]:
37+
A0 = np.arange(N, dtype=np.float64)
38+
B0 = np.arange(N, dtype=np.float64)
39+
copy_plus_1[1, N](A0, B0)
9940

10041
with cuda.gpus[1]:
101-
arr2 = cuda.to_device(hostarr)
42+
A1 = np.arange(N, dtype=np.float64)
43+
B1 = np.arange(N, dtype=np.float64)
44+
copy_plus_1[1, N](A1, B1)
10245

103-
with cuda.gpus[0]:
104-
vector_add_scalar[1, 10](arr1, 1)
46+
check(A0, B0)
47+
check(A1, B1)
10548

106-
with cuda.gpus[1]:
107-
vector_add_scalar[1, 10](arr2, 2)
49+
A = np.arange(N, dtype=np.float64)
50+
B = np.arange(N, dtype=np.float64)
51+
copy_plus_1[1, N](A, B)
52+
check(A, B)
10853

109-
with cuda.gpus[0]:
110-
np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 1))
11154

112-
with cuda.gpus[1]:
113-
np.testing.assert_equal(arr2.copy_to_host(), (hostarr + 2))
114-
115-
@unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
116-
def test_with_context_peer_copy(self):
117-
# Peer access is not always possible - for example, with one GPU in TCC
118-
# mode and one in WDDM - if that is the case, this test would fail so
119-
# we need to skip it.
120-
with cuda.gpus[0]:
121-
ctx = cuda.current_context()
122-
if not ctx.can_access_peer(1):
123-
self.skipTest("Peer access between GPUs disabled")
124-
125-
# 1. Create a range in an array
126-
hostarr = np.arange(10, dtype=np.float32)
127-
128-
# 2. Copy range array from host -> GPU 0
129-
with cuda.gpus[0]:
130-
arr1 = cuda.to_device(hostarr)
131-
132-
# 3. Initialize a zero-filled array on GPU 1
133-
with cuda.gpus[1]:
134-
arr2 = cuda.to_device(np.zeros_like(hostarr))
55+
@skip_on_cudasim("Simulator does not support multiple threads")
56+
def test_multithreaded():
57+
def work(gpu, dA, results, ridx):
58+
try:
59+
with gpu:
60+
arr = dA.copy_to_host()
61+
62+
except Exception as e:
63+
results[ridx] = e
64+
65+
else:
66+
results[ridx] = np.all(arr == np.arange(10))
67+
68+
dA = cuda.to_device(np.arange(10))
69+
70+
nthreads = 10
71+
results = [None] * nthreads
72+
threads = [
73+
threading.Thread(target=work, args=(cuda.gpus.current, dA, results, i))
74+
for i in range(nthreads)
75+
]
76+
for th in threads:
77+
th.start()
78+
79+
for th in threads:
80+
th.join()
81+
82+
for r in results:
83+
if isinstance(r, BaseException):
84+
raise r
85+
else:
86+
assert r
87+
88+
89+
@pytest.mark.skipif(len(cuda.gpus) < 2, reason="need more than 1 gpus")
90+
def test_with_context():
91+
@cuda.jit
92+
def vector_add_scalar(arr, val):
93+
i = cuda.grid(1)
94+
if i < arr.size:
95+
arr[i] += val
96+
97+
hostarr = np.arange(10, dtype=np.float32)
98+
with cuda.gpus[0]:
99+
arr1 = cuda.to_device(hostarr)
100+
101+
with cuda.gpus[1]:
102+
arr2 = cuda.to_device(hostarr)
103+
104+
with cuda.gpus[0]:
105+
vector_add_scalar[1, 10](arr1, 1)
106+
107+
with cuda.gpus[1]:
108+
vector_add_scalar[1, 10](arr2, 2)
109+
110+
with cuda.gpus[0]:
111+
np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 1))
112+
113+
with cuda.gpus[1]:
114+
np.testing.assert_equal(arr2.copy_to_host(), (hostarr + 2))
115+
116+
117+
@pytest.mark.skipif(len(cuda.gpus) < 2, reason="need more than 1 gpus")
118+
def test_with_context_peer_copy():
119+
# Peer access is not always possible - for example, with one GPU in TCC
120+
# mode and one in WDDM - if that is the case, this test would fail so
121+
# we need to skip it.
122+
with cuda.gpus[0]:
123+
ctx = cuda.current_context()
124+
if not ctx.can_access_peer(1):
125+
pytest.skip("Peer access between GPUs disabled")
126+
127+
# 1. Create a range in an array
128+
hostarr = np.arange(10, dtype=np.float32)
135129

136-
with cuda.gpus[0]:
137-
# 4. Copy range from GPU 0 -> GPU 1
138-
arr2.copy_to_device(arr1)
130+
# 2. Copy range array from host -> GPU 0
131+
with cuda.gpus[0]:
132+
arr1 = cuda.to_device(hostarr)
139133

140-
# 5. Copy range from GPU 1 -> host and check contents
141-
np.testing.assert_equal(arr2.copy_to_host(), hostarr)
134+
# 3. Initialize a zero-filled array on GPU 1
135+
with cuda.gpus[1]:
136+
arr2 = cuda.to_device(np.zeros_like(hostarr))
142137

138+
with cuda.gpus[0]:
139+
# 4. Copy range from GPU 0 -> GPU 1
140+
arr2.copy_to_device(arr1)
143141

144-
if __name__ == "__main__":
145-
unittest.main()
142+
# 5. Copy range from GPU 1 -> host and check contents
143+
np.testing.assert_equal(arr2.copy_to_host(), hostarr)

0 commit comments

Comments
 (0)