Skip to content

Commit 7b6da27

Browse files
committed
Replace CUDA.synchronize() with CuEvent-based synchronization
1 parent 988a92f commit 7b6da27

1 file changed

Lines changed: 36 additions & 20 deletions

File tree

ext/CUDAExt.jl

Lines changed: 36 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ if isdefined(Base, :get_extension)
1616
else
1717
import ..CUDA
1818
end
19-
import CUDA: CuDevice, CuContext, CuStream, CuArray, CUDABackend
19+
import CUDA: CuDevice, CuContext, CuStream, CuArray, CUDABackend, CuEvent
2020
import CUDA: devices, attribute, context, context!, stream, stream!
2121
import CUDA: CUBLAS, CUSOLVER
2222

@@ -103,8 +103,11 @@ function with_context(f, x)
103103
end
104104

105105
# Order the caller's current stream after the work queued so far on the stream
# that is current while running inside `with_context(x)`.
#
# This is stream-to-stream ordering via an event: `CUDA.wait(event, stream)`
# enqueues a wait on `stream`. NOTE(review): per the CUDA.jl docs this does not
# block the host thread -- confirm that no caller relies on a host-side barrier.
function _sync_with_context(x::Union{Dagger.Processor,Dagger.MemorySpace})
    # Look up the caller's stream first; inside the `do` block `stream()` refers
    # to whatever stream is current for `x`.
    waiting_stream = stream()
    return with_context(x) do
        # The event is created inside the block, presumably so that it belongs
        # to the same context as the stream it is recorded on.
        marker = CUDA.CuEvent()
        CUDA.record(marker, stream())
        CUDA.wait(marker, waiting_stream)
    end
end
110113
function sync_with_context(x::Union{Dagger.Processor,Dagger.MemorySpace})
@@ -164,7 +167,6 @@ function Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x::Chunk)
164167
cpu_data = remotecall_fetch(unwrap, from_w, x)
165168
with_context(to_proc) do
166169
arr = adapt(CuArray, cpu_data)
167-
CUDA.synchronize()
168170
return arr
169171
end
170172
end
@@ -175,7 +177,6 @@ function Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x::CuArray)
175177
with_context(to_proc) do
176178
_x = similar(x)
177179
copyto!(_x, x)
178-
CUDA.synchronize()
179180
return _x
180181
end
181182
end
@@ -184,9 +185,7 @@ end
184185
# Device-to-host move for an arbitrary value `x`: run inside `from_proc`'s
# context, synchronize, then convert `x` to host storage with `adapt(Array, x)`.
# `to_proc` is used for dispatch only.
function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CPUProc, x)
    return with_context(from_proc) do
        # Synchronize before the conversion reads `x`, so that pending device
        # work is finished first (presumably the work that produces `x`).
        CUDA.synchronize()
        adapt(Array, x)
    end
end
192191
function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CPUProc, x::Chunk)
@@ -203,26 +202,29 @@ function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CPUProc, x::CuArray{
203202
CUDA.synchronize()
204203
_x = Array{T,N}(undef, size(x))
205204
copyto!(_x, x)
206-
CUDA.synchronize()
207205
return _x
208206
end
209207
end
210208

211209
# Out-of-place DtoD
212210
function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CuArrayDeviceProc, x::Dagger.Chunk{T}) where T<:CuArray
213211
if from_proc == to_proc
214-
# Same process and GPU, no change
215-
arr = unwrap(x)
216-
with_context(CUDA.synchronize, from_proc)
217-
return arr
212+
# Same process and GPU, no change.
213+
# Stream ordering guarantees safety; no sync needed.
214+
return unwrap(x)
215+
218216
elseif Dagger.root_worker_id(from_proc) == Dagger.root_worker_id(to_proc)
219217
# Same process but different GPUs, use DtoD copy
220218
from_arr = unwrap(x)
221-
with_context(CUDA.synchronize, from_proc)
219+
ev = CUDA.CuEvent()
220+
with_context(from_proc) do
221+
CUDA.record(ev, stream())
222+
end
223+
222224
return with_context(to_proc) do
225+
CUDA.wait(ev, stream())
223226
to_arr = similar(from_arr)
224227
copyto!(to_arr, from_arr)
225-
CUDA.synchronize()
226228
return to_arr
227229
end
228230
elseif Dagger.system_uuid(from_proc.owner) == Dagger.system_uuid(to_proc.owner) && from_proc.device_uuid == to_proc.device_uuid
@@ -252,11 +254,12 @@ function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CuArrayDeviceProc, x
252254
end
253255
else
254256
return arr
255-
end
257+
end
256258
else
257259
# Different node, use DtoH, serialization, HtoD
258260
host_copy = remotecall_fetch(from_proc.owner, from_proc, x) do from_proc, x
259261
return with_context(from_proc) do
262+
CUDA.synchronize()
260263
Array(unwrap(x))
261264
end
262265
end
@@ -268,20 +271,26 @@ end
268271

269272
function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CuArrayDeviceProc, x::CuArray)
270273
if from_proc == to_proc
271-
with_context(CUDA.synchronize, from_proc)
272274
return x
273275
elseif Dagger.root_worker_id(from_proc) == Dagger.root_worker_id(to_proc)
274-
with_context(CUDA.synchronize, from_proc)
276+
ev = CUDA.CuEvent()
277+
with_context(from_proc) do
278+
CUDA.record(ev, stream())
279+
end
280+
275281
return with_context(to_proc) do
282+
CUDA.wait(ev, stream())
276283
to_arr = similar(x)
277284
copyto!(to_arr, x)
278-
CUDA.synchronize()
279285
return to_arr
280286
end
287+
281288
else
282289
host_copy = with_context(from_proc) do
290+
CUDA.synchronize()
283291
return Array(x)
284292
end
293+
285294
return with_context(to_proc) do
286295
return CuArray(host_copy)
287296
end
@@ -390,13 +399,20 @@ Dagger.gpu_kernel_backend(::CuArrayDeviceProc) = CUDABackend()
390399
Dagger.gpu_with_device(f, proc::CuArrayDeviceProc) =
391400
CUDA.device!(f, proc.device)
392401
# Order the caller's current stream after the work queued so far on the stream
# that is current inside `with_context(proc)`.
#
# NOTE(review): this is event-based stream ordering; per the CUDA.jl docs,
# `CUDA.wait(event, stream)` makes the stream wait, not the host -- confirm
# callers of `gpu_synchronize` do not need a host-side barrier.
function Dagger.gpu_synchronize(proc::CuArrayDeviceProc)
    # Capture the caller's stream before the block for `proc` runs.
    waiting_stream = stream()
    return with_context(proc) do
        # Created inside the block, presumably so the event and the stream it is
        # recorded on share a context.
        marker = CUDA.CuEvent()
        CUDA.record(marker, stream())
        CUDA.wait(marker, waiting_stream)
    end
end
397411
# Synchronize against every CUDA device visible to this process by invoking the
# per-processor `Dagger.gpu_synchronize` method once per device.
#
# The per-processor method captures the caller's stream itself, so no stream
# needs to be looked up here (the previous unused `user_stream` local was
# removed).
function Dagger.gpu_synchronize(::Val{:CUDA})
    for dev in CUDA.devices()
        # Build the processor handle for this device on the local worker.
        proc = CuArrayDeviceProc(myid(), dev.handle, CUDA.uuid(dev))
        Dagger.gpu_synchronize(proc)
    end
    return nothing
end
402418

0 commit comments

Comments
 (0)