@@ -16,7 +16,7 @@ if isdefined(Base, :get_extension)
1616else
1717 import .. CUDA
1818end
19- import CUDA: CuDevice, CuContext, CuStream, CuArray, CUDABackend
19+ import CUDA: CuDevice, CuContext, CuStream, CuArray, CUDABackend, CuEvent
2020import CUDA: devices, attribute, context, context!, stream, stream!
2121import CUDA: CUBLAS, CUSOLVER
2222
@@ -103,8 +103,11 @@ function with_context(f, x)
103103end
104104
# Order the caller's current stream after an event recorded inside `x`'s context.
#
# The stream returned by `stream()` is captured *before* entering
# `with_context(x)`. Inside the block a fresh `CuEvent` is recorded on whatever
# `stream()` returns there, and the captured stream is then told to wait on it.
#
# NOTE(review): presumably `stream()` inside `with_context(x)` is the stream
# bound to `x`'s context — confirm against `with_context`.
# NOTE(review): unlike `CUDA.synchronize()`, an event record/wait pair
# presumably does not block the host thread — confirm callers do not rely on
# host-side completion.
function _sync_with_context(x::Union{Dagger.Processor,Dagger.MemorySpace})
    waiting_stream = stream()
    with_context(x) do
        marker = CUDA.CuEvent()
        CUDA.record(marker, stream())
        CUDA.wait(marker, waiting_stream)
    end
end
110113function sync_with_context (x:: Union{Dagger.Processor,Dagger.MemorySpace} )
@@ -164,7 +167,6 @@ function Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x::Chunk)
164167 cpu_data = remotecall_fetch (unwrap, from_w, x)
165168 with_context (to_proc) do
166169 arr = adapt (CuArray, cpu_data)
167- CUDA. synchronize ()
168170 return arr
169171 end
170172end
@@ -175,7 +177,6 @@ function Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x::CuArray)
175177 with_context (to_proc) do
176178 _x = similar (x)
177179 copyto! (_x, x)
178- CUDA. synchronize ()
179180 return _x
180181 end
181182end
# Device-to-host move for a generic value `x`.
#
# Runs inside `with_context(from_proc)`: first calls `CUDA.synchronize()`, then
# converts `x` with `adapt(Array, x)` and returns the converted value.
# `to_proc` is used only for dispatch.
#
# NOTE(review): the synchronize is presumably there so pending device work
# touching `x` has finished before it is read back — confirm.
function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CPUProc, x)
    return with_context(from_proc) do
        CUDA.synchronize()
        host_value = adapt(Array, x)
        return host_value
    end
end
192191function Dagger. move (from_proc:: CuArrayDeviceProc , to_proc:: CPUProc , x:: Chunk )
@@ -203,26 +202,29 @@ function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CPUProc, x::CuArray{
203202 CUDA. synchronize ()
204203 _x = Array {T,N} (undef, size (x))
205204 copyto! (_x, x)
206- CUDA. synchronize ()
207205 return _x
208206 end
209207end
210208
211209# Out-of-place DtoD
212210function Dagger. move (from_proc:: CuArrayDeviceProc , to_proc:: CuArrayDeviceProc , x:: Dagger.Chunk{T} ) where T<: CuArray
213211 if from_proc == to_proc
214- # Same process and GPU, no change
215- arr = unwrap (x)
216- with_context (CUDA . synchronize, from_proc )
217- return arr
212+ # Same process and GPU, no change.
213+ # Stream ordering guarantees safety; no sync needed.
214+ return unwrap (x )
215+
218216 elseif Dagger. root_worker_id (from_proc) == Dagger. root_worker_id (to_proc)
219217 # Same process but different GPUs, use DtoD copy
220218 from_arr = unwrap (x)
221- with_context (CUDA. synchronize, from_proc)
219+ ev = CUDA. CuEvent ()
220+ with_context (from_proc) do
221+ CUDA. record (ev, stream ())
222+ end
223+
222224 return with_context (to_proc) do
225+ CUDA. wait (ev, stream ())
223226 to_arr = similar (from_arr)
224227 copyto! (to_arr, from_arr)
225- CUDA. synchronize ()
226228 return to_arr
227229 end
228230 elseif Dagger. system_uuid (from_proc. owner) == Dagger. system_uuid (to_proc. owner) && from_proc. device_uuid == to_proc. device_uuid
@@ -252,11 +254,12 @@ function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CuArrayDeviceProc, x
252254 end
253255 else
254256 return arr
255- end
257+ end
256258 else
257259 # Different node, use DtoH, serialization, HtoD
258260 host_copy = remotecall_fetch (from_proc. owner, from_proc, x) do from_proc, x
259261 return with_context (from_proc) do
262+ CUDA. synchronize ()
260263 Array (unwrap (x))
261264 end
262265 end
@@ -268,20 +271,26 @@ end
268271
269272function Dagger. move (from_proc:: CuArrayDeviceProc , to_proc:: CuArrayDeviceProc , x:: CuArray )
270273 if from_proc == to_proc
271- with_context (CUDA. synchronize, from_proc)
272274 return x
273275 elseif Dagger. root_worker_id (from_proc) == Dagger. root_worker_id (to_proc)
274- with_context (CUDA. synchronize, from_proc)
276+ ev = CUDA. CuEvent ()
277+ with_context (from_proc) do
278+ CUDA. record (ev, stream ())
279+ end
280+
275281 return with_context (to_proc) do
282+ CUDA. wait (ev, stream ())
276283 to_arr = similar (x)
277284 copyto! (to_arr, x)
278- CUDA. synchronize ()
279285 return to_arr
280286 end
287+
281288 else
282289 host_copy = with_context (from_proc) do
290+ CUDA. synchronize ()
283291 return Array (x)
284292 end
293+
285294 return with_context (to_proc) do
286295 return CuArray (host_copy)
287296 end
@@ -390,13 +399,20 @@ Dagger.gpu_kernel_backend(::CuArrayDeviceProc) = CUDABackend()
# Invoke `f` through `CUDA.device!`, using the device stored in `proc.device`,
# and return whatever that call returns.
function Dagger.gpu_with_device(f, proc::CuArrayDeviceProc)
    return CUDA.device!(f, proc.device)
end
# Make the caller's current stream wait on an event recorded in `proc`'s context.
#
# The caller's `stream()` is captured before entering `with_context(proc)`.
# Inside the block a new `CuEvent` is recorded on the stream returned by
# `stream()` there, and the captured stream is made to wait on that event.
#
# NOTE(review): this presumably orders streams on the device without blocking
# the host (the earlier implementation called `CUDA.synchronize()`) — confirm
# this matches what `Dagger.gpu_synchronize` callers expect.
function Dagger.gpu_synchronize(proc::CuArrayDeviceProc)
    waiting_stream = stream()
    with_context(proc) do
        marker = CUDA.CuEvent()
        CUDA.record(marker, stream())
        CUDA.wait(marker, waiting_stream)
    end
end
# Synchronize against every CUDA device visible to this process.
#
# For each device from `CUDA.devices()`, builds a `CuArrayDeviceProc` for the
# local worker (`myid()`) from the device handle and UUID, and forwards to the
# per-processor `Dagger.gpu_synchronize` method.
#
# The caller's stream is not captured here: it was never read in this method.
# NOTE(review): presumably the per-processor method captures the caller's
# stream itself — confirm against that method.
function Dagger.gpu_synchronize(::Val{:CUDA})
    for dev in CUDA.devices()
        proc = CuArrayDeviceProc(myid(), dev.handle, CUDA.uuid(dev))
        Dagger.gpu_synchronize(proc)
    end
end
402418
0 commit comments