@@ -1253,15 +1253,18 @@ contains
12531253
12541254#ifdef MFC_MPI
12551255 integer :: i, j, k, pack_pos, unpack_pos, buf_size, ierr
1256- integer :: send_neighbor, recv_neighbor, recv_count, pid, tag
1257- real(wp), dimension(3) :: fval, tval
1256+ integer :: send_neighbor, recv_neighbor, recv_count, tag
12581257 real(wp), allocatable :: recv_forces_snap(:,:), recv_torques_snap(:,:)
12591258 character(len=1), allocatable :: send_buf(:), recv_buf(:)
1259+ integer, allocatable :: send_ids(:), recv_ids(:)
1260+ real(wp), allocatable :: send_ft(:,:), recv_ft(:,:)
12601261
12611262 if (num_procs == 1) return
12621263
12631264 buf_size = storage_size(0)/8 + (storage_size(0)/8 + 6*storage_size(0._wp)/8)*size(patch_ib)
12641265 allocate (send_buf(buf_size), recv_buf(buf_size), recv_forces_snap(num_ibs, 3), recv_torques_snap(num_ibs, 3))
1266+ @:ALLOCATE(send_ids(num_ibs), send_ft(num_ibs, 6))
1267+ allocate (recv_ids(size(patch_ib)), recv_ft(size(patch_ib), 6))
12651268
12661269 ! Accumulation phase: propagate contributions toward the high-index corner.
12671270 #:for X, ID in [(' x' , 1), (' y' , 2), (' z' , 3)]
@@ -1276,32 +1279,38 @@ contains
12761279 do k = 1, 2*ib_neighborhood_radius
12771280 ! send forces to +${X}$ neighbor; receive from -${X}$ neighbor. Add received values then
12781281 pack_pos = 0
1279- call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr )
1282+ $:GPU_PARALLEL_LOOP(private= ' [i] ' , copyin= ' [forces, torques] ' )
12801283 do i = 1, num_ibs
1281- call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1282- fval(:) = forces(i,:); tval(:) = torques(i,:)
1283- call MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1284- call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1284+ send_ids(i) = patch_ib(i)%gbl_patch_id
1285+ send_ft(i,1:3) = forces(i,:)
1286+ send_ft(i,4:6) = torques(i,:)
12851287 end do
1288+ $:END_GPU_PARALLEL_LOOP()
1289+ $:GPU_UPDATE(host=' [send_ids, send_ft]' )
1290+ call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1291+ call MPI_PACK(send_ids, num_ibs, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1292+ call MPI_PACK(send_ft, 6*num_ibs, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
12861293 call MPI_SENDRECV(send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, recv_buf, buf_size, MPI_PACKED, &
12871294 & recv_neighbor, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
12881295
12891296 if (recv_neighbor /= MPI_PROC_NULL) then
12901297 unpack_pos = 0
12911298 call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr)
1299+ call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_ids, recv_count, MPI_INTEGER, MPI_COMM_WORLD, ierr)
1300+ call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_ft, 6*recv_count, mpi_p, MPI_COMM_WORLD, ierr)
1301+ ! $:GPU_PARALLEL_LOOP(private=' [i, j]' , copyin=' [recv_ft, recv_ids]' , copy=' [forces, torques,
1302+ ! recv_forces_snap, recv_torques_snap]' )
12921303 do i = 1, recv_count
1293- call MPI_UNPACK(recv_buf, buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr)
1294- call MPI_UNPACK(recv_buf, buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr)
1295- call MPI_UNPACK(recv_buf, buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr)
1296- call s_get_neighborhood_idx(pid, j)
1304+ call s_get_neighborhood_idx(recv_ids(i), j)
12971305 if (j > 0) then
12981306 ! add forces and subtract recv_snap prevent double-counting
1299- forces(j,:) = forces(j,:) + fval(: ) - recv_forces_snap(j,:)
1300- torques(j,:) = torques(j,:) + tval(: ) - recv_torques_snap(j,:)
1301- recv_forces_snap(j,:) = fval(: )
1302- recv_torques_snap(j,:) = tval(: )
1307+ forces(j,:) = forces(j,:) + recv_ft(i,1:3 ) - recv_forces_snap(j,:)
1308+ torques(j,:) = torques(j,:) + recv_ft(i,4:6 ) - recv_torques_snap(j,:)
1309+ recv_forces_snap(j,:) = recv_ft(i,1:3 )
1310+ recv_torques_snap(j,:) = recv_ft(i,4:6 )
13031311 end if
13041312 end do
1313+ ! $:END_GPU_PARALLEL_LOOP()
13051314 end if
13061315 tag = tag + 2
13071316 end do
@@ -1316,35 +1325,40 @@ contains
13161325
13171326 do k = 1, 2*ib_neighborhood_radius
13181327 pack_pos = 0
1319- call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr )
1328+ $:GPU_PARALLEL_LOOP(private= ' [i] ' , copy= ' [send_ids, send_ft] ' , copyin= ' [forces, torques] ' )
13201329 do i = 1, num_ibs
1321- call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1322- fval(:) = forces(i,:); tval(:) = torques(i,:)
1323- call MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1324- call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1330+ send_ids(i) = patch_ib(i)%gbl_patch_id
1331+ send_ft(i,1:3) = forces(i,:)
1332+ send_ft(i,4:6) = torques(i,:)
13251333 end do
1334+ $:END_GPU_PARALLEL_LOOP()
1335+ call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1336+ call MPI_PACK(send_ids, num_ibs, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1337+ call MPI_PACK(send_ft, 6*num_ibs, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
13261338 call MPI_SENDRECV(send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, recv_buf, buf_size, MPI_PACKED, &
13271339 & recv_neighbor, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
13281340 if (recv_neighbor /= MPI_PROC_NULL) then
13291341 unpack_pos = 0
13301342 call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr)
1343+ call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_ids, recv_count, MPI_INTEGER, MPI_COMM_WORLD, ierr)
1344+ call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_ft, 6*recv_count, mpi_p, MPI_COMM_WORLD, ierr)
1345+ ! $:GPU_PARALLEL_LOOP(private=' [i, j]' , copyin=' [recv_ft, recv_ids]' , copy=' [forces, torques]' )
13311346 do i = 1, recv_count
1332- call MPI_UNPACK(recv_buf, buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr)
1333- call MPI_UNPACK(recv_buf, buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr)
1334- call MPI_UNPACK(recv_buf, buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr)
1335- call s_get_neighborhood_idx(pid, j)
1347+ call s_get_neighborhood_idx(recv_ids(i), j)
13361348 if (j > 0) then
1337- forces(j,:) = fval(: )
1338- torques(j,:) = tval(: )
1349+ forces(j,:) = recv_ft(i,1:3 )
1350+ torques(j,:) = recv_ft(i,4:6 )
13391351 end if
13401352 end do
1353+ ! $:END_GPU_PARALLEL_LOOP()
13411354 end if
13421355 tag = tag + 2
13431356 end do
13441357 end if
13451358 #:endfor
13461359
1347- deallocate (send_buf, recv_buf, recv_forces_snap, recv_torques_snap)
1360+ @:DEALLOCATE(send_ids, send_ft)
1361+ deallocate (send_buf, recv_buf, recv_forces_snap, recv_torques_snap, recv_ids, recv_ft)
13481362#endif
13491363
13501364 end subroutine s_communicate_ib_forces
0 commit comments