Skip to content

Commit a631916

Browse files
author
Daniel Vickers
committed
CHanges to allow rocm profiling on frontier and to parallelize array packing in force communication
1 parent c8379a0 commit a631916

3 files changed

Lines changed: 63 additions & 34 deletions

File tree

src/simulation/m_ibm.fpp

Lines changed: 41 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1253,15 +1253,18 @@ contains
12531253
12541254
#ifdef MFC_MPI
12551255
integer :: i, j, k, pack_pos, unpack_pos, buf_size, ierr
1256-
integer :: send_neighbor, recv_neighbor, recv_count, pid, tag
1257-
real(wp), dimension(3) :: fval, tval
1256+
integer :: send_neighbor, recv_neighbor, recv_count, tag
12581257
real(wp), allocatable :: recv_forces_snap(:,:), recv_torques_snap(:,:)
12591258
character(len=1), allocatable :: send_buf(:), recv_buf(:)
1259+
integer, allocatable :: send_ids(:), recv_ids(:)
1260+
real(wp), allocatable :: send_ft(:,:), recv_ft(:,:)
12601261
12611262
if (num_procs == 1) return
12621263
12631264
buf_size = storage_size(0)/8 + (storage_size(0)/8 + 6*storage_size(0._wp)/8)*size(patch_ib)
12641265
allocate (send_buf(buf_size), recv_buf(buf_size), recv_forces_snap(num_ibs, 3), recv_torques_snap(num_ibs, 3))
1266+
@:ALLOCATE(send_ids(num_ibs), send_ft(num_ibs, 6))
1267+
allocate (recv_ids(size(patch_ib)), recv_ft(size(patch_ib), 6))
12651268
12661269
! Accumulation phase: propagate contributions toward the high-index corner.
12671270
#:for X, ID in [('x', 1), ('y', 2), ('z', 3)]
@@ -1276,32 +1279,38 @@ contains
12761279
do k = 1, 2*ib_neighborhood_radius
12771280
! send forces to +${X}$ neighbor; receive from -${X}$ neighbor. Add received values then
12781281
pack_pos = 0
1279-
call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1282+
$:GPU_PARALLEL_LOOP(private='[i]', copyin='[forces, torques]')
12801283
do i = 1, num_ibs
1281-
call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1282-
fval(:) = forces(i,:); tval(:) = torques(i,:)
1283-
call MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1284-
call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1284+
send_ids(i) = patch_ib(i)%gbl_patch_id
1285+
send_ft(i,1:3) = forces(i,:)
1286+
send_ft(i,4:6) = torques(i,:)
12851287
end do
1288+
$:END_GPU_PARALLEL_LOOP()
1289+
$:GPU_UPDATE(host='[send_ids, send_ft]')
1290+
call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1291+
call MPI_PACK(send_ids, num_ibs, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1292+
call MPI_PACK(send_ft, 6*num_ibs, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
12861293
call MPI_SENDRECV(send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, recv_buf, buf_size, MPI_PACKED, &
12871294
& recv_neighbor, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
12881295
12891296
if (recv_neighbor /= MPI_PROC_NULL) then
12901297
unpack_pos = 0
12911298
call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr)
1299+
call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_ids, recv_count, MPI_INTEGER, MPI_COMM_WORLD, ierr)
1300+
call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_ft, 6*recv_count, mpi_p, MPI_COMM_WORLD, ierr)
1301+
! $:GPU_PARALLEL_LOOP(private='[i, j]', copyin='[recv_ft, recv_ids]', copy='[forces, torques,
1302+
! recv_forces_snap, recv_torques_snap]')
12921303
do i = 1, recv_count
1293-
call MPI_UNPACK(recv_buf, buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr)
1294-
call MPI_UNPACK(recv_buf, buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr)
1295-
call MPI_UNPACK(recv_buf, buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr)
1296-
call s_get_neighborhood_idx(pid, j)
1304+
call s_get_neighborhood_idx(recv_ids(i), j)
12971305
if (j > 0) then
12981306
! add forces and subtract recv_snap prevent double-counting
1299-
forces(j,:) = forces(j,:) + fval(:) - recv_forces_snap(j,:)
1300-
torques(j,:) = torques(j,:) + tval(:) - recv_torques_snap(j,:)
1301-
recv_forces_snap(j,:) = fval(:)
1302-
recv_torques_snap(j,:) = tval(:)
1307+
forces(j,:) = forces(j,:) + recv_ft(i,1:3) - recv_forces_snap(j,:)
1308+
torques(j,:) = torques(j,:) + recv_ft(i,4:6) - recv_torques_snap(j,:)
1309+
recv_forces_snap(j,:) = recv_ft(i,1:3)
1310+
recv_torques_snap(j,:) = recv_ft(i,4:6)
13031311
end if
13041312
end do
1313+
! $:END_GPU_PARALLEL_LOOP()
13051314
end if
13061315
tag = tag + 2
13071316
end do
@@ -1316,35 +1325,40 @@ contains
13161325
13171326
do k = 1, 2*ib_neighborhood_radius
13181327
pack_pos = 0
1319-
call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1328+
$:GPU_PARALLEL_LOOP(private='[i]', copy='[send_ids, send_ft]', copyin='[forces, torques]')
13201329
do i = 1, num_ibs
1321-
call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1322-
fval(:) = forces(i,:); tval(:) = torques(i,:)
1323-
call MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1324-
call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1330+
send_ids(i) = patch_ib(i)%gbl_patch_id
1331+
send_ft(i,1:3) = forces(i,:)
1332+
send_ft(i,4:6) = torques(i,:)
13251333
end do
1334+
$:END_GPU_PARALLEL_LOOP()
1335+
call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1336+
call MPI_PACK(send_ids, num_ibs, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
1337+
call MPI_PACK(send_ft, 6*num_ibs, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
13261338
call MPI_SENDRECV(send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, recv_buf, buf_size, MPI_PACKED, &
13271339
& recv_neighbor, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
13281340
if (recv_neighbor /= MPI_PROC_NULL) then
13291341
unpack_pos = 0
13301342
call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr)
1343+
call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_ids, recv_count, MPI_INTEGER, MPI_COMM_WORLD, ierr)
1344+
call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_ft, 6*recv_count, mpi_p, MPI_COMM_WORLD, ierr)
1345+
! $:GPU_PARALLEL_LOOP(private='[i, j]', copyin='[recv_ft, recv_ids]', copy='[forces, torques]')
13311346
do i = 1, recv_count
1332-
call MPI_UNPACK(recv_buf, buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr)
1333-
call MPI_UNPACK(recv_buf, buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr)
1334-
call MPI_UNPACK(recv_buf, buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr)
1335-
call s_get_neighborhood_idx(pid, j)
1347+
call s_get_neighborhood_idx(recv_ids(i), j)
13361348
if (j > 0) then
1337-
forces(j,:) = fval(:)
1338-
torques(j,:) = tval(:)
1349+
forces(j,:) = recv_ft(i,1:3)
1350+
torques(j,:) = recv_ft(i,4:6)
13391351
end if
13401352
end do
1353+
! $:END_GPU_PARALLEL_LOOP()
13411354
end if
13421355
tag = tag + 2
13431356
end do
13441357
end if
13451358
#:endfor
13461359
1347-
deallocate (send_buf, recv_buf, recv_forces_snap, recv_torques_snap)
1360+
@:DEALLOCATE(send_ids, send_ft)
1361+
deallocate (send_buf, recv_buf, recv_forces_snap, recv_torques_snap, recv_ids, recv_ft)
13481362
#endif
13491363
13501364
end subroutine s_communicate_ib_forces

toolchain/mfc/run/run.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,13 +52,25 @@ def __profiler_prepend() -> typing.List[str]:
5252

5353
return ["rocprof-compute", "profile", "-n", ARG("name").replace("-", "_").replace(".", "_")] + ARG("rcu") + ["--"]
5454

55-
if ARG("rsys") is not None:
56-
if not does_command_exist("rocprof"):
57-
raise MFCException("Failed to locate [bold red]ROCM rocprof-systems[/bold red] (rocprof-systems).")
55+
return []
5856

59-
return ["rocprof"] + ARG("rsys")
6057

61-
return []
58+
def __rsys_profiler_str() -> str:
59+
if not does_command_exist("rocprof"):
60+
raise MFCException("Failed to locate [bold red]ROCM rocprof-systems[/bold red] (rocprof-systems).")
61+
62+
# Write a wrapper script so $SLURM_PROCID is expanded inside each srun task
63+
# rather than by the calling shell (which would give every rank rank 0's value).
64+
extra = shlex.join(ARG("rsys")) if ARG("rsys") else ""
65+
wrapper_path = os.path.abspath(os.path.join(os.path.dirname(ARG("input")), "rocprof_wrapper.sh"))
66+
wrapper_lines = [
67+
"#!/bin/bash",
68+
"RANK=${SLURM_PROCID:-${FLUX_TASK_RANK:-${OMPI_COMM_WORLD_RANK:-0}}}",
69+
f'exec rocprof -o "rocprof_rank_${{RANK}}.csv" {extra} "$@"',
70+
]
71+
file_write(wrapper_path, "\n".join(wrapper_lines) + "\n")
72+
os.chmod(wrapper_path, 0o755)
73+
return wrapper_path
6274

6375

6476
def get_baked_templates() -> dict:
@@ -111,7 +123,7 @@ def __generate_job_script(targets, case: input.MFCInputFile):
111123
MFC_ROOT_DIR=MFC_ROOT_DIR,
112124
SIMULATION=SIMULATION,
113125
qsystem=queues.get_system(),
114-
profiler=shlex.join(__profiler_prepend()),
126+
profiler=__rsys_profiler_str() if ARG("rsys") is not None else shlex.join(__profiler_prepend()),
115127
gpu_enabled=gpu_enabled,
116128
gpu_acc=gpu_acc,
117129
gpu_mp=gpu_mp,

toolchain/templates/frontier.mako

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,10 @@ ulimit -s unlimited
6969
% if gpu_enabled:
7070
--gpus-per-task 1 --gpu-bind closest \
7171
% endif
72-
${profiler} "${target.get_install_binpath(case)}")
72+
% if target.name == 'simulation':
73+
${profiler} \
74+
% endif
75+
"${target.get_install_binpath(case)}")
7376
% else:
7477
${profiler} "/mnt/bb/$USER/${target.name}")
7578
% endif

0 commit comments

Comments
 (0)