Skip to content

Commit 21347c1

Browse files
authored
Fix 1D multi-rank MPI_GATHERV bug in post-process silo output (#1138)
* Update m_data_output.fpp * Fix 1D multi-rank MPI_GATHERV undefined behavior in post-process. The 1D paths in s_mpi_gather_spatial_extents and s_mpi_gather_data_extents reused the recvcounts/displs arrays sized for grid defragmentation (m+1 per rank), even though each rank sends only 1 scalar value. This sendcount/recvcounts mismatch is undefined behavior per the MPI standard and caused nondeterministic crashes with Intel MPI, preventing silo files from being written. Replace MPI_GATHERV with MPI_GATHER plus a temporary buffer for the 1D case. The multi-D paths and the 1D defragmentation functions are unchanged.
1 parent 3639574 commit 21347c1

2 files changed

Lines changed: 42 additions & 19 deletions

File tree

src/post_process/m_data_output.fpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ module m_data_output
1111

1212
use m_derived_types ! Definitions of the derived types
1313

14-
use m_global_parameters ! Global parameters for the code
14+
use m_global_parameters ! Global parameters
1515

1616
use m_derived_variables !< Procedures used to compute quantities derived
1717

src/post_process/m_mpi_proxy.fpp

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@ contains
168168

169169
#ifdef MFC_MPI
170170
integer :: ierr !< Generic flag used to identify and report MPI errors
171+
real(wp) :: ext_temp(0:num_procs - 1)
171172

172173
! Simulation is 3D
173174
if (p > 0) then
@@ -273,17 +274,20 @@ contains
273274
! Simulation is 1D
274275
else
275276

277+
! For 1D, recvcounts/displs are sized for grid defragmentation
278+
! (m+1 per rank), not for scalar gathers. Use MPI_GATHER instead.
279+
276280
! Minimum spatial extent in the x-direction
277-
call MPI_GATHERV(minval(x_cb), 1, mpi_p, &
278-
spatial_extents(1, 0), recvcounts, 4*displs, &
279-
mpi_p, 0, MPI_COMM_WORLD, &
280-
ierr)
281+
call MPI_GATHER(minval(x_cb), 1, mpi_p, &
282+
ext_temp, 1, mpi_p, 0, &
283+
MPI_COMM_WORLD, ierr)
284+
if (proc_rank == 0) spatial_extents(1, :) = ext_temp
281285

282286
! Maximum spatial extent in the x-direction
283-
call MPI_GATHERV(maxval(x_cb), 1, mpi_p, &
284-
spatial_extents(2, 0), recvcounts, 4*displs, &
285-
mpi_p, 0, MPI_COMM_WORLD, &
286-
ierr)
287+
call MPI_GATHER(maxval(x_cb), 1, mpi_p, &
288+
ext_temp, 1, mpi_p, 0, &
289+
MPI_COMM_WORLD, ierr)
290+
if (proc_rank == 0) spatial_extents(2, :) = ext_temp
287291
end if
288292

289293
#endif
@@ -339,16 +343,35 @@ contains
339343

340344
#ifdef MFC_MPI
341345
integer :: ierr !< Generic flag used to identify and report MPI errors
342-
343-
! Minimum flow variable extent
344-
call MPI_GATHERV(minval(q_sf), 1, mpi_p, &
345-
data_extents(1, 0), recvcounts, 2*displs, &
346-
mpi_p, 0, MPI_COMM_WORLD, ierr)
347-
348-
! Maximum flow variable extent
349-
call MPI_GATHERV(maxval(q_sf), 1, mpi_p, &
350-
data_extents(2, 0), recvcounts, 2*displs, &
351-
mpi_p, 0, MPI_COMM_WORLD, ierr)
346+
real(wp) :: ext_temp(0:num_procs - 1)
347+
348+
if (n > 0) then
349+
! Multi-D: recvcounts = 1, so strided MPI_GATHERV works correctly
350+
! Minimum flow variable extent
351+
call MPI_GATHERV(minval(q_sf), 1, mpi_p, &
352+
data_extents(1, 0), recvcounts, 2*displs, &
353+
mpi_p, 0, MPI_COMM_WORLD, ierr)
354+
355+
! Maximum flow variable extent
356+
call MPI_GATHERV(maxval(q_sf), 1, mpi_p, &
357+
data_extents(2, 0), recvcounts, 2*displs, &
358+
mpi_p, 0, MPI_COMM_WORLD, ierr)
359+
else
360+
! 1D: recvcounts/displs are sized for grid defragmentation
361+
! (m+1 per rank), not for scalar gathers. Use MPI_GATHER instead.
362+
363+
! Minimum flow variable extent
364+
call MPI_GATHER(minval(q_sf), 1, mpi_p, &
365+
ext_temp, 1, mpi_p, 0, &
366+
MPI_COMM_WORLD, ierr)
367+
if (proc_rank == 0) data_extents(1, :) = ext_temp
368+
369+
! Maximum flow variable extent
370+
call MPI_GATHER(maxval(q_sf), 1, mpi_p, &
371+
ext_temp, 1, mpi_p, 0, &
372+
MPI_COMM_WORLD, ierr)
373+
if (proc_rank == 0) data_extents(2, :) = ext_temp
374+
end if
352375

353376
#endif
354377

0 commit comments

Comments
 (0)