Skip to content

Commit 9b6cc6a

Browse files
committed
fix: resolve nvfortran OpenMP and AMD HLLD GPU regressions on refactor/derived-types
- grid_axis: change allocatable members to pointer (=> null()), matching scalar_field pattern; nvfortran OpenMP requires pointer members for correct device descriptor attachment via map(alloc:) - Add ACC_SETUP_grid_axis macro (macros.fpp) for explicit GPU pointer setup on Cray/LLVMFlang, following ACC_SETUP_SFs/VFs pattern - Add OMP_AMD_ATTACH_FIX_1D (omp_macros.fpp) for rank-1 real(wp) pointer attachment on AMD AFAR; OMP_AMD_ATTACH_FIX is rank-3 real(stp) only - Unify GPU_DECLARE for x/y/z to single [x, y, z] form for both OpenACC and OpenMP (previously OpenACC used member-level x%cb/cc/spacing form) - HLLD solver: replace type(riemann_states_arr7) with plain real(wp), dimension(7) flat arrays; AMD GPU compiler miscompiles array expressions on private struct member arrays in parallel loops - Remove unused riemann_states_arr7 type from m_derived_types.fpp
1 parent 48d4df1 commit 9b6cc6a

5 files changed

Lines changed: 133 additions & 92 deletions

File tree

src/common/include/macros.fpp

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,35 @@
126126
#endif
127127
#:enddef
128128
129-
! Cray-specific GPU pointer setup for acoustic source spatials
129+
! GPU pointer setup for grid_axis structs (Cray and bare LLVMFlang OpenMP target, e.g. AMD AFAR).
130+
! For OpenACC on Cray, copyin of the struct is sufficient (OpenACC handles pointer attachment automatically).
131+
! For OpenMP, explicit pointer attach is needed: standard attach clause on Cray, OMP_AMD_ATTACH_FIX_1D on AMD
132+
! (OMP_AMD_ATTACH_FIX is rank-3 stp only; grid_axis members are rank-1 wp).
133+
#:def ACC_SETUP_grid_axis(*args)
134+
#if defined(_CRAYFTN) || defined(MFC_LLVMFlang)
135+
block
136+
@:LOG({'@:ACC_SETUP_grid_axis(${', '.join(args)}$)'})
137+
138+
#:for arg in args
139+
$:GPU_ENTER_DATA(copyin=('[' + arg + ']'))
140+
#:for member in ['cb', 'cc', 'spacing']
141+
if (associated(${arg}$%${member}$)) then
142+
$:GPU_ENTER_DATA(copyin=('[' + arg + '%' + member + ']'))
143+
#if defined(MFC_OpenMP)
144+
#:if USING_AMD
145+
$:OMP_AMD_ATTACH_FIX_1D('[' + arg + '%' + member + ']')
146+
#:else
147+
$:GPU_ENTER_DATA(attach=('[' + arg + '%' + member + ']'))
148+
#:endif
149+
#endif
150+
end if
151+
#:endfor
152+
#:endfor
153+
end block
154+
#endif
155+
#:enddef
156+
157+
! GPU pointer setup for acoustic source spatials (Cray only - source_spatial pointer members are not used on AMD/LLVMFlang paths)
130158
#:def ACC_SETUP_source_spatials(*args)
131159
#ifdef _CRAYFTN
132160
block

src/common/include/omp_macros.fpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,4 +427,40 @@
427427
#:endfor
428428
#:endif
429429
#:enddef
430+
! AMD AFAR (LLVMFlang) does not support the OpenMP 5.1 attach() clause on target enter data.
431+
! This macro performs pointer attachment manually for rank-1 real(wp) pointer arrays (e.g. grid_axis
432+
! members cb/cc/spacing): it gets the device address of the already-mapped array data, then in a small
433+
! target region reassigns the Fortran POINTER member in the device-side struct to that device address.
434+
! Counterpart to OMP_AMD_ATTACH_FIX (which is rank-3 real(stp) only).
435+
#:def OMP_AMD_ATTACH_FIX_1D(attach)
436+
#:if attach is not None
437+
#:set clause_regex = re.compile(',(?![^(]*\\))')
438+
#:set attach_str = re.sub(clause_regex, ';', attach.strip('[]'))
439+
#:set attach_list = [x.strip() for x in attach_str.split(';')]
440+
#:for var_expr in attach_list
441+
block
442+
use m_precision_select, only: wp
443+
use iso_c_binding, only: c_ptr, c_loc, c_associated, c_f_pointer
444+
use omp_lib, only: omp_get_mapped_ptr, omp_get_default_device
445+
type(c_ptr) :: amd_dev_ptr
446+
integer :: amd_lb1, amd_n1
447+
if (associated(${var_expr}$)) then
448+
amd_lb1 = lbound(${var_expr}$, 1)
449+
amd_n1 = size(${var_expr}$, 1)
450+
amd_dev_ptr = omp_get_mapped_ptr(c_loc(${var_expr}$), omp_get_default_device())
451+
if (c_associated(amd_dev_ptr)) then
452+
!$omp target firstprivate(amd_dev_ptr, amd_lb1, amd_n1)
453+
block
454+
use m_precision_select, only: wp
455+
real(wp), pointer :: amd_1d_view(:)
456+
call c_f_pointer(amd_dev_ptr, amd_1d_view, [amd_n1])
457+
${var_expr}$(amd_lb1:) => amd_1d_view
458+
end block
459+
!$omp end target
460+
end if
461+
end if
462+
end block
463+
#:endfor
464+
#:endif
465+
#:enddef
430466
! New line at end of file is required for FYPP

src/common/m_derived_types.fpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,12 @@ module m_derived_types
1313

1414
implicit none
1515

16-
!> Derived type for a single spatial grid axis: cell-boundary, cell-center, per-cell spacing arrays, and minimum spacing scalar
16+
!> Derived type for a single spatial grid axis: cell-boundary, cell-center, per-cell spacing pointer arrays, and minimum spacing
17+
!! scalar. GPU pointer attachment is handled by @:ACC_SETUP_grid_axis for Cray/LLVMFlang builds. Note: spacing is not allocated
18+
!! in pre_process (only cb and cc are); guard spacing accesses with #ifndef MFC_PRE_PROCESS.
1719
type grid_axis
18-
real(wp), pointer, dimension(:) :: cb => null(), cc => null(), spacing => null()
20+
real(wp), pointer, dimension(:) :: cb => null(), cc => null()
21+
real(wp), pointer, dimension(:) :: spacing => null() !< Not allocated in pre_process
1922
real(wp) :: min_spacing = 0._wp
2023
end type grid_axis
2124

@@ -104,7 +107,7 @@ module m_derived_types
104107
end type riemann_states_vec3
105108

106109
!> Left and right Riemann states for fixed-size arrays
107-
#:for n in [2, 6, 7]
110+
#:for n in [2, 6]
108111
type riemann_states_arr${n}$
109112
real(wp) :: L(${n}$), R(${n}$)
110113
end type riemann_states_arr${n}$

src/simulation/m_global_parameters.fpp

Lines changed: 26 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -48,19 +48,12 @@ module m_global_parameters
4848
!> @}
4949
$:GPU_DECLARE(create='[cyl_coord, grid_geometry]')
5050

51-
!> @name Cell-boundary (cb), cell-center (cc), and spacing arrays per direction
51+
!> @name Grid axis structs for x, y, z: cell-boundary (cb), cell-center (cc), and spacing pointer arrays. GPU pointer attachment
52+
!! is performed by @:ACC_SETUP_grid_axis after allocation (Cray/LLVMFlang builds).
5253
!> @{
5354
type(grid_axis) :: x, y, z
5455
!> @}
5556

56-
!> @name Flat GPU-accessible aliases for grid arrays (used in GPU kernels)
57-
!> @{
58-
real(wp), allocatable, target :: dx(:), dy(:), dz(:)
59-
real(wp), allocatable, target :: x_cc(:), y_cc(:), z_cc(:)
60-
real(wp), allocatable, target :: x_cb(:), y_cb(:), z_cb(:)
61-
!> @}
62-
$:GPU_DECLARE(create='[dx, dy, dz, x_cc, y_cc, z_cc, x_cb, y_cb, z_cb]')
63-
6457
real(wp) :: dt !< Size of the time-step
6558
$:GPU_DECLARE(create='[x, y, z, dt, m, n, p]')
6659

@@ -1220,43 +1213,31 @@ contains
12201213
$:GPU_UPDATE(device='[relax, relax_model, palpha_eps, ptgalpha_eps]')
12211214
12221215
! Allocating grid variables for the x-, y- and z-directions
1223-
@:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
1224-
@:ALLOCATE(x_cc(-buff_size:m + buff_size))
1225-
@:ALLOCATE(dx(-buff_size:m + buff_size))
1226-
@:PREFER_GPU(x_cb)
1227-
@:PREFER_GPU(x_cc)
1228-
@:PREFER_GPU(dx)
1229-
x%cb => x_cb; x%cc => x_cc; x%spacing => dx
1230-
$:GPU_ENTER_DATA(attach='[x%cb, x%cc, x%spacing]')
1231-
#:call GPU_PARALLEL(default=None)
1232-
x%cb => x_cb; x%cc => x_cc; x%spacing => dx
1233-
#:endcall GPU_PARALLEL
1216+
@:ALLOCATE(x%cb(-1 - buff_size:m + buff_size))
1217+
@:ALLOCATE(x%cc(-buff_size:m + buff_size))
1218+
@:ALLOCATE(x%spacing(-buff_size:m + buff_size))
1219+
@:PREFER_GPU(x%cb)
1220+
@:PREFER_GPU(x%cc)
1221+
@:PREFER_GPU(x%spacing)
1222+
@:ACC_SETUP_grid_axis(x)
12341223
12351224
if (n == 0) return
1236-
@:ALLOCATE(y_cb(-1 - buff_size:n + buff_size))
1237-
@:ALLOCATE(y_cc(-buff_size:n + buff_size))
1238-
@:ALLOCATE(dy(-buff_size:n + buff_size))
1239-
@:PREFER_GPU(y_cb)
1240-
@:PREFER_GPU(y_cc)
1241-
@:PREFER_GPU(dy)
1242-
y%cb => y_cb; y%cc => y_cc; y%spacing => dy
1243-
$:GPU_ENTER_DATA(attach='[y%cb, y%cc, y%spacing]')
1244-
#:call GPU_PARALLEL(default=None)
1245-
y%cb => y_cb; y%cc => y_cc; y%spacing => dy
1246-
#:endcall GPU_PARALLEL
1225+
@:ALLOCATE(y%cb(-1 - buff_size:n + buff_size))
1226+
@:ALLOCATE(y%cc(-buff_size:n + buff_size))
1227+
@:ALLOCATE(y%spacing(-buff_size:n + buff_size))
1228+
@:PREFER_GPU(y%cb)
1229+
@:PREFER_GPU(y%cc)
1230+
@:PREFER_GPU(y%spacing)
1231+
@:ACC_SETUP_grid_axis(y)
12471232
12481233
if (p == 0) return
1249-
@:ALLOCATE(z_cb(-1 - buff_size:p + buff_size))
1250-
@:ALLOCATE(z_cc(-buff_size:p + buff_size))
1251-
@:ALLOCATE(dz(-buff_size:p + buff_size))
1252-
@:PREFER_GPU(z_cb)
1253-
@:PREFER_GPU(z_cc)
1254-
@:PREFER_GPU(dz)
1255-
z%cb => z_cb; z%cc => z_cc; z%spacing => dz
1256-
$:GPU_ENTER_DATA(attach='[z%cb, z%cc, z%spacing]')
1257-
#:call GPU_PARALLEL(default=None)
1258-
z%cb => z_cb; z%cc => z_cc; z%spacing => dz
1259-
#:endcall GPU_PARALLEL
1234+
@:ALLOCATE(z%cb(-1 - buff_size:p + buff_size))
1235+
@:ALLOCATE(z%cc(-buff_size:p + buff_size))
1236+
@:ALLOCATE(z%spacing(-buff_size:p + buff_size))
1237+
@:PREFER_GPU(z%cb)
1238+
@:PREFER_GPU(z%cc)
1239+
@:PREFER_GPU(z%spacing)
1240+
@:ACC_SETUP_grid_axis(z)
12601241
12611242
end subroutine s_initialize_global_parameters_module
12621243
@@ -1337,19 +1318,13 @@ contains
13371318
if (ib) MPI_IO_IB_DATA%var%sf => null()
13381319
13391320
! Deallocating grid variables for the x-, y- and z-directions
1340-
$:GPU_EXIT_DATA(detach='[x%cb, x%cc, x%spacing]')
1341-
nullify (x%cb, x%cc, x%spacing)
1342-
@:DEALLOCATE(x_cb, x_cc, dx)
1321+
@:DEALLOCATE(x%cb, x%cc, x%spacing)
13431322
13441323
if (n == 0) return
1345-
$:GPU_EXIT_DATA(detach='[y%cb, y%cc, y%spacing]')
1346-
nullify (y%cb, y%cc, y%spacing)
1347-
@:DEALLOCATE(y_cb, y_cc, dy)
1324+
@:DEALLOCATE(y%cb, y%cc, y%spacing)
13481325
13491326
if (p == 0) return
1350-
$:GPU_EXIT_DATA(detach='[z%cb, z%cc, z%spacing]')
1351-
nullify (z%cb, z%cc, z%spacing)
1352-
@:DEALLOCATE(z_cb, z_cc, dz)
1327+
@:DEALLOCATE(z%cb, z%cc, z%spacing)
13531328
13541329
end subroutine s_finalize_global_parameters_module
13551330

src/simulation/m_riemann_solvers.fpp

Lines changed: 36 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -3301,15 +3301,14 @@ contains
33013301
type(riemann_states) :: c, c_fast, pres_mag
33023302

33033303
! HLLD speeds and intermediate state variables:
3304-
type(riemann_states) :: s, pTot
3305-
real(wp) :: p_star, s_M, s_starL, s_starR, denom_ds, sign_Bx
3306-
type(riemann_states) :: rho_star, E_star, v_star, w_star, sqrt_rho_star, E_double_lr
3307-
type(riemann_states_arr7) :: U, U_star, U_double, F, F_star
3308-
real(wp), dimension(7) :: F_hlld
3304+
type(riemann_states) :: s, pTot
3305+
real(wp) :: p_star, s_M, s_starL, s_starR, denom_ds, sign_Bx
3306+
type(riemann_states) :: rho_star, E_star, v_star, w_star, sqrt_rho_star, E_double_lr
3307+
real(wp), dimension(7) :: U_L, U_R, U_starL, U_starR, U_doubleL, U_doubleR
3308+
real(wp), dimension(7) :: F_L, F_R, F_starL, F_starR, F_hlld
33093309

3310-
! Indices for U and F: (rho, rho*vel(1), rho*vel(2), rho*vel(3), By, Bz, E) Note: vel and B are permutated, so vel(1) is the
3311-
! normal velocity, and x is the normal direction Note: Bx is omitted as the magnetic flux is always zero in the normal
3312-
! direction
3310+
! Indices for U and F: (rho, rho*vel(1), rho*vel(2), rho*vel(3), By, Bz, E). vel and B are permuted by dir_idx so vel(1) is
3311+
! always the normal velocity. Bx is omitted as the normal magnetic flux is always zero.
33133312

33143313
real(wp) :: v_double, w_double, By_double, Bz_double, E_double
33153314
integer :: i, j, k, l
@@ -3327,10 +3326,10 @@ contains
33273326
#:set SF = lambda offs: COORDS.format(STENCIL_IDX = SV + offs)
33283327
if (norm_dir == ${NORM_DIR}$) then
33293328
$:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, alpha_rho_R, vel, alpha_L, alpha_R, rho, pres, E, &
3330-
& H_no_mag, gamma, pi_inf, qv, vel_rms, B, c, c_fast, pres_mag, U, U_star, U_double, F, &
3331-
& F_star, F_hlld, s, s_M, s_starL, s_starR, pTot, p_star, rho_star, E_star, sqrt_rho_star, &
3332-
& denom_ds, sign_Bx, v_star, w_star, v_double, w_double, By_double, Bz_double, E_double_lr, &
3333-
& E_double]', copyin='[norm_dir]')
3329+
& H_no_mag, gamma, pi_inf, qv, vel_rms, B, c, c_fast, pres_mag, U_L, U_R, U_starL, U_starR, &
3330+
& U_doubleL, U_doubleR, F_L, F_R, F_starL, F_starR, F_hlld, s, s_M, s_starL, s_starR, pTot, &
3331+
& p_star, rho_star, E_star, sqrt_rho_star, denom_ds, sign_Bx, v_star, w_star, v_double, &
3332+
& w_double, By_double, Bz_double, E_double_lr, E_double]', copyin='[norm_dir]')
33343333
do l = ${Z_BND}$%beg, ${Z_BND}$%end
33353334
do k = ${Y_BND}$%beg, ${Y_BND}$%end
33363335
do j = ${X_BND}$%beg, ${X_BND}$%end
@@ -3424,26 +3423,26 @@ contains
34243423
E_star%R = ((s%R - vel%R(1))*E%R - pTot%R*vel%R(1) + p_star*s_M)/(s%R - s_M)
34253424

34263425
! (5) Compute left/right state vectors and fluxes
3427-
U%L = [rho%L, rho%L*vel%L(1:3), B%L(2:3), E%L]
3428-
U_star%L = [rho_star%L, rho_star%L*s_M, rho_star%L*vel%L(2:3), B%L(2:3), E_star%L]
3429-
U%R = [rho%R, rho%R*vel%R(1:3), B%R(2:3), E%R]
3430-
U_star%R = [rho_star%R, rho_star%R*s_M, rho_star%R*vel%R(2:3), B%R(2:3), E_star%R]
3426+
U_L = [rho%L, rho%L*vel%L(1:3), B%L(2:3), E%L]
3427+
U_starL = [rho_star%L, rho_star%L*s_M, rho_star%L*vel%L(2:3), B%L(2:3), E_star%L]
3428+
U_R = [rho%R, rho%R*vel%R(1:3), B%R(2:3), E%R]
3429+
U_starR = [rho_star%R, rho_star%R*s_M, rho_star%R*vel%R(2:3), B%R(2:3), E_star%R]
34313430

34323431
! Compute the left/right fluxes
3433-
F%L(1) = U%L(2)
3434-
F%L(2) = U%L(2)*vel%L(1) - B%L(1)*B%L(1) + pTot%L
3435-
F%L(3:4) = U%L(2)*vel%L(2:3) - B%L(1)*B%L(2:3)
3436-
F%L(5:6) = vel%L(1)*B%L(2:3) - vel%L(2:3)*B%L(1)
3437-
F%L(7) = (E%L + pTot%L)*vel%L(1) - B%L(1)*(vel%L(1)*B%L(1) + vel%L(2)*B%L(2) + vel%L(3)*B%L(3))
3438-
3439-
F%R(1) = U%R(2)
3440-
F%R(2) = U%R(2)*vel%R(1) - B%R(1)*B%R(1) + pTot%R
3441-
F%R(3:4) = U%R(2)*vel%R(2:3) - B%R(1)*B%R(2:3)
3442-
F%R(5:6) = vel%R(1)*B%R(2:3) - vel%R(2:3)*B%R(1)
3443-
F%R(7) = (E%R + pTot%R)*vel%R(1) - B%R(1)*(vel%R(1)*B%R(1) + vel%R(2)*B%R(2) + vel%R(3)*B%R(3))
3432+
F_L(1) = U_L(2)
3433+
F_L(2) = U_L(2)*vel%L(1) - B%L(1)*B%L(1) + pTot%L
3434+
F_L(3:4) = U_L(2)*vel%L(2:3) - B%L(1)*B%L(2:3)
3435+
F_L(5:6) = vel%L(1)*B%L(2:3) - vel%L(2:3)*B%L(1)
3436+
F_L(7) = (E%L + pTot%L)*vel%L(1) - B%L(1)*(vel%L(1)*B%L(1) + vel%L(2)*B%L(2) + vel%L(3)*B%L(3))
3437+
3438+
F_R(1) = U_R(2)
3439+
F_R(2) = U_R(2)*vel%R(1) - B%R(1)*B%R(1) + pTot%R
3440+
F_R(3:4) = U_R(2)*vel%R(2:3) - B%R(1)*B%R(2:3)
3441+
F_R(5:6) = vel%R(1)*B%R(2:3) - vel%R(2:3)*B%R(1)
3442+
F_R(7) = (E%R + pTot%R)*vel%R(1) - B%R(1)*(vel%R(1)*B%R(1) + vel%R(2)*B%R(2) + vel%R(3)*B%R(3))
34443443
! HLLD star-state fluxes via HLL jump relation
3445-
F_star%L = F%L + s%L*(U_star%L - U%L)
3446-
F_star%R = F%R + s%R*(U_star%R - U%R)
3444+
F_starL = F_L + s%L*(U_starL - U_L)
3445+
F_starR = F_R + s%R*(U_starR - U_R)
34473446
! Alfven wave speeds bounding the rotational discontinuities
34483447
s_starL = s_M - abs(B%L(1))/sqrt(rho_star%L)
34493448
s_starR = s_M + abs(B%L(1))/sqrt(rho_star%R)
@@ -3468,24 +3467,24 @@ contains
34683467
& + w_double*Bz_double))*sign_Bx
34693468
E_double = 0.5_wp*(E_double_lr%L + E_double_lr%R)
34703469

3471-
U_double%L = [rho_star%L, rho_star%L*s_M, rho_star%L*v_double, rho_star%L*w_double, By_double, &
3470+
U_doubleL = [rho_star%L, rho_star%L*s_M, rho_star%L*v_double, rho_star%L*w_double, By_double, &
34723471
& Bz_double, E_double]
3473-
U_double%R = [rho_star%R, rho_star%R*s_M, rho_star%R*v_double, rho_star%R*w_double, By_double, &
3472+
U_doubleR = [rho_star%R, rho_star%R*s_M, rho_star%R*v_double, rho_star%R*w_double, By_double, &
34743473
& Bz_double, E_double]
34753474

34763475
! Select HLLD flux region
34773476
if (0.0_wp <= s%L) then
3478-
F_hlld = F%L
3477+
F_hlld = F_L
34793478
else if (0.0_wp <= s_starL) then
3480-
F_hlld = F%L + s%L*(U_star%L - U%L)
3479+
F_hlld = F_L + s%L*(U_starL - U_L)
34813480
else if (0.0_wp <= s_M) then
3482-
F_hlld = F_star%L + s_starL*(U_double%L - U_star%L)
3481+
F_hlld = F_starL + s_starL*(U_doubleL - U_starL)
34833482
else if (0.0_wp <= s_starR) then
3484-
F_hlld = F_star%R + s_starR*(U_double%R - U_star%R)
3483+
F_hlld = F_starR + s_starR*(U_doubleR - U_starR)
34853484
else if (0.0_wp <= s%R) then
3486-
F_hlld = F%R + s%R*(U_star%R - U%R)
3485+
F_hlld = F_R + s%R*(U_starR - U_R)
34873486
else
3488-
F_hlld = F%R
3487+
F_hlld = F_R
34893488
end if
34903489

34913490
! (12) Write HLLD flux to output arrays

0 commit comments

Comments
 (0)