fix: resolve nvfortran OpenMP and AMD HLLD GPU regressions on refactor/derived-types

sbryngelson · sbryngelson · commit 9b6cc6a332fc · 2026-05-18T12:53:14.000-05:00
- grid_axis: change allocatable members to pointer (=> null()), matching
  scalar_field pattern; nvfortran OpenMP requires pointer members for
  correct device descriptor attachment via map(alloc:)
- Add ACC_SETUP_grid_axis macro (macros.fpp) for explicit GPU pointer
  setup on Cray/LLVMFlang, following ACC_SETUP_SFs/VFs pattern
- Add OMP_AMD_ATTACH_FIX_1D (omp_macros.fpp) for rank-1 real(wp) pointer
  attachment on AMD AFAR; OMP_AMD_ATTACH_FIX is rank-3 real(stp) only
- Unify GPU_DECLARE for x/y/z to single [x, y, z] form for both OpenACC
  and OpenMP (previously OpenACC used member-level x%cb/cc/spacing form)
- HLLD solver: replace type(riemann_states_arr7) with plain
  real(wp), dimension(7) flat arrays; AMD GPU compiler miscompiles
  array expressions on private struct member arrays in parallel loops
- Remove unused riemann_states_arr7 type from m_derived_types.fpp
diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
@@ -126,7 +126,35 @@
 #endif
 #:enddef
 
-! Cray-specific GPU pointer setup for acoustic source spatials
+! GPU pointer setup for grid_axis structs (Cray and bare LLVMFlang OpenMP target, e.g. AMD AFAR).
+! For OpenACC on Cray, copyin of the struct is sufficient (OpenACC handles pointer attachment automatically).
+! For OpenMP, explicit pointer attach is needed: standard attach clause on Cray, OMP_AMD_ATTACH_FIX_1D on AMD
+! (OMP_AMD_ATTACH_FIX is rank-3 stp only; grid_axis members are rank-1 wp).
+#:def ACC_SETUP_grid_axis(*args)
+#if defined(_CRAYFTN) || defined(MFC_LLVMFlang)
+    block
+        @:LOG({'@:ACC_SETUP_grid_axis(${', '.join(args)}$)'})
+
+        #:for arg in args
+            $:GPU_ENTER_DATA(copyin=('[' + arg + ']'))
+            #:for member in ['cb', 'cc', 'spacing']
+                if (associated(${arg}$%${member}$)) then
+                    $:GPU_ENTER_DATA(copyin=('[' + arg + '%' + member + ']'))
+#if defined(MFC_OpenMP)
+                    #:if USING_AMD
+                        $:OMP_AMD_ATTACH_FIX_1D('[' + arg + '%' + member + ']')
+                    #:else
+                        $:GPU_ENTER_DATA(attach=('[' + arg + '%' + member + ']'))
+                    #:endif
+#endif
+                end if
+            #:endfor
+        #:endfor
+    end block
+#endif
+#:enddef
+
+! GPU pointer setup for acoustic source spatials (Cray only - source_spatial pointer members are not used on AMD/LLVMFlang paths)
 #:def ACC_SETUP_source_spatials(*args)
 #ifdef _CRAYFTN
     block
diff --git a/src/common/include/omp_macros.fpp b/src/common/include/omp_macros.fpp
@@ -427,4 +427,40 @@
         #:endfor
     #:endif
 #:enddef
+! AMD AFAR (LLVMFlang) does not support the OpenMP 5.1 attach() clause on target enter data.
+! This macro performs pointer attachment manually for rank-1 real(wp) pointer arrays (e.g. grid_axis
+! members cb/cc/spacing): it gets the device address of the already-mapped array data, then in a small
+! target region reassigns the Fortran POINTER member in the device-side struct to that device address.
+! Counterpart to OMP_AMD_ATTACH_FIX (which is rank-3 real(stp) only).
+#:def OMP_AMD_ATTACH_FIX_1D(attach)
+    #:if attach is not None
+        #:set clause_regex = re.compile(',(?![^(]*\\))')
+        #:set attach_str = re.sub(clause_regex, ';', attach.strip('[]'))
+        #:set attach_list = [x.strip() for x in attach_str.split(';')]
+        #:for var_expr in attach_list
+            block
+                use m_precision_select, only: wp
+                use iso_c_binding, only: c_ptr, c_loc, c_associated, c_f_pointer
+                use omp_lib, only: omp_get_mapped_ptr, omp_get_default_device
+                type(c_ptr) :: amd_dev_ptr
+                integer     :: amd_lb1, amd_n1
+                if (associated(${var_expr}$)) then
+                    amd_lb1 = lbound(${var_expr}$, 1)
+                    amd_n1 = size(${var_expr}$, 1)
+                    amd_dev_ptr = omp_get_mapped_ptr(c_loc(${var_expr}$), omp_get_default_device())
+                    if (c_associated(amd_dev_ptr)) then
+                        !$omp target firstprivate(amd_dev_ptr, amd_lb1, amd_n1)
+                        block
+                            use m_precision_select, only: wp
+                            real(wp), pointer :: amd_1d_view(:)
+                            call c_f_pointer(amd_dev_ptr, amd_1d_view, [amd_n1])
+                            ${var_expr}$(amd_lb1:) => amd_1d_view
+                        end block
+                        !$omp end target
+                    end if
+                end if
+            end block
+        #:endfor
+    #:endif
+#:enddef
 ! New line at end of file is required for FYPP
diff --git a/src/common/m_derived_types.fpp b/src/common/m_derived_types.fpp
@@ -13,9 +13,12 @@ module m_derived_types
 
     implicit none
 
-    !> Derived type for a single spatial grid axis: cell-boundary, cell-center, per-cell spacing arrays, and minimum spacing scalar
+    !> Derived type for a single spatial grid axis: cell-boundary, cell-center, per-cell spacing pointer arrays, and minimum spacing
+    !! scalar. GPU pointer attachment is handled by @:ACC_SETUP_grid_axis for Cray/LLVMFlang builds. Note: spacing is not allocated
+    !! in pre_process (only cb and cc are); guard spacing accesses with #ifndef MFC_PRE_PROCESS.
     type grid_axis
-        real(wp), pointer, dimension(:) :: cb => null(), cc => null(), spacing => null()
+        real(wp), pointer, dimension(:) :: cb => null(), cc => null()
+        real(wp), pointer, dimension(:) :: spacing => null()  !< Not allocated in pre_process
         real(wp)                        :: min_spacing = 0._wp
     end type grid_axis
 
@@ -104,7 +107,7 @@ module m_derived_types
     end type riemann_states_vec3
 
     !> Left and right Riemann states for fixed-size arrays
-    #:for n in [2, 6, 7]
+    #:for n in [2, 6]
         type riemann_states_arr${n}$
             real(wp) :: L(${n}$), R(${n}$)
         end type riemann_states_arr${n}$
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
@@ -48,19 +48,12 @@ module m_global_parameters
     !> @}
     $:GPU_DECLARE(create='[cyl_coord, grid_geometry]')
 
-    !> @name Cell-boundary (cb), cell-center (cc), and spacing arrays per direction
+    !> @name Grid axis structs for x, y, z: cell-boundary (cb), cell-center (cc), and spacing pointer arrays. GPU pointer attachment
+    !! is performed by @:ACC_SETUP_grid_axis after allocation (Cray/LLVMFlang builds).
     !> @{
     type(grid_axis) :: x, y, z
     !> @}
 
-    !> @name Flat GPU-accessible aliases for grid arrays (used in GPU kernels)
-    !> @{
-    real(wp), allocatable, target :: dx(:), dy(:), dz(:)
-    real(wp), allocatable, target :: x_cc(:), y_cc(:), z_cc(:)
-    real(wp), allocatable, target :: x_cb(:), y_cb(:), z_cb(:)
-    !> @}
-    $:GPU_DECLARE(create='[dx, dy, dz, x_cc, y_cc, z_cc, x_cb, y_cb, z_cb]')
-
     real(wp) :: dt  !< Size of the time-step
     $:GPU_DECLARE(create='[x, y, z, dt, m, n, p]')
 
@@ -1220,43 +1213,31 @@ contains
         $:GPU_UPDATE(device='[relax, relax_model, palpha_eps, ptgalpha_eps]')
 
         ! Allocating grid variables for the x-, y- and z-directions
-        @:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
-        @:ALLOCATE(x_cc(-buff_size:m + buff_size))
-        @:ALLOCATE(dx(-buff_size:m + buff_size))
-        @:PREFER_GPU(x_cb)
-        @:PREFER_GPU(x_cc)
-        @:PREFER_GPU(dx)
-        x%cb => x_cb; x%cc => x_cc; x%spacing => dx
-        $:GPU_ENTER_DATA(attach='[x%cb, x%cc, x%spacing]')
-        #:call GPU_PARALLEL(default=None)
-            x%cb => x_cb; x%cc => x_cc; x%spacing => dx
-        #:endcall GPU_PARALLEL
+        @:ALLOCATE(x%cb(-1 - buff_size:m + buff_size))
+        @:ALLOCATE(x%cc(-buff_size:m + buff_size))
+        @:ALLOCATE(x%spacing(-buff_size:m + buff_size))
+        @:PREFER_GPU(x%cb)
+        @:PREFER_GPU(x%cc)
+        @:PREFER_GPU(x%spacing)
+        @:ACC_SETUP_grid_axis(x)
 
         if (n == 0) return
-        @:ALLOCATE(y_cb(-1 - buff_size:n + buff_size))
-        @:ALLOCATE(y_cc(-buff_size:n + buff_size))
-        @:ALLOCATE(dy(-buff_size:n + buff_size))
-        @:PREFER_GPU(y_cb)
-        @:PREFER_GPU(y_cc)
-        @:PREFER_GPU(dy)
-        y%cb => y_cb; y%cc => y_cc; y%spacing => dy
-        $:GPU_ENTER_DATA(attach='[y%cb, y%cc, y%spacing]')
-        #:call GPU_PARALLEL(default=None)
-            y%cb => y_cb; y%cc => y_cc; y%spacing => dy
-        #:endcall GPU_PARALLEL
+        @:ALLOCATE(y%cb(-1 - buff_size:n + buff_size))
+        @:ALLOCATE(y%cc(-buff_size:n + buff_size))
+        @:ALLOCATE(y%spacing(-buff_size:n + buff_size))
+        @:PREFER_GPU(y%cb)
+        @:PREFER_GPU(y%cc)
+        @:PREFER_GPU(y%spacing)
+        @:ACC_SETUP_grid_axis(y)
 
         if (p == 0) return
-        @:ALLOCATE(z_cb(-1 - buff_size:p + buff_size))
-        @:ALLOCATE(z_cc(-buff_size:p + buff_size))
-        @:ALLOCATE(dz(-buff_size:p + buff_size))
-        @:PREFER_GPU(z_cb)
-        @:PREFER_GPU(z_cc)
-        @:PREFER_GPU(dz)
-        z%cb => z_cb; z%cc => z_cc; z%spacing => dz
-        $:GPU_ENTER_DATA(attach='[z%cb, z%cc, z%spacing]')
-        #:call GPU_PARALLEL(default=None)
-            z%cb => z_cb; z%cc => z_cc; z%spacing => dz
-        #:endcall GPU_PARALLEL
+        @:ALLOCATE(z%cb(-1 - buff_size:p + buff_size))
+        @:ALLOCATE(z%cc(-buff_size:p + buff_size))
+        @:ALLOCATE(z%spacing(-buff_size:p + buff_size))
+        @:PREFER_GPU(z%cb)
+        @:PREFER_GPU(z%cc)
+        @:PREFER_GPU(z%spacing)
+        @:ACC_SETUP_grid_axis(z)
 
     end subroutine s_initialize_global_parameters_module
 
@@ -1337,19 +1318,13 @@ contains
         if (ib) MPI_IO_IB_DATA%var%sf => null()
 
         ! Deallocating grid variables for the x-, y- and z-directions
-        $:GPU_EXIT_DATA(detach='[x%cb, x%cc, x%spacing]')
-        nullify (x%cb, x%cc, x%spacing)
-        @:DEALLOCATE(x_cb, x_cc, dx)
+        @:DEALLOCATE(x%cb, x%cc, x%spacing)
 
         if (n == 0) return
-        $:GPU_EXIT_DATA(detach='[y%cb, y%cc, y%spacing]')
-        nullify (y%cb, y%cc, y%spacing)
-        @:DEALLOCATE(y_cb, y_cc, dy)
+        @:DEALLOCATE(y%cb, y%cc, y%spacing)
 
         if (p == 0) return
-        $:GPU_EXIT_DATA(detach='[z%cb, z%cc, z%spacing]')
-        nullify (z%cb, z%cc, z%spacing)
-        @:DEALLOCATE(z_cb, z_cc, dz)
+        @:DEALLOCATE(z%cb, z%cc, z%spacing)
 
     end subroutine s_finalize_global_parameters_module
 
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
@@ -3301,15 +3301,14 @@ contains
         type(riemann_states)      :: c, c_fast, pres_mag
 
         ! HLLD speeds and intermediate state variables:
-        type(riemann_states)      :: s, pTot
-        real(wp)                  :: p_star, s_M, s_starL, s_starR, denom_ds, sign_Bx
-        type(riemann_states)      :: rho_star, E_star, v_star, w_star, sqrt_rho_star, E_double_lr
-        type(riemann_states_arr7) :: U, U_star, U_double, F, F_star
-        real(wp), dimension(7)    :: F_hlld
+        type(riemann_states)   :: s, pTot
+        real(wp)               :: p_star, s_M, s_starL, s_starR, denom_ds, sign_Bx
+        type(riemann_states)   :: rho_star, E_star, v_star, w_star, sqrt_rho_star, E_double_lr
+        real(wp), dimension(7) :: U_L, U_R, U_starL, U_starR, U_doubleL, U_doubleR
+        real(wp), dimension(7) :: F_L, F_R, F_starL, F_starR, F_hlld
 
-        ! Indices for U and F: (rho, rho*vel(1), rho*vel(2), rho*vel(3), By, Bz, E) Note: vel and B are permutated, so vel(1) is the
-        ! normal velocity, and x is the normal direction Note: Bx is omitted as the magnetic flux is always zero in the normal
-        ! direction
+        ! Indices for U and F: (rho, rho*vel(1), rho*vel(2), rho*vel(3), By, Bz, E). vel and B are permuted by dir_idx so vel(1) is
+        ! always the normal velocity. Bx is omitted as the normal magnetic flux is always zero.
 
         real(wp) :: v_double, w_double, By_double, Bz_double, E_double
         integer  :: i, j, k, l
@@ -3327,10 +3326,10 @@ contains
             #:set SF = lambda offs: COORDS.format(STENCIL_IDX = SV + offs)
             if (norm_dir == ${NORM_DIR}$) then
                 $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, alpha_rho_R, vel, alpha_L, alpha_R, rho, pres, E, &
-                                    & H_no_mag, gamma, pi_inf, qv, vel_rms, B, c, c_fast, pres_mag, U, U_star, U_double, F, &
-                                    & F_star, F_hlld, s, s_M, s_starL, s_starR, pTot, p_star, rho_star, E_star, sqrt_rho_star, &
-                                    & denom_ds, sign_Bx, v_star, w_star, v_double, w_double, By_double, Bz_double, E_double_lr, &
-                                    & E_double]', copyin='[norm_dir]')
+                                    & H_no_mag, gamma, pi_inf, qv, vel_rms, B, c, c_fast, pres_mag, U_L, U_R, U_starL, U_starR, &
+                                    & U_doubleL, U_doubleR, F_L, F_R, F_starL, F_starR, F_hlld, s, s_M, s_starL, s_starR, pTot, &
+                                    & p_star, rho_star, E_star, sqrt_rho_star, denom_ds, sign_Bx, v_star, w_star, v_double, &
+                                    & w_double, By_double, Bz_double, E_double_lr, E_double]', copyin='[norm_dir]')
                 do l = ${Z_BND}$%beg, ${Z_BND}$%end
                     do k = ${Y_BND}$%beg, ${Y_BND}$%end
                         do j = ${X_BND}$%beg, ${X_BND}$%end
@@ -3424,26 +3423,26 @@ contains
                             E_star%R = ((s%R - vel%R(1))*E%R - pTot%R*vel%R(1) + p_star*s_M)/(s%R - s_M)
 
                             ! (5) Compute left/right state vectors and fluxes
-                            U%L = [rho%L, rho%L*vel%L(1:3), B%L(2:3), E%L]
-                            U_star%L = [rho_star%L, rho_star%L*s_M, rho_star%L*vel%L(2:3), B%L(2:3), E_star%L]
-                            U%R = [rho%R, rho%R*vel%R(1:3), B%R(2:3), E%R]
-                            U_star%R = [rho_star%R, rho_star%R*s_M, rho_star%R*vel%R(2:3), B%R(2:3), E_star%R]
+                            U_L = [rho%L, rho%L*vel%L(1:3), B%L(2:3), E%L]
+                            U_starL = [rho_star%L, rho_star%L*s_M, rho_star%L*vel%L(2:3), B%L(2:3), E_star%L]
+                            U_R = [rho%R, rho%R*vel%R(1:3), B%R(2:3), E%R]
+                            U_starR = [rho_star%R, rho_star%R*s_M, rho_star%R*vel%R(2:3), B%R(2:3), E_star%R]
 
                             ! Compute the left/right fluxes
-                            F%L(1) = U%L(2)
-                            F%L(2) = U%L(2)*vel%L(1) - B%L(1)*B%L(1) + pTot%L
-                            F%L(3:4) = U%L(2)*vel%L(2:3) - B%L(1)*B%L(2:3)
-                            F%L(5:6) = vel%L(1)*B%L(2:3) - vel%L(2:3)*B%L(1)
-                            F%L(7) = (E%L + pTot%L)*vel%L(1) - B%L(1)*(vel%L(1)*B%L(1) + vel%L(2)*B%L(2) + vel%L(3)*B%L(3))
-
-                            F%R(1) = U%R(2)
-                            F%R(2) = U%R(2)*vel%R(1) - B%R(1)*B%R(1) + pTot%R
-                            F%R(3:4) = U%R(2)*vel%R(2:3) - B%R(1)*B%R(2:3)
-                            F%R(5:6) = vel%R(1)*B%R(2:3) - vel%R(2:3)*B%R(1)
-                            F%R(7) = (E%R + pTot%R)*vel%R(1) - B%R(1)*(vel%R(1)*B%R(1) + vel%R(2)*B%R(2) + vel%R(3)*B%R(3))
+                            F_L(1) = U_L(2)
+                            F_L(2) = U_L(2)*vel%L(1) - B%L(1)*B%L(1) + pTot%L
+                            F_L(3:4) = U_L(2)*vel%L(2:3) - B%L(1)*B%L(2:3)
+                            F_L(5:6) = vel%L(1)*B%L(2:3) - vel%L(2:3)*B%L(1)
+                            F_L(7) = (E%L + pTot%L)*vel%L(1) - B%L(1)*(vel%L(1)*B%L(1) + vel%L(2)*B%L(2) + vel%L(3)*B%L(3))
+
+                            F_R(1) = U_R(2)
+                            F_R(2) = U_R(2)*vel%R(1) - B%R(1)*B%R(1) + pTot%R
+                            F_R(3:4) = U_R(2)*vel%R(2:3) - B%R(1)*B%R(2:3)
+                            F_R(5:6) = vel%R(1)*B%R(2:3) - vel%R(2:3)*B%R(1)
+                            F_R(7) = (E%R + pTot%R)*vel%R(1) - B%R(1)*(vel%R(1)*B%R(1) + vel%R(2)*B%R(2) + vel%R(3)*B%R(3))
                             ! HLLD star-state fluxes via HLL jump relation
-                            F_star%L = F%L + s%L*(U_star%L - U%L)
-                            F_star%R = F%R + s%R*(U_star%R - U%R)
+                            F_starL = F_L + s%L*(U_starL - U_L)
+                            F_starR = F_R + s%R*(U_starR - U_R)
                             ! Alfven wave speeds bounding the rotational discontinuities
                             s_starL = s_M - abs(B%L(1))/sqrt(rho_star%L)
                             s_starR = s_M + abs(B%L(1))/sqrt(rho_star%R)
@@ -3468,24 +3467,24 @@ contains
                                 & + w_double*Bz_double))*sign_Bx
                             E_double = 0.5_wp*(E_double_lr%L + E_double_lr%R)
 
-                            U_double%L = [rho_star%L, rho_star%L*s_M, rho_star%L*v_double, rho_star%L*w_double, By_double, &
+                            U_doubleL = [rho_star%L, rho_star%L*s_M, rho_star%L*v_double, rho_star%L*w_double, By_double, &
                                 & Bz_double, E_double]
-                            U_double%R = [rho_star%R, rho_star%R*s_M, rho_star%R*v_double, rho_star%R*w_double, By_double, &
+                            U_doubleR = [rho_star%R, rho_star%R*s_M, rho_star%R*v_double, rho_star%R*w_double, By_double, &
                                 & Bz_double, E_double]
 
                             ! Select HLLD flux region
                             if (0.0_wp <= s%L) then
-                                F_hlld = F%L
+                                F_hlld = F_L
                             else if (0.0_wp <= s_starL) then
-                                F_hlld = F%L + s%L*(U_star%L - U%L)
+                                F_hlld = F_L + s%L*(U_starL - U_L)
                             else if (0.0_wp <= s_M) then
-                                F_hlld = F_star%L + s_starL*(U_double%L - U_star%L)
+                                F_hlld = F_starL + s_starL*(U_doubleL - U_starL)
                             else if (0.0_wp <= s_starR) then
-                                F_hlld = F_star%R + s_starR*(U_double%R - U_star%R)
+                                F_hlld = F_starR + s_starR*(U_doubleR - U_starR)
                             else if (0.0_wp <= s%R) then
-                                F_hlld = F%R + s%R*(U_star%R - U%R)
+                                F_hlld = F_R + s%R*(U_starR - U_R)
                             else
-                                F_hlld = F%R
+                                F_hlld = F_R
                             end if
 
                             ! (12) Write HLLD flux to output arrays