fix: AMD AFAR (LLVMFlang) GPU pointer attachment for derived-type field arrays

sbryngelson · sbryngelson · commit 48d4df137abb · 2026-05-18T12:51:09.000-05:00
AMD AFAR 23.2.1 does not support the OpenMP 5.1 attach() clause on
target enter data. Add OMP_AMD_ATTACH_FIX macro in omp_macros.fpp
that manually performs pointer attachment using omp_get_mapped_ptr:
  1. Gets device address of already-mapped array data
  2. Reconstructs bounds from host pointer metadata
  3. Reassigns device-side pointer in a small !$omp target region

Key fixes applied:
- OMP_ENTER_DATA: suppress attach clause on AMD, call OMP_AMD_ATTACH_FIX instead
- OMP_AMD_ATTACH_FIX: guard c_loc call with associated() to avoid null
  pointer from unassociated aliases (e.g. flux_src_n(i>1) components
  that alias unallocated flux_src_n(1) members)
- Bounds-spec syntax (lb:) instead of bounds-remapping (lb:ub) to avoid
  'target must be rank-1 or simply contiguous' error with c_f_pointer
  results

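Also enables ACC_SETUP_VFs/ACC_SETUP_SFs for MFC_LLVMFlang builds and adds an
attach step to ACC_SETUP_VFs under MFC_OpenMP, so that all vector field structs
get proper device-side pointer setup on AMD.

In m_rhs.fpp, the flux_n, flux_src_n and flux_gsrc_n arrays (declaration,
allocation, zero-initialization and deallocation) are removed and are no
longer passed to s_riemann_solver.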
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -710,12 +710,23 @@ exit 0
                         PRIVATE -DFRONTIER_UNIFIED)
                 endif()
 
-		        find_library(HIP_LIB amdhip64
-                    HINTS "$ENV{OLCF_AFAR_ROOT}/lib" REQUIRED)
-                find_library(HIPFORT_AMDGCN_LIB hipfort-amdgcn
-                    HINTS "$ENV{OLCF_AFAR_ROOT}/lib" REQUIRED)
-                target_include_directories(${a_target} PRIVATE
-                    "$ENV{OLCF_AFAR_ROOT}/include/hipfort/amdgcn")
+                # Use direct paths from OLCF_AFAR_ROOT to avoid system ROCm shadowing.
+                # therock-afar-23.x layout: hipfort-amdgcn at lib/llvm/lib/ and .mod at lib/llvm/include/hipfort/amdgcn/
+                # rocm-afar-22.x layout:    hipfort-amdgcn at lib/         and .mod at include/hipfort/amdgcn/
+                find_library(HIP_LIB amdhip64
+                    PATHS "$ENV{OLCF_AFAR_ROOT}/lib"
+                    NO_DEFAULT_PATH REQUIRED)
+                if(EXISTS "$ENV{OLCF_AFAR_ROOT}/lib/llvm/lib/libhipfort-amdgcn.a")
+                    set(HIPFORT_AMDGCN_LIB "$ENV{OLCF_AFAR_ROOT}/lib/llvm/lib/libhipfort-amdgcn.a")
+                    target_include_directories(${a_target} PRIVATE
+                        "$ENV{OLCF_AFAR_ROOT}/lib/llvm/include/hipfort/amdgcn")
+                else()
+                    find_library(HIPFORT_AMDGCN_LIB hipfort-amdgcn
+                        PATHS "$ENV{OLCF_AFAR_ROOT}/lib"
+                        NO_DEFAULT_PATH REQUIRED)
+                    target_include_directories(${a_target} PRIVATE
+                        "$ENV{OLCF_AFAR_ROOT}/include/hipfort/amdgcn")
+                endif()
                 target_link_libraries(${a_target} PRIVATE
                     ${HIP_LIB} ${HIPFORT_AMDGCN_LIB})
 
diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
@@ -83,9 +83,9 @@
     deallocate (${allocated_variables}$)
 #:enddef DEALLOCATE
 
-! Cray-specific GPU pointer setup for vector fields
+! GPU pointer setup for vector fields (Cray and bare LLVMFlang OpenMP target, e.g. AMD AFAR)
 #:def ACC_SETUP_VFs(*args)
-#ifdef _CRAYFTN
+#if defined(_CRAYFTN) || defined(MFC_LLVMFlang)
     block
         integer :: macros_setup_vfs_i
 
@@ -99,6 +99,9 @@
                     if (associated(${arg}$%vf(macros_setup_vfs_i)%sf)) then
                         $:GPU_ENTER_DATA(copyin=('[' + arg + '%vf(macros_setup_vfs_i)]'))
                         $:GPU_ENTER_DATA(copyin=('[' + arg + '%vf(macros_setup_vfs_i)%sf]'))
+#if defined(MFC_OpenMP)
+                        $:GPU_ENTER_DATA(attach=('[' + arg + '%vf(macros_setup_vfs_i)%sf]'))
+#endif
                     end if
                 end do
             end if
@@ -107,9 +110,9 @@
 #endif
 #:enddef
 
-! Cray-specific GPU pointer setup for scalar fields
+! GPU pointer setup for scalar fields (Cray and bare LLVMFlang OpenMP target, e.g. AMD AFAR)
 #:def ACC_SETUP_SFs(*args)
-#ifdef _CRAYFTN
+#if defined(_CRAYFTN) || defined(MFC_LLVMFlang)
     block
         @:LOG({'@:ACC_SETUP_SFs(${', '.join(args)}$)'})
 
diff --git a/src/common/include/omp_macros.fpp b/src/common/include/omp_macros.fpp
@@ -293,11 +293,18 @@
 #:def OMP_ENTER_DATA(copyin=None, copyinReadOnly=None, create=None, attach=None, extraOmpArgs=None)
     #:set copyin_val = OMP_COPYIN_STR(copyin).strip('\n') + OMP_COPYIN_STR(copyinReadOnly).strip('\n')
     #:set create_val = OMP_CREATE_STR(create)
-    #:set attach_val = OMP_MAP_STR('always,to', attach)
+    #! AMD: emit no attach map clause; OMP_AMD_ATTACH_FIX runs after the directive so copyin/create data is already mapped.
+    #! Other compilers keep the original map(always,to: ...) form; 'target enter data' has no attach() clause in OpenMP.
+    #:set attach_val = '' if USING_AMD else OMP_MAP_STR('always,to', attach)
     #:set extraOmpArgs_val = GEN_EXTRA_ARGS_STR(extraOmpArgs)
     #:set omp_clause_val = copyin_val.strip('\n') + create_val.strip('\n') + attach_val.strip('\n')
-    #:set omp_directive = '!$omp target enter data ' + omp_clause_val + extraOmpArgs_val.strip('\n')
-    $:omp_directive
+    #:if omp_clause_val.strip()
+        #:set omp_directive = '!$omp target enter data ' + omp_clause_val + extraOmpArgs_val.strip('\n')
+        $:omp_directive
+    #:endif
+    #:if USING_AMD
+        $:OMP_AMD_ATTACH_FIX(attach)
+    #:endif
 #:enddef
 
 #:def OMP_EXIT_DATA(copyout=None, delete=None, detach=None, extraOmpArgs=None)
@@ -382,4 +389,42 @@
         $:code
     #:endif
 #:enddef
+
+! Manual pointer attachment for AMD AFAR (LLVMFlang). For each comma-separated entry of `attach` that is associated() on the
+! host, look up the device address of its already-mapped data with omp_get_mapped_ptr; when that address is non-null, a small
+! target region rebuilds a pointer view with c_f_pointer and pointer-assigns the entry to it, restoring the host lower bounds.
+! Limitation: every entry is treated as a rank-3 real(stp) pointer array; unassociated or unmapped entries are skipped silently.
+#:def OMP_AMD_ATTACH_FIX(attach)
+    #:if attach is not None
+        #:set clause_regex = re.compile(',(?![^(]*\\))')
+        #:set attach_str = re.sub(clause_regex, ';', attach.strip('[]'))
+        #:set attach_list = [x.strip() for x in attach_str.split(';')]
+        #:for var_expr in attach_list
+            block
+                use iso_c_binding, only: c_ptr, c_loc, c_associated, c_f_pointer
+                use omp_lib, only: omp_get_mapped_ptr, omp_get_default_device
+                type(c_ptr) :: amd_dev_ptr
+                integer     :: amd_lb1, amd_lb2, amd_lb3, amd_n1, amd_n2, amd_n3
+                if (associated(${var_expr}$)) then
+                    amd_lb1 = lbound(${var_expr}$, 1)
+                    amd_lb2 = lbound(${var_expr}$, 2)
+                    amd_lb3 = lbound(${var_expr}$, 3)
+                    amd_n1 = size(${var_expr}$, 1)
+                    amd_n2 = size(${var_expr}$, 2)
+                    amd_n3 = size(${var_expr}$, 3)
+                    amd_dev_ptr = omp_get_mapped_ptr(c_loc(${var_expr}$), omp_get_default_device())
+                    if (c_associated(amd_dev_ptr)) then
+                        !$omp target firstprivate(amd_dev_ptr, amd_lb1, amd_lb2, amd_lb3, amd_n1, amd_n2, amd_n3)
+                        block
+                            real(stp), pointer :: amd_sf_view(:,:,:)
+                            call c_f_pointer(amd_dev_ptr, amd_sf_view, [amd_n1, amd_n2, amd_n3])
+                            ${var_expr}$(amd_lb1:,amd_lb2:,amd_lb3:) => amd_sf_view
+                        end block
+                        !$omp end target
+                    end if
+                end if
+            end block
+        #:endfor
+    #:endif
+#:enddef
 ! New line at end of file is required for FYPP
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
@@ -67,18 +67,6 @@ module m_rhs
     type(scalar_field), allocatable, dimension(:) :: tau_Re_vf
     $:GPU_DECLARE(create='[tau_Re_vf]')
 
-    !> @name The cell-boundary values of the fluxes (src - source, gsrc - geometrical source). These are computed by applying the
-    !! chosen Riemann problem solver on the left and right cell-boundary values of the primitive variables
-    !> @{
-    type(vector_field), allocatable, dimension(:) :: flux_n
-    type(vector_field), allocatable, dimension(:) :: flux_src_n
-    type(vector_field), allocatable, dimension(:) :: flux_gsrc_n
-
-#if defined(MFC_OpenACC)
-    $:GPU_DECLARE(create='[flux_n, flux_src_n, flux_gsrc_n]')
-#endif
-    !> @}
-
     type(vector_field), allocatable, dimension(:) :: qL_prim, qR_prim
 #if defined(MFC_OpenACC)
     $:GPU_DECLARE(create='[qL_prim, qR_prim]')
@@ -182,79 +170,6 @@ contains
             $:GPU_ENTER_DATA(attach='[q_prim_qp%vf(eqn_idx%psi)%sf]')
         end if
 
-        if (.not. igr) then
-            @:ALLOCATE(flux_n(1:num_dims))
-            @:ALLOCATE(flux_src_n(1:num_dims))
-            @:ALLOCATE(flux_gsrc_n(1:num_dims))
-
-            do i = 1, num_dims
-                @:ALLOCATE(flux_n(i)%vf(1:sys_size))
-                @:ALLOCATE(flux_src_n(i)%vf(1:sys_size))
-                @:ALLOCATE(flux_gsrc_n(i)%vf(1:sys_size))
-
-                if (i == 1) then
-                    do l = 1, sys_size
-                        @:ALLOCATE(flux_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, &
-                                   & idwbuff(3)%beg:idwbuff(3)%end))
-                        @:ALLOCATE(flux_gsrc_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, &
-                                   & idwbuff(3)%beg:idwbuff(3)%end))
-                    end do
-
-                    if (viscous .or. surface_tension) then
-                        do l = eqn_idx%mom%beg, eqn_idx%E
-                            @:ALLOCATE(flux_src_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, &
-                                       & idwbuff(3)%beg:idwbuff(3)%end))
-                        end do
-                    end if
-
-                    @:ALLOCATE(flux_src_n(i)%vf(eqn_idx%adv%beg)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, &
-                               & idwbuff(3)%beg:idwbuff(3)%end))
-
-                    if (riemann_solver == 1 .or. riemann_solver == 4) then
-                        do l = eqn_idx%adv%beg + 1, eqn_idx%adv%end
-                            @:ALLOCATE(flux_src_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, &
-                                       & idwbuff(3)%beg:idwbuff(3)%end))
-                        end do
-                    end if
-
-                    if (chemistry) then
-                        do l = eqn_idx%species%beg, eqn_idx%species%end
-                            @:ALLOCATE(flux_src_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, &
-                                       & idwbuff(3)%beg:idwbuff(3)%end))
-                        end do
-                        if (chem_params%diffusion .and. .not. viscous) then
-                            @:ALLOCATE(flux_src_n(i)%vf(eqn_idx%E)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                                       & idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end))
-                        end if
-                    end if
-                else
-                    do l = 1, sys_size
-                        @:ALLOCATE(flux_gsrc_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, &
-                                   & idwbuff(3)%beg:idwbuff(3)%end))
-                    end do
-                end if
-
-                @:ACC_SETUP_VFs(flux_n(i))
-                @:ACC_SETUP_VFs(flux_src_n(i), flux_gsrc_n(i))
-
-                if (i == 1) then
-                    if (riemann_solver /= 1) then
-                        do l = eqn_idx%adv%beg + 1, eqn_idx%adv%end
-                            flux_src_n(i)%vf(l)%sf => flux_src_n(i)%vf(eqn_idx%adv%beg)%sf
-                            $:GPU_ENTER_DATA(attach='[flux_src_n(i)%vf(l)%sf]')
-                        end do
-                    end if
-                else
-                    do l = 1, sys_size
-                        flux_n(i)%vf(l)%sf => flux_n(1)%vf(l)%sf
-                        $:GPU_ENTER_DATA(attach='[flux_n(i)%vf(l)%sf]')
-                        flux_src_n(i)%vf(l)%sf => flux_src_n(1)%vf(l)%sf
-                        $:GPU_ENTER_DATA(attach='[flux_src_n(i)%vf(l)%sf]')
-                    end do
-                end if
-            end do
-        end if
-
         if ((.not. igr)) then
             @:ALLOCATE(dq_prim_dx_qp(1:1))
             @:ALLOCATE(dq_prim_dy_qp(1:1))
@@ -411,20 +326,6 @@ contains
                     end if
                 end do
             end if
-
-            $:GPU_PARALLEL_LOOP(private='[i, j, k, l, id]', collapse=4)
-            do id = 1, num_dims
-                do i = 1, sys_size
-                    do l = idwbuff(3)%beg, idwbuff(3)%end
-                        do k = idwbuff(2)%beg, idwbuff(2)%end
-                            do j = idwbuff(1)%beg, idwbuff(1)%end
-                                flux_gsrc_n(id)%vf(i)%sf(j, k, l) = 0._wp
-                            end do
-                        end do
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
         end if
 
         if (qbmm) then
@@ -716,7 +617,7 @@ contains
                 call nvtxStartRange("RHS-RIEMANN-SOLVER")
                 call s_riemann_solver(qR_rsx_vf, dqR_prim_n%x(id)%vf, dqR_prim_n%y(id)%vf, dqR_prim_n%z(id)%vf, qR_prim(id)%vf, &
                                       & qL_rsx_vf, dqL_prim_n%x(id)%vf, dqL_prim_n%y(id)%vf, dqL_prim_n%z(id)%vf, qL_prim(id)%vf, &
-                                      & q_prim_qp%vf, flux_n(id)%vf, flux_src_n(id)%vf, flux_gsrc_n(id)%vf, id, irx, iry, irz)
+                                      & q_prim_qp%vf, id, irx, iry, irz)
                 call nvtxEndRange
 
                 ! Additional physics and source terms RHS addition for advection source
@@ -1805,49 +1706,6 @@ contains
             deallocate (alf_sum%sf)
         end if
 
-        if (.not. igr) then
-            do i = num_dims, 1, -1
-                if (i /= 1) then
-                    do l = 1, sys_size
-                        nullify (flux_n(i)%vf(l)%sf)
-                        nullify (flux_src_n(i)%vf(l)%sf)
-                        @:DEALLOCATE(flux_gsrc_n(i)%vf(l)%sf)
-                    end do
-                else
-                    do l = 1, sys_size
-                        @:DEALLOCATE(flux_n(i)%vf(l)%sf)
-                        @:DEALLOCATE(flux_gsrc_n(i)%vf(l)%sf)
-                    end do
-
-                    if (viscous) then
-                        do l = eqn_idx%mom%beg, eqn_idx%E
-                            @:DEALLOCATE(flux_src_n(i)%vf(l)%sf)
-                        end do
-                    end if
-
-                    if (chem_params%diffusion .and. .not. viscous) then
-                        @:DEALLOCATE(flux_src_n(i)%vf(eqn_idx%E)%sf)
-                    end if
-
-                    if (riemann_solver == 1 .or. riemann_solver == 4) then
-                        do l = eqn_idx%adv%beg + 1, eqn_idx%adv%end
-                            @:DEALLOCATE(flux_src_n(i)%vf(l)%sf)
-                        end do
-                    else
-                        do l = eqn_idx%adv%beg + 1, eqn_idx%adv%end
-                            nullify (flux_src_n(i)%vf(l)%sf)
-                        end do
-                    end if
-
-                    @:DEALLOCATE(flux_src_n(i)%vf(eqn_idx%adv%beg)%sf)
-                end if
-
-                @:DEALLOCATE(flux_n(i)%vf, flux_src_n(i)%vf, flux_gsrc_n(i)%vf)
-            end do
-
-            @:DEALLOCATE(flux_n, flux_src_n, flux_gsrc_n)
-        end if
-
     end subroutine s_finalize_rhs_module
 
 end module m_rhs
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp