Skip to content

Commit 2fb5899

Browse files
author
Spencer Bryngelson
committed
fix: AMD GPU OpenMP target HLLD solver failures on Frontier (AFAR 23.x)
Three classes of AMD flang bugs cause wrong/NaN results in the HLLD Riemann solver under --gpu mp builds: 1. VLA dimension(num_fluids) in OpenMP target private clause triggers a GPU memory access fault. Fixed with USING_AMD guards that use dimension(3) instead (non-case-optimized builds only). 2. Array constructors [a,b,c] used as whole-array RHS inside target loops produce silent wrong values. Fixed by replacing all B%L=[...], U%L=[...], U_star%L=[...], U_double%L=[...] with explicit element assignments. 3. Whole-array arithmetic a(:)=b(:)+s*(c(:)-b(:)) inside target loops also produces silent wrong values. Fixed by replacing F_star and F_hlld assignments with explicit do loops. Also replaces sum(B**2) with explicit elements in s_compute_fast_magnetosonic_speed (sum() called _FortranASumReal8 which was unavailable in AMD GPU device code on older AMD flang). Verified: tests 2E1EEFBE (1D dai_woodward_hlld) and 2ADA983F (2D Orszag-Tang HLLD) both pass on Frontier with AFAR 23.1.0. Adds miniapps/amd_omp/ diagnostic suite (tests 01-19) isolating each AMD GPU bug pattern for future compiler regression tracking.
1 parent 3ba0419 commit 2fb5899

27 files changed

Lines changed: 1849 additions & 53 deletions

miniapps/amd_omp/Makefile

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
FC = ftn
2+
# Flags matching MFC's AMD GPU build (CMakeLists.txt, gfx90a / Frontier MI250X)
3+
FFLAGS = -fopenmp --offload-arch=gfx90a -fopenmp-target-fast \
4+
-fopenmp-assume-threads-oversubscription \
5+
-fopenmp-assume-teams-oversubscription
6+
7+
TESTS = test01_dt_elem_access \
8+
test02_dt_array_constructor \
9+
test03_dt_whole_array_ops \
10+
test04_dt_zero_init \
11+
test05_device_routine_int_arg \
12+
test06_dt_member_as_array_arg \
13+
test07_module_param_array \
14+
test08_dt_private_array_elems \
15+
test09_dt_arr7_arithmetic \
16+
test10_hlld_flux
17+
18+
.PHONY: all clean run
19+
20+
all: $(TESTS)
21+
22+
%: %.f90
23+
$(FC) $(FFLAGS) -o $@ $<
24+
25+
run: $(TESTS)
26+
@echo "=== Running AMD OMP target miniapps ==="
27+
@for t in $(TESTS); do \
28+
echo "--- $$t ---"; \
29+
./$$t; \
30+
done
31+
@echo "=== Done ==="
32+
33+
clean:
34+
rm -f $(TESTS) *.mod *.o

miniapps/amd_omp/run_frontier.sh

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/bin/bash
2+
#SBATCH -A cfd154
3+
#SBATCH -J amd_omp_miniapps
4+
#SBATCH -o %x_%j.out
5+
#SBATCH -t 00:10:00
6+
#SBATCH -N 1
7+
#SBATCH -q hackathon
8+
9+
cd /lustre/orion/cfd154/scratch/sbryngelson/MFC-derivedtypes/miniapps/amd_omp
10+
11+
module load python cmake cpe/25.09 PrgEnv-amd amd/7.2.0 rocm/7.2.0
12+
13+
make clean
14+
make -j6 2>&1 | grep -E "Error|error|warning|Warning|FAIL|PASS|^make" || true
15+
echo ""
16+
echo "=== Unfiltered build for tests 07-10 ==="
17+
for t in test07_module_param_array test08_dt_private_array_elems test09_dt_arr7_arithmetic test10_hlld_flux; do
18+
echo "--- Building $t ---"
19+
ftn -fopenmp --offload-arch=gfx90a -fopenmp-target-fast \
20+
-fopenmp-assume-threads-oversubscription \
21+
-fopenmp-assume-teams-oversubscription \
22+
-o $t $t.f90 2>&1
23+
echo "exit: $?"
24+
done
25+
26+
echo ""
27+
echo "=== Running miniapps ==="
28+
for t in test01_dt_elem_access test02_dt_array_constructor test03_dt_whole_array_ops \
29+
test04_dt_zero_init test05_device_routine_int_arg test06_dt_member_as_array_arg \
30+
test07_module_param_array \
31+
test08_dt_private_array_elems test09_dt_arr7_arithmetic test10_hlld_flux; do
32+
if [ -x "./$t" ]; then
33+
echo "--- $t ---"
34+
./$t
35+
else
36+
echo "--- $t --- SKIPPED (did not build)"
37+
fi
38+
done
39+
echo "=== Done ==="
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
! test01_dt_elem_access.f90
2+
! Derived type with real array members as private in target parallel loop.
3+
! Basic element-by-element access: v%L(1), v%L(2), v%L(3).
4+
! Expected: always works (this is the baseline / reference pattern).
5+
program test01_dt_elem_access
6+
implicit none
7+
integer, parameter :: wp = 8
8+
integer, parameter :: N = 10000
9+
10+
type :: vec3
11+
real(wp) :: L(3), R(3)
12+
end type
13+
14+
real(wp) :: res(N)
15+
type(vec3) :: v
16+
real(wp) :: expected
17+
integer :: i, nerr
18+
19+
res = 0._wp
20+
21+
!$omp target teams distribute parallel do map(from:res) private(v)
22+
do i = 1, N
23+
v%L(1) = real(i, wp)
24+
v%L(2) = real(i, wp) * 2._wp
25+
v%L(3) = real(i, wp) * 3._wp
26+
v%R(1) = -real(i, wp)
27+
v%R(2) = -real(i, wp) * 2._wp
28+
v%R(3) = -real(i, wp) * 3._wp
29+
res(i) = v%L(1)**2._wp + v%L(2)**2._wp + v%L(3)**2._wp &
30+
+ v%R(1)**2._wp + v%R(2)**2._wp + v%R(3)**2._wp
31+
end do
32+
!$omp end target teams distribute parallel do
33+
34+
nerr = 0
35+
do i = 1, N
36+
expected = real(i, wp)**2._wp * (1._wp + 4._wp + 9._wp) * 2._wp
37+
if (abs(res(i) - expected) > 1.e-10_wp * expected) nerr = nerr + 1
38+
end do
39+
40+
if (nerr == 0) then
41+
print *, "PASS test01: derived type element access in private target var"
42+
else
43+
print *, "FAIL test01:", nerr, "errors -- derived type element access"
44+
end if
45+
end program test01_dt_elem_access
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
! test02_dt_array_constructor.f90
2+
! Array constructors assigned to derived type array members inside a target region.
3+
! v%L = [a, b, c]
4+
! Known AMD bug: this causes HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION.
5+
! Expected failure on AMD flang <= 6.x; may work on 7.x+.
6+
program test02_dt_array_constructor
7+
implicit none
8+
integer, parameter :: wp = 8
9+
integer, parameter :: N = 10000
10+
11+
type :: vec3
12+
real(wp) :: L(3), R(3)
13+
end type
14+
15+
real(wp) :: res(N)
16+
type(vec3) :: v
17+
real(wp) :: a, b, c, expected
18+
integer :: i, nerr
19+
20+
res = 0._wp
21+
22+
!$omp target teams distribute parallel do map(from:res) private(v, a, b, c)
23+
do i = 1, N
24+
a = real(i, wp)
25+
b = real(i, wp) * 2._wp
26+
c = real(i, wp) * 3._wp
27+
v%L = [a, b, c]
28+
v%R = [-a, -b, -c]
29+
res(i) = v%L(1)**2._wp + v%L(2)**2._wp + v%L(3)**2._wp &
30+
+ v%R(1)**2._wp + v%R(2)**2._wp + v%R(3)**2._wp
31+
end do
32+
!$omp end target teams distribute parallel do
33+
34+
nerr = 0
35+
do i = 1, N
36+
expected = real(i, wp)**2._wp * (1._wp + 4._wp + 9._wp) * 2._wp
37+
if (abs(res(i) - expected) > 1.e-10_wp * expected) nerr = nerr + 1
38+
end do
39+
40+
if (nerr == 0) then
41+
print *, "PASS test02: array constructor on derived type member"
42+
else
43+
print *, "FAIL test02:", nerr, "errors -- array constructor on derived type member"
44+
end if
45+
end program test02_dt_array_constructor
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
! test03_dt_whole_array_ops.f90
2+
! Whole-array intrinsics on derived type array members in target region:
3+
! sum(v%L**2) -- magnitude squared
4+
! v%L**2 -- element-wise squaring (whole-array expression)
5+
! Known AMD bug: produces wrong results or zero.
6+
program test03_dt_whole_array_ops
7+
implicit none
8+
integer, parameter :: wp = 8
9+
integer, parameter :: N = 10000
10+
11+
type :: vec3
12+
real(wp) :: L(3), R(3)
13+
end type
14+
15+
real(wp) :: res_sum(N), res_max(N)
16+
type(vec3) :: v
17+
real(wp) :: expected_sum, expected_max
18+
integer :: i, nerr_sum, nerr_max
19+
20+
res_sum = 0._wp
21+
res_max = 0._wp
22+
23+
!$omp target teams distribute parallel do map(from:res_sum,res_max) private(v)
24+
do i = 1, N
25+
v%L(1) = real(i, wp)
26+
v%L(2) = real(i, wp) * 2._wp
27+
v%L(3) = real(i, wp) * 3._wp
28+
res_sum(i) = sum(v%L**2._wp)
29+
res_max(i) = maxval(v%L)
30+
end do
31+
!$omp end target teams distribute parallel do
32+
33+
nerr_sum = 0
34+
nerr_max = 0
35+
do i = 1, N
36+
expected_sum = real(i, wp)**2._wp * (1._wp + 4._wp + 9._wp)
37+
expected_max = real(i, wp) * 3._wp
38+
if (abs(res_sum(i) - expected_sum) > 1.e-10_wp * expected_sum) nerr_sum = nerr_sum + 1
39+
if (abs(res_max(i) - expected_max) > 1.e-10_wp * expected_max) nerr_max = nerr_max + 1
40+
end do
41+
42+
if (nerr_sum == 0) then
43+
print *, "PASS test03a: sum(v%L**2) on private derived type member"
44+
else
45+
print *, "FAIL test03a:", nerr_sum, "errors -- sum(v%L**2) on private derived type member"
46+
end if
47+
48+
if (nerr_max == 0) then
49+
print *, "PASS test03b: maxval(v%L) on private derived type member"
50+
else
51+
print *, "FAIL test03b:", nerr_max, "errors -- maxval(v%L) on private derived type member"
52+
end if
53+
end program test03_dt_whole_array_ops
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
! test04_dt_zero_init.f90
2+
! Tests whether private derived type members are zero-initialized at entry
3+
! to each GPU thread, as the Fortran standard requires for local vars.
4+
! Known AMD bug: private derived type members are NOT zero-initialized,
5+
! causing incorrect accumulations (e.g. vel_rms += ...) if not explicitly set.
6+
program test04_dt_zero_init
7+
implicit none
8+
integer, parameter :: wp = 8
9+
integer, parameter :: N = 10000
10+
integer, parameter :: M = 3
11+
12+
type :: scalar_state
13+
real(wp) :: L, R
14+
end type
15+
16+
real(wp) :: res(N)
17+
type(scalar_state) :: acc
18+
real(wp) :: expected
19+
integer :: i, j, nerr
20+
21+
res = 0._wp
22+
23+
!$omp target teams distribute parallel do map(from:res) private(acc, j)
24+
do i = 1, N
25+
! Accumulate into private acc%L without explicit initialization.
26+
! If zero-init is NOT guaranteed, acc%L may start with garbage.
27+
acc%L = 0._wp
28+
acc%R = 0._wp
29+
do j = 1, M
30+
acc%L = acc%L + real(j, wp)**2._wp
31+
acc%R = acc%R + real(j, wp)
32+
end do
33+
res(i) = acc%L - acc%R
34+
end do
35+
!$omp end target teams distribute parallel do
36+
37+
! acc%L = 1 + 4 + 9 = 14; acc%R = 1+2+3 = 6; diff = 8
38+
nerr = 0
39+
expected = 14._wp - 6._wp
40+
do i = 1, N
41+
if (abs(res(i) - expected) > 1.e-10_wp) nerr = nerr + 1
42+
end do
43+
44+
if (nerr == 0) then
45+
print *, "PASS test04: explicit zero-init of private derived type scalar member"
46+
else
47+
print *, "FAIL test04:", nerr, "errors -- private derived type scalar accumulation"
48+
end if
49+
end program test04_dt_zero_init
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
! test05_device_routine_int_arg.f90
2+
! Tests whether passing an integer literal constant vs an integer variable
3+
! as an argument to a device (declare target) subroutine gives the same result.
4+
!
5+
! The specific pattern from MFC's HLLD solver:
6+
! call sub(B, 1, ...) -- literal index into B(3)
7+
! call sub(B, norm, ...) -- variable, equals 1 at runtime
8+
! inline: B(1)**2 -- direct element access, no call
9+
!
10+
! Known AMD bug (flang <= 6.x): literal integer argument causes wrong GPU code.
11+
program test05_device_routine_int_arg
12+
implicit none
13+
integer, parameter :: wp = 8
14+
integer, parameter :: N = 10000
15+
16+
real(wp) :: res_lit(N), res_var(N), res_inline(N)
17+
real(wp) :: B1(N), B2(N), rho(N), c_sq(N)
18+
real(wp) :: B2_total, term, disc
19+
integer :: i, norm_idx, nerr_lit, nerr_var
20+
21+
do i = 1, N
22+
rho(i) = 1._wp + real(i, wp) * 0.001_wp
23+
c_sq(i) = 0.5_wp + real(i, wp) * 0.0001_wp
24+
B1(i) = 0.1_wp * real(i, wp) * 0.001_wp
25+
B2(i) = 0.05_wp + real(i, wp) * 0.00001_wp
26+
end do
27+
28+
!$omp target enter data map(to: rho, c_sq, B1, B2)
29+
!$omp target enter data map(alloc: res_lit, res_var, res_inline)
30+
31+
! Pass integer literal 1
32+
!$omp target teams distribute parallel do
33+
do i = 1, N
34+
call magnetosonic(rho(i), c_sq(i), B1(i), B2(i), 1, res_lit(i))
35+
end do
36+
!$omp end target teams distribute parallel do
37+
38+
! Pass integer variable = 1
39+
!$omp target teams distribute parallel do private(norm_idx)
40+
do i = 1, N
41+
norm_idx = 1
42+
call magnetosonic(rho(i), c_sq(i), B1(i), B2(i), norm_idx, res_var(i))
43+
end do
44+
!$omp end target teams distribute parallel do
45+
46+
! Inline: explicit B(1) reference, no function call
47+
!$omp target teams distribute parallel do private(B2_total, term, disc)
48+
do i = 1, N
49+
B2_total = B1(i)**2._wp + B2(i)**2._wp
50+
term = c_sq(i) + B2_total / rho(i)
51+
disc = term**2._wp - 4._wp * c_sq(i) * (B1(i)**2._wp / rho(i))
52+
res_inline(i) = sqrt(max(0._wp, 0.5_wp * (term + sqrt(max(0._wp, disc)))))
53+
end do
54+
!$omp end target teams distribute parallel do
55+
56+
!$omp target exit data map(from: res_lit, res_var, res_inline)
57+
!$omp target exit data map(delete: rho, c_sq, B1, B2)
58+
59+
nerr_lit = 0
60+
nerr_var = 0
61+
do i = 1, N
62+
if (abs(res_lit(i) - res_inline(i)) > 1.e-12_wp * abs(res_inline(i)) + 1.e-15_wp) &
63+
nerr_lit = nerr_lit + 1
64+
if (abs(res_var(i) - res_inline(i)) > 1.e-12_wp * abs(res_inline(i)) + 1.e-15_wp) &
65+
nerr_var = nerr_var + 1
66+
end do
67+
68+
if (nerr_lit == 0) then
69+
print *, "PASS test05a: integer literal 1 as device routine arg matches inline"
70+
else
71+
print *, "FAIL test05a:", nerr_lit, "errors -- literal int arg differs from inline"
72+
print *, " sample: res_lit(1)=", res_lit(1), " vs inline=", res_inline(1)
73+
end if
74+
75+
if (nerr_var == 0) then
76+
print *, "PASS test05b: integer variable (=1) as device routine arg matches inline"
77+
else
78+
print *, "FAIL test05b:", nerr_var, "errors -- variable int arg differs from inline"
79+
end if
80+
81+
contains
82+
83+
subroutine magnetosonic(rho, c_sq, B_n, B_t, norm_idx, c_fast)
84+
!$omp declare target
85+
real(wp), intent(in) :: rho, c_sq, B_n, B_t
86+
integer, intent(in) :: norm_idx
87+
real(wp), intent(out) :: c_fast
88+
real(wp) :: B_arr(2), B2_total, term, disc
89+
90+
B_arr(1) = B_n
91+
B_arr(2) = B_t
92+
B2_total = B_arr(1)**2._wp + B_arr(2)**2._wp
93+
term = c_sq + B2_total / rho
94+
! norm_idx selects the normal B component (1 = B_n, 2 = B_t)
95+
disc = term**2._wp - 4._wp * c_sq * (B_arr(norm_idx)**2._wp / rho)
96+
c_fast = sqrt(max(0._wp, 0.5_wp * (term + sqrt(max(0._wp, disc)))))
97+
end subroutine
98+
99+
end program test05_device_routine_int_arg

0 commit comments

Comments
 (0)