|
| 1 | +! test05_device_routine_int_arg.f90 |
| 2 | +! Tests whether passing an integer literal constant vs an integer variable |
| 3 | +! as an argument to a device (declare target) subroutine gives the same result. |
| 4 | +! |
| 5 | +! The specific pattern from MFC's HLLD solver: |
| 6 | +! call sub(B, 1, ...) -- literal index into B(3) |
| 7 | +! call sub(B, norm, ...) -- variable, equals 1 at runtime |
| 8 | +! inline: B(1)**2 -- direct element access, no call |
| 9 | +! |
| 10 | +! Known AMD bug (flang <= 6.x): literal integer argument causes wrong GPU code. |
| 11 | +program test05_device_routine_int_arg |
| 12 | + implicit none |
| 13 | + integer, parameter :: wp = 8 |
| 14 | + integer, parameter :: N = 10000 |
| 15 | + |
| 16 | + real(wp) :: res_lit(N), res_var(N), res_inline(N) |
| 17 | + real(wp) :: B1(N), B2(N), rho(N), c_sq(N) |
| 18 | + real(wp) :: B2_total, term, disc |
| 19 | + integer :: i, norm_idx, nerr_lit, nerr_var |
| 20 | + |
| 21 | + do i = 1, N |
| 22 | + rho(i) = 1._wp + real(i, wp) * 0.001_wp |
| 23 | + c_sq(i) = 0.5_wp + real(i, wp) * 0.0001_wp |
| 24 | + B1(i) = 0.1_wp * real(i, wp) * 0.001_wp |
| 25 | + B2(i) = 0.05_wp + real(i, wp) * 0.00001_wp |
| 26 | + end do |
| 27 | + |
| 28 | + !$omp target enter data map(to: rho, c_sq, B1, B2) |
| 29 | + !$omp target enter data map(alloc: res_lit, res_var, res_inline) |
| 30 | + |
| 31 | + ! Pass integer literal 1 |
| 32 | + !$omp target teams distribute parallel do |
| 33 | + do i = 1, N |
| 34 | + call magnetosonic(rho(i), c_sq(i), B1(i), B2(i), 1, res_lit(i)) |
| 35 | + end do |
| 36 | + !$omp end target teams distribute parallel do |
| 37 | + |
| 38 | + ! Pass integer variable = 1 |
| 39 | + !$omp target teams distribute parallel do private(norm_idx) |
| 40 | + do i = 1, N |
| 41 | + norm_idx = 1 |
| 42 | + call magnetosonic(rho(i), c_sq(i), B1(i), B2(i), norm_idx, res_var(i)) |
| 43 | + end do |
| 44 | + !$omp end target teams distribute parallel do |
| 45 | + |
| 46 | + ! Inline: explicit B(1) reference, no function call |
| 47 | + !$omp target teams distribute parallel do private(B2_total, term, disc) |
| 48 | + do i = 1, N |
| 49 | + B2_total = B1(i)**2._wp + B2(i)**2._wp |
| 50 | + term = c_sq(i) + B2_total / rho(i) |
| 51 | + disc = term**2._wp - 4._wp * c_sq(i) * (B1(i)**2._wp / rho(i)) |
| 52 | + res_inline(i) = sqrt(max(0._wp, 0.5_wp * (term + sqrt(max(0._wp, disc))))) |
| 53 | + end do |
| 54 | + !$omp end target teams distribute parallel do |
| 55 | + |
| 56 | + !$omp target exit data map(from: res_lit, res_var, res_inline) |
| 57 | + !$omp target exit data map(delete: rho, c_sq, B1, B2) |
| 58 | + |
| 59 | + nerr_lit = 0 |
| 60 | + nerr_var = 0 |
| 61 | + do i = 1, N |
| 62 | + if (abs(res_lit(i) - res_inline(i)) > 1.e-12_wp * abs(res_inline(i)) + 1.e-15_wp) & |
| 63 | + nerr_lit = nerr_lit + 1 |
| 64 | + if (abs(res_var(i) - res_inline(i)) > 1.e-12_wp * abs(res_inline(i)) + 1.e-15_wp) & |
| 65 | + nerr_var = nerr_var + 1 |
| 66 | + end do |
| 67 | + |
| 68 | + if (nerr_lit == 0) then |
| 69 | + print *, "PASS test05a: integer literal 1 as device routine arg matches inline" |
| 70 | + else |
| 71 | + print *, "FAIL test05a:", nerr_lit, "errors -- literal int arg differs from inline" |
| 72 | + print *, " sample: res_lit(1)=", res_lit(1), " vs inline=", res_inline(1) |
| 73 | + end if |
| 74 | + |
| 75 | + if (nerr_var == 0) then |
| 76 | + print *, "PASS test05b: integer variable (=1) as device routine arg matches inline" |
| 77 | + else |
| 78 | + print *, "FAIL test05b:", nerr_var, "errors -- variable int arg differs from inline" |
| 79 | + end if |
| 80 | + |
| 81 | +contains |
| 82 | + |
| 83 | + subroutine magnetosonic(rho, c_sq, B_n, B_t, norm_idx, c_fast) |
| 84 | + !$omp declare target |
| 85 | + real(wp), intent(in) :: rho, c_sq, B_n, B_t |
| 86 | + integer, intent(in) :: norm_idx |
| 87 | + real(wp), intent(out) :: c_fast |
| 88 | + real(wp) :: B_arr(2), B2_total, term, disc |
| 89 | + |
| 90 | + B_arr(1) = B_n |
| 91 | + B_arr(2) = B_t |
| 92 | + B2_total = B_arr(1)**2._wp + B_arr(2)**2._wp |
| 93 | + term = c_sq + B2_total / rho |
| 94 | + ! norm_idx selects the normal B component (1 = B_n, 2 = B_t) |
| 95 | + disc = term**2._wp - 4._wp * c_sq * (B_arr(norm_idx)**2._wp / rho) |
| 96 | + c_fast = sqrt(max(0._wp, 0.5_wp * (term + sqrt(max(0._wp, disc))))) |
| 97 | + end subroutine |
| 98 | + |
| 99 | +end program test05_device_routine_int_arg |
0 commit comments