Skip to content

Commit d00e4a4

Browse files
authored
Merge branch 'main' into revert#2865
2 parents 870aeb1 + e56e2fd commit d00e4a4

11 files changed

Lines changed: 374 additions & 13 deletions

File tree

.buildkite/pipeline.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,21 @@ steps:
1919
timeout_in_minutes: 60
2020
soft_fail:
2121
- exit_status: 3
22+
- label: "AMDGPU Julia {{matrix.version}}"
23+
matrix:
24+
setup:
25+
version:
26+
- "1.10"
27+
plugins:
28+
- JuliaCI/julia#v1:
29+
version: "{{matrix.version}}"
30+
- JuliaCI/julia-test#v1: ~
31+
env:
32+
TRIXI_TEST: "AMDGPU"
33+
agents:
34+
queue: "juliagpu"
35+
rocm: "*"
36+
if: build.message !~ /\[skip ci\]/
37+
timeout_in_minutes: 60
38+
soft_fail:
39+
- exit_status: 3

NEWS.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ Trixi.jl follows the interpretation of
55
used in the Julia ecosystem. Notable changes will be documented in this file
66
for human readability.
77

8+
## Changes in the v0.16 lifecycle
9+
10+
#### Added
11+
- GPU support has been extended to AMD GPUs, with a Buildkite workflow using `TRIXI_TEST=AMDGPU` ([#2834]).
12+
813
## Changes when updating to v0.16 from v0.15.x
914

1015
#### Changed

Project.toml

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ TrixiBase = "9a0f1c46-06d5-4909-a5a3-ce25d3fa3284"
5353
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
5454

5555
[weakdeps]
56+
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
5657
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
5758
Convex = "f65535da-76fb-5f13-bab9-19810c17039a"
5859
ECOS = "e2685f51-7e38-5353-a97d-a921fd2c8199"
@@ -62,6 +63,7 @@ Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
6263
SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5"
6364

6465
[extensions]
66+
TrixiAMDGPUExt = "AMDGPU"
6567
TrixiCUDAExt = "CUDA"
6668
TrixiConvexECOSExt = ["Convex", "ECOS"]
6769
TrixiMakieExt = "Makie"
@@ -70,9 +72,10 @@ TrixiPlotsExt = "Plots"
7072
TrixiSparseConnectivityTracerExt = "SparseConnectivityTracer"
7173

7274
[compat]
73-
Accessors = "0.1.36"
74-
Adapt = "4.1"
75-
CUDA = "5.8.2"
75+
Accessors = "0.1.42"
76+
AMDGPU = "2.2.1"
77+
Adapt = "4.4"
78+
CUDA = "5.9.1"
7679
CodeTracking = "1.0.5, 2, 3"
7780
ConstructionBase = "1.5.8"
7881
Convex = "0.16"
@@ -90,8 +93,8 @@ KernelAbstractions = "0.9.38"
9093
LinearAlgebra = "1"
9194
LinearMaps = "2.7, 3.0"
9295
LoopVectorization = "0.12.171"
93-
MPI = "0.20.22"
94-
Makie = "0.21, 0.22, 0.23, 0.24"
96+
MPI = "0.20.23"
97+
Makie = "0.22, 0.23, 0.24"
9598
MuladdMacro = "0.2.4"
9699
NLsolve = "4.5.1"
97100
Octavian = "0.3.28"

benchmark/AMDGPU/Project.toml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
[deps]
2+
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
3+
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
4+
OrdinaryDiffEqLowStorageRK = "b0944070-b475-4768-8dec-fb6eb410534d"
5+
TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
6+
Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb"
7+
8+
[sources]
9+
Trixi = {path = "../.."}
10+
11+
[compat]
12+
AMDGPU = "2.3"
13+
JSON = "1.4.0"
14+
OrdinaryDiffEqLowStorageRK = "1.12.0"
15+
TimerOutputs = "0.5.25"
16+
Trixi = "0.16"
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
using OrdinaryDiffEqLowStorageRK
2+
using Trixi
3+
4+
###############################################################################
5+
# semidiscretization of the compressible Euler equations
6+
7+
equations = CompressibleEulerEquations3D(1.4)
8+
9+
function initial_condition_taylor_green_vortex(x, t,
10+
equations::CompressibleEulerEquations3D)
11+
A = 1.0 # magnitude of speed
12+
Ms = 0.1 # maximum Mach number
13+
14+
rho = 1.0
15+
v1 = A * sin(x[1]) * cos(x[2]) * cos(x[3])
16+
v2 = -A * cos(x[1]) * sin(x[2]) * cos(x[3])
17+
v3 = 0.0
18+
p = (A / Ms)^2 * rho / equations.gamma # scaling to get Ms
19+
p = p +
20+
1.0 / 16.0 * A^2 * rho *
21+
(cos(2 * x[1]) * cos(2 * x[3]) +
22+
2 * cos(2 * x[2]) + 2 * cos(2 * x[1]) + cos(2 * x[2]) * cos(2 * x[3]))
23+
24+
return prim2cons(SVector(rho, v1, v2, v3, p), equations)
25+
end
26+
27+
initial_condition = initial_condition_taylor_green_vortex
28+
29+
volume_flux = flux_ranocha
30+
surface_flux = flux_lax_friedrichs
31+
volume_integral = VolumeIntegralFluxDifferencing(volume_flux)
32+
solver = DGSEM(polydeg = 5, surface_flux = surface_flux, volume_integral = volume_integral)
33+
34+
coordinates_min = (-1.0, -1.0, -1.0) .* pi
35+
coordinates_max = (1.0, 1.0, 1.0) .* pi
36+
37+
initial_refinement_level = 1
38+
trees_per_dimension = (4, 4, 4)
39+
40+
mesh = P4estMesh(trees_per_dimension, polydeg = 1,
41+
coordinates_min = coordinates_min, coordinates_max = coordinates_max,
42+
periodicity = true, initial_refinement_level = initial_refinement_level)
43+
44+
semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver;
45+
boundary_conditions = boundary_condition_periodic)
46+
47+
###############################################################################
48+
# ODE solvers, callbacks etc.
49+
50+
tspan = (0.0, 100.0)
51+
ode = semidiscretize(semi, tspan; storage_type = nothing, real_type = nothing)
52+
53+
summary_callback = SummaryCallback()
54+
55+
stepsize_callback = StepsizeCallback(cfl = 0.1)
56+
57+
callbacks = CallbackSet(summary_callback,
58+
stepsize_callback)
59+
60+
###############################################################################
61+
# run the simulation
62+
63+
maxiters = 200
64+
65+
# disable warnings when maxiters is reached
66+
integrator = init(ode, CarpenterKennedy2N54(williamson_condition = false),
67+
dt = 1.0,
68+
save_everystep = false, callback = callbacks,
69+
maxiters = maxiters, verbose = false)
70+
71+
solve!(integrator)

benchmark/AMDGPU/run.jl

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
using Trixi
2+
using AMDGPU
3+
using TimerOutputs
4+
using JSON
5+
6+
function main(elixir_path)
7+
8+
# setup
9+
maxiters = 50
10+
initial_refinement_level = 3
11+
storage_type = ROCArray
12+
real_type = Float64
13+
14+
println("Warming up...")
15+
16+
# start simulation with tiny final time to trigger compilation
17+
duration_compile = @elapsed begin
18+
trixi_include(elixir_path,
19+
tspan = (0.0, 1e-14),
20+
storage_type = storage_type,
21+
real_type = real_type)
22+
end
23+
24+
println("Finished warm-up in $duration_compile seconds\n")
25+
println("Starting simulation...")
26+
27+
# start the real simulation
28+
duration_elixir = @elapsed trixi_include(elixir_path,
29+
maxiters = maxiters,
30+
initial_refinement_level = initial_refinement_level,
31+
storage_type = storage_type,
32+
real_type = real_type)
33+
34+
# store metrics (on every rank!)
35+
metrics = Dict{String, Float64}("elapsed time" => duration_elixir)
36+
37+
# read TimerOutputs timings
38+
timer = Trixi.timer()
39+
metrics["total time"] = 1.0e-9 * TimerOutputs.tottime(timer)
40+
metrics["rhs! time"] = 1.0e-9 * TimerOutputs.time(timer["rhs!"])
41+
42+
# compute performance index
43+
latest_semi = @invokelatest (@__MODULE__).semi
44+
nrhscalls = Trixi.ncalls(latest_semi.performance_counter)
45+
walltime = 1.0e-9 * take!(latest_semi.performance_counter)
46+
metrics["PID"] = walltime * Trixi.mpi_nranks() /
47+
(Trixi.ndofsglobal(latest_semi) * nrhscalls)
48+
49+
# write json file
50+
open("metrics.out", "w") do f
51+
indent = 2
52+
JSON.print(f, metrics, indent)
53+
end
54+
end
55+
56+
# hardcoded elixir
57+
elixir_path = joinpath(@__DIR__(), "elixir_euler_taylor_green_vortex.jl")
58+
59+
main(elixir_path)

ext/TrixiAMDGPUExt.jl

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Package extension for adding AMDGPU-based features to Trixi.jl
2+
module TrixiAMDGPUExt
3+
4+
using AMDGPU: AMDGPU, ROCArray, ROCDeviceArray
5+
import AMDGPU.Device: @device_override
6+
import AMDGPU.Runtime: Adaptor
7+
import Trixi
8+
9+
function Trixi.storage_type(::Type{<:ROCArray})
10+
return ROCArray
11+
end
12+
13+
function Trixi.unsafe_wrap_or_alloc(::Adaptor, vec, size)
14+
return Trixi.unsafe_wrap_or_alloc(ROCDeviceArray, vec, size)
15+
end
16+
17+
function Trixi.unsafe_wrap_or_alloc(::Type{<:ROCDeviceArray}, vec::ROCDeviceArray, size)
18+
return reshape(vec, size)
19+
end
20+
21+
@static if Trixi._PREFERENCE_LOG == "log_Trixi_NaN"
22+
@device_override Trixi.log(x::Float64) = ccall("extern __ocml_log_f64", llvmcall,
23+
Cdouble,
24+
(Cdouble,), x)
25+
@device_override Trixi.log(x::Float32) = ccall("extern __ocml_log_f32", llvmcall,
26+
Cfloat,
27+
(Cfloat,), x)
28+
# TODO: Trixi.log(x::Float16)
29+
end
30+
31+
end

test/Project.toml

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
[deps]
2-
Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
32
ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
3+
Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
44
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
5+
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
56
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
67
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
78
CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
@@ -42,12 +43,13 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
4243
TrixiTest = "0a316866-cbd0-4425-8bcb-08103b2c1f26"
4344

4445
[compat]
45-
Accessors = "0.1.36"
46-
ADTypes = "1.14"
47-
Adapt = "4.1"
46+
Accessors = "0.1.42"
47+
ADTypes = "1.16"
48+
AMDGPU = "2.2.1"
49+
Adapt = "4.4"
4850
Aqua = "0.8"
49-
CUDA = "5.8.2"
50-
CairoMakie = "0.12, 0.13, 0.14, 0.15"
51+
CUDA = "5.9.1"
52+
CairoMakie = "0.13, 0.14, 0.15"
5153
Convex = "0.16"
5254
DelimitedFiles = "1"
5355
DoubleFloats = "1.4.0"
@@ -58,8 +60,8 @@ FiniteDiff = "2.27.0"
5860
ForwardDiff = "0.10.36, 1"
5961
Krylov = "0.10"
6062
LinearAlgebra = "1"
61-
LinearSolve = "3.13"
62-
MPI = "0.20.22"
63+
LinearSolve = "3.54"
64+
MPI = "0.20.23"
6365
NLsolve = "4.5.1"
6466
OrdinaryDiffEqBDF = "1.1"
6567
OrdinaryDiffEqCore = "1.26, 2, 3"

test/runtests.jl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,16 @@ const TRIXI_NTHREADS = clamp(Sys.CPU_THREADS, 2, 3)
130130
end
131131
end
132132

133+
@time if TRIXI_TEST == "all" || TRIXI_TEST == "AMDGPU"
134+
import AMDGPU
135+
if AMDGPU.functional()
136+
include(joinpath(@__DIR__, "test_amdgpu_2d.jl"))
137+
include(joinpath(@__DIR__, "test_amdgpu_3d.jl"))
138+
else
139+
@warn "Unable to run AMDGPU tests on this machine"
140+
end
141+
end
142+
133143
@time if TRIXI_TEST == "all" || TRIXI_TEST == "kernelabstractions"
134144
previous_backend = Trixi._PREFERENCE_THREADING
135145
Trixi.set_threading_backend!(:kernelabstractions)

test/test_amdgpu_2d.jl

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
module TestAMDGPU2D
2+
3+
using Test
4+
using Trixi
5+
6+
include("test_trixi.jl")
7+
8+
EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem")
9+
10+
# Start with a clean environment: remove Trixi.jl output directory if it exists
11+
outdir = "out"
12+
isdir(outdir) && rm(outdir, recursive = true)
13+
14+
@testset "AMDGPU 2D" begin
15+
#! format: noindent
16+
17+
@trixi_testset "elixir_advection_basic_gpu.jl native" begin
18+
@test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
19+
# Expected errors are exactly the same as with TreeMesh!
20+
l2=8.311947673061856e-6,
21+
linf=6.627000273229378e-5)
22+
# Ensure that we do not have excessive memory allocations
23+
# (e.g., from type instabilities)
24+
semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem.
25+
@test_allocations(Trixi.rhs!, semi, sol, 1000)
26+
@test real(ode.p.solver) == Float64
27+
@test real(ode.p.solver.basis) == Float64
28+
@test real(ode.p.solver.mortar) == Float64
29+
# TODO: `mesh` is currently not `adapt`ed correctly
30+
@test real(ode.p.mesh) == Float64
31+
32+
@test ode.u0 isa Array
33+
@test ode.p.solver.basis.derivative_matrix isa Array
34+
35+
@test Trixi.storage_type(ode.p.cache.elements) === Array
36+
@test Trixi.storage_type(ode.p.cache.interfaces) === Array
37+
@test Trixi.storage_type(ode.p.cache.boundaries) === Array
38+
@test Trixi.storage_type(ode.p.cache.mortars) === Array
39+
end
40+
41+
@trixi_testset "elixir_advection_basic_gpu.jl Float32 / AMDGPU" begin
42+
# Using AMDGPU inside the testset since otherwise the bindings are hidden by the anonymous modules
43+
using AMDGPU
44+
@test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
45+
# Expected errors are exactly the same as with TreeMesh!
46+
l2=[Float32(8.311947673061856e-6)],
47+
linf=[Float32(6.627000273229378e-5)],
48+
RealT_for_test_tolerances=Float32,
49+
real_type=Float32,
50+
storage_type=ROCArray)
51+
# Ensure that we do not have excessive memory allocations
52+
# (e.g., from type instabilities)
53+
semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem.
54+
@test_allocations(Trixi.rhs!, semi, sol, 50_000)
55+
@test real(ode.p.solver) == Float32
56+
@test real(ode.p.solver.basis) == Float32
57+
@test real(ode.p.solver.mortar) == Float32
58+
# TODO: `mesh` is currently not `adapt`ed correctly
59+
@test real(ode.p.mesh) == Float64
60+
61+
@test ode.u0 isa ROCArray
62+
@test ode.p.solver.basis.derivative_matrix isa ROCArray
63+
64+
@test Trixi.storage_type(ode.p.cache.elements) === ROCArray
65+
@test Trixi.storage_type(ode.p.cache.interfaces) === ROCArray
66+
@test Trixi.storage_type(ode.p.cache.boundaries) === ROCArray
67+
@test Trixi.storage_type(ode.p.cache.mortars) === ROCArray
68+
end
69+
70+
# Clean up afterwards: delete Trixi.jl output directory
71+
@test_nowarn isdir(outdir) && rm(outdir, recursive = true)
72+
end
73+
end # module

0 commit comments

Comments
 (0)