Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ LLVM_jll = "86de99a1-58d6-5da7-8064-bd56ce2e322c"
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Expand All @@ -32,7 +33,6 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Random123 = "74087812-796a-5b5d-8853-05524746bad3"
RandomNumbers = "e6cf234a-135c-5ec9-84dd-332b85af5143"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f"
Expand All @@ -41,11 +41,13 @@ UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"

[extensions]
AMDGPUChainRulesCoreExt = "ChainRulesCore"
AMDGPUEnzymeCoreExt = "EnzymeCore"
AMDGPUSparseMatricesCSRExt = "SparseMatricesCSR"
AMDGPUSpecialFunctionsExt = "SpecialFunctions"

[compat]
AbstractFFTs = "1.0"
Expand All @@ -64,6 +66,7 @@ KernelAbstractions = "0.9.2"
LLD_jll = "15, 16, 17, 18, 19, 20"
LLVM = "9"
LLVM_jll = "15, 16, 17, 18, 19, 20"
PrecompileTools = "1"
Preferences = "1"
PrettyTables = "3"
ROCmDeviceLibs_jll = "=5.6.1, =6.2.1, =7.0.2"
Expand Down
19 changes: 19 additions & 0 deletions ext/AMDGPUSpecialFunctionsExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
module AMDGPUSpecialFunctionsExt

# Package extension that defines device-side methods for SpecialFunctions.jl.
# Every function named in `DEFINED_SF_INTRINSICS` gets a device override that
# calls the matching OCML intrinsic. The overrides live in an extension (the
# same approach CUDA.jl takes) so that a plain `using AMDGPU` does not pay the
# load-time cost of SpecialFunctions and its dependencies.

import AMDGPU
import AMDGPU.Device: @device_override, fntypes, DEFINED_SF_INTRINSICS
import SpecialFunctions

for float_type in (Float64, Float32, Float16)
    # Per-type suffix that completes the OCML symbol name.
    suffix = fntypes[float_type]
    for (sf_name, ocml_name) in DEFINED_SF_INTRINSICS
        # Symbol handed to `ccall`; computed here so `@eval` only splices values.
        extern_symbol = "extern __ocml_$(ocml_name)_$(suffix)"
        @eval @device_override SpecialFunctions.$(sf_name)(x::$float_type) = ccall(
            $extern_symbol, llvmcall, $float_type, ($float_type,), x)
    end
end

end
2 changes: 2 additions & 0 deletions src/AMDGPU.jl
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ include("ROCKernels.jl")
import .ROCKernels: ROCBackend
export ROCBackend

include("precompile.jl")

function __init__()
# Used to shutdown hostcalls if any is running.
atexit(() -> begin Runtime.RT_EXITING[] = true end)
Expand Down
11 changes: 4 additions & 7 deletions src/device/gcn/math.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import Base: FastMath
import SpecialFunctions

const DEFINED_UNARY_INTRNISICS = [
(:Base, :acos), (:Base, :acosh), (nothing, :acospi), (:Base, :cos), (:Base, :cosh), (:Base, :cospi),
Expand All @@ -11,7 +10,10 @@ const DEFINED_UNARY_INTRNISICS = [
(:Base, :floor), (:Base, :ceil), (:Base, :trunc),
(nothing, :nearbyint), (nothing, :nextafter),
]
# SpecialFunctions (SF.fname, OCML intrinsic).
# SpecialFunctions (SF.fname, OCML intrinsic). The device overrides themselves
# live in `AMDGPUSpecialFunctionsExt`; this list stays here (it is just symbol
# pairs, with no SpecialFunctions dependency) as the single source of truth used
# both by the extension and by the test suite.
const DEFINED_SF_INTRINSICS = [
(:loggamma, :lgamma), (:gamma, :tgamma),
(:bessely0, :y0), (:bessely1, :y1), (:besselj0, :j0), (:besselj1, :j1),
Expand All @@ -36,11 +38,6 @@ for jltype in (Float64, Float32, Float16)
end
end

for (fname, intrinsic) in DEFINED_SF_INTRINSICS
@eval @device_override SpecialFunctions.$(fname)(x::$jltype) = ccall(
$("extern __ocml_$(intrinsic)_$(type_suffix)"), llvmcall, $jltype, ($jltype,), x)
end

@eval @device_override Base.abs(x::$jltype) = ccall(
$("extern __ocml_fabs_$(type_suffix)"), llvmcall, $jltype, ($jltype,), x)

Expand Down
117 changes: 117 additions & 0 deletions src/precompile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
using PrecompileTools: @compile_workload

# Precompile workload: push one trivial kernel through GPUCompiler so that the
# GPUCompiler -> AMDGPU codegen pipeline is already compiled when the first
# kernel is launched at runtime, instead of JIT-compiling the whole compiler
# then. This follows CUDA.jl's precompile workload.
#
# Only LLVM is used here -- neither a GPU nor a discovered ROCm runtime is
# required. The AMDGPU LLVM backend does have to be available, so the workload
# is skipped when it is not; that keeps the package loadable on such systems.
if :AMDGPU in LLVM.backends()
    @compile_workload begin
        let
            function _precompile_kernel(a)
                idx = workitemIdx().x
                @inbounds a[idx] += 1.0f0
                return
            end

            # Device-free compiler configuration for a baseline GCN target.
            # `gfx1030` (RDNA2, wavefront 32) is a portable baseline that
            # exercises the full pipeline; the cached *code* is reused whatever
            # ISA the actual device has at runtime (only the kernel binary
            # differs).
            #
            # NOTE: the ISA has to be RDNA/CDNA rather than pre-RDNA. The
            # `wavefrontsize*` LLVM features exist only on gfx10+, so combining
            # them with e.g. gfx900 (GCN3) gives an inconsistent target that
            # miscompiles the wavefront-sensitive exception/bounds-error path
            # (seen under `--check-bounds=yes`/`--code-coverage`, where
            # `@inbounds` is ignored).
            gcn_target = GPUCompiler.GCNCompilerTarget(;
                dev_isa="gfx1030", features="+wavefrontsize32,-wavefrontsize64")
            hip_params = Compiler.HIPCompilerParams(false, true)
            compiler_config = GPUCompiler.CompilerConfig(gcn_target, hip_params;
                kernel=true, name=nothing, always_inline=true)

            arg_types = Tuple{ROCDeviceArray{Float32, 1, AS.Global}}
            kernel_instance = GPUCompiler.methodinstance(
                typeof(_precompile_kernel), arg_types)
            compile_job = GPUCompiler.CompilerJob(kernel_instance, compiler_config)

            # With `--check-bounds=yes` (used by `Pkg.test`) or `--code-coverage`,
            # `@inbounds` is ignored and `a[idx]` emits a bounds-error path
            # (`throw_boundserror` -> `signal_exception` -> `kernel_state()`).
            # That path compiles at *runtime* but NOT during precompilation: the
            # `@generated kernel_state()` fails to inline there, leaving a
            # dynamic call and therefore invalid GPU IR. These flags only occur
            # while testing, never in normal user precompilation (where
            # `@inbounds` removes the path), so the warming compile is skipped
            # in that case and users still get the full benefit.
            coverage_enabled = Base.JLOptions().code_coverage != 0
            bounds_forced = Base.JLOptions().check_bounds == 1
            skip_compile = coverage_enabled || bounds_forced

            # On Julia < 1.12, GPU compilation during precompilation leaks
            # foreign MIs into native compilation and causes LLVM errors, hence
            # the version guard (as in CUDA.jl).
            @static if VERSION >= v"1.12-"
                if !skip_compile
                    GPUCompiler.JuliaContext() do _
                        GPUCompiler.compile(:obj, compile_job)
                    end

                    # The compile above happens during precompilation, before
                    # ROCm discovery (`__init__`) has run, so `libdevice_libs`
                    # is empty. That fills the `DEVICE_LIBS` cache with empty
                    # entries (e.g. an `ocml` `DevLib` without a path), which
                    # would be baked into the precompile image and block
                    # device-library linking at runtime
                    # (`unsupported call to __ocml_*`). Clear the cache so it is
                    # repopulated correctly after discovery.
                    empty!(Compiler.DEVICE_LIBS)
                end
            end
        end
    end
end

# Explicit precompile directives for the kernel-launch infrastructure. The
# compile workload cannot reach these because they need a live device; the set
# mirrors CUDA.jl's directives for `cufunction`, `link`, and
# `actual_compilation`.
let
    # Argument types spelled out once so the directives below stay readable.
    function_cache = Dict{Any, HIP.HIPFunction}
    link_input = NamedTuple{
        (:obj, :entry, :global_hostcalls),
        Tuple{Vector{UInt8}, String, Vector{Symbol}}}

    precompile(Compiler.hipfunction, (typeof(identity), Type{Tuple{Nothing}}))
    precompile(GPUCompiler.actual_compilation, (
        function_cache, Core.MethodInstance, UInt64, Compiler.HIPCompilerConfig,
        typeof(Compiler.hipcompile), typeof(Compiler.hiplink)))
    precompile(Compiler.hiplink, (Compiler.HIPCompilerJob, link_input))
end

# Precompile directives for frequently used entry points of the bundled ROCm
# libraries, following CUDA.jl's per-library directives. Only the (GPU-free)
# Julia wrappers are compiled, so the first `A * B`, factorization, FFT plan,
# etc. does not pay the full first-use compilation cost.
let
    # Matrix type the wrappers are specialized on, for element type `T`.
    rocmatrix(T) = ROCArray{T, 2, Mem.HIPBuffer}

    # rocBLAS: handle creation, GEMM and high-level matmul.
    precompile(rocBLAS.create_handle, ())
    precompile(rocBLAS.lib_state, ())
    for T in (Float32, Float64, ComplexF32, ComplexF64)
        M = rocmatrix(T)
        precompile(rocBLAS.gemm!, (Char, Char, T, M, M, T, M))
    end
    for T in (Float32, Float64)
        M = rocmatrix(T)
        precompile(*, (M, M))
        precompile(LinearAlgebra.mul!, (M, M, M))
    end

    # rocSOLVER: common factorizations.
    for T in (Float32, Float64)
        M = rocmatrix(T)
        precompile(rocSOLVER.getrf!, (M,))
        precompile(rocSOLVER.geqrf!, (M,))
        precompile(rocSOLVER.potrf!, (Char, M))
    end

    # rocFFT: plan creation for common types.
    for T in (ComplexF32, ComplexF64)
        precompile(rocFFT.plan_fft!, (rocmatrix(T), Int))
    end
    for T in (Float32, Float64)
        precompile(rocFFT.plan_rfft, (rocmatrix(T), Int))
    end

    # rocRAND / random.
    precompile(rand, (Type{Float32}, Dims{2}))
    precompile(Random.rand!, (rocmatrix(Float32),))
end