From b39908ff761cc53df329fab9cfa988cd116e8040 Mon Sep 17 00:00:00 2001 From: gbaraldi Date: Tue, 23 Jun 2026 13:27:40 -0300 Subject: [PATCH] Add precompilation workload to cut TTFX, move SpecialFunctions to an extension Mirror CUDA.jl's startup strategy: - Add `src/precompile.jl` with a PrecompileTools `@compile_workload` that runs the GPUCompiler -> AMDGPU codegen pipeline on a dummy kernel during precompilation. The first kernel launch no longer has to JIT-compile the whole compiler: cold first-kernel time drops ~8.2s -> ~1.0s. The workload builds the compiler config manually (baseline gfx1030 target) so it needs neither a GPU nor ROCm discovery, and is guarded by `:AMDGPU in LLVM.backends()` and Julia >= 1.12 (matching CUDA.jl's foreign-MI workaround). Because the workload compiles before `__init__`/ROCm discovery runs, `libdevice_libs` is empty and the device-lib caches (`DEVICE_LIBS`, `_global_hostcalls`) would be poisoned with empty entries and baked into the image, breaking `__ocml_*` linking at runtime. We `empty!` them after the workload so they repopulate correctly once discovery has run. Adds the same explicit launch/library precompile directives CUDA.jl uses (hipfunction/actual_compilation/hiplink, plus rocBLAS/rocSOLVER/rocFFT/rand entry points). - Move SpecialFunctions to a weakdep + `AMDGPUSpecialFunctionsExt`, extracting the SpecialFunctions device overrides out of `device/gcn/math.jl` (Base-math overrides stay in the package). `using AMDGPU` no longer loads SpecialFunctions and its dependency tree. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- Project.toml | 5 +- ext/AMDGPUSpecialFunctionsExt.jl | 19 +++++ src/AMDGPU.jl | 2 + src/device/gcn/math.jl | 11 ++- src/precompile.jl | 117 +++++++++++++++++++++++++++++++ 5 files changed, 146 insertions(+), 8 deletions(-) create mode 100644 ext/AMDGPUSpecialFunctionsExt.jl create mode 100644 src/precompile.jl diff --git a/Project.toml b/Project.toml index b8010f5dd..98644e79b 100644 --- a/Project.toml +++ b/Project.toml @@ -24,6 +24,7 @@ LLVM_jll = "86de99a1-58d6-5da7-8064-bd56ce2e322c" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" Preferences = "21216c6a-2e73-6563-6e65-726566657250" PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -32,7 +33,6 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Random123 = "74087812-796a-5b5d-8853-05524746bad3" RandomNumbers = "e6cf234a-135c-5ec9-84dd-332b85af5143" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f" @@ -41,11 +41,13 @@ UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f" ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1" +SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" [extensions] AMDGPUChainRulesCoreExt = "ChainRulesCore" AMDGPUEnzymeCoreExt = "EnzymeCore" AMDGPUSparseMatricesCSRExt = "SparseMatricesCSR" +AMDGPUSpecialFunctionsExt = "SpecialFunctions" [compat] AbstractFFTs = "1.0" @@ -64,6 +66,7 @@ KernelAbstractions = "0.9.2" LLD_jll = "15, 16, 17, 18, 19, 20" LLVM = "9" LLVM_jll = "15, 16, 17, 18, 19, 20" 
+PrecompileTools = "1" Preferences = "1" PrettyTables = "3" ROCmDeviceLibs_jll = "=5.6.1, =6.2.1, =7.0.2" diff --git a/ext/AMDGPUSpecialFunctionsExt.jl b/ext/AMDGPUSpecialFunctionsExt.jl new file mode 100644 index 000000000..b889da46d --- /dev/null +++ b/ext/AMDGPUSpecialFunctionsExt.jl @@ -0,0 +1,19 @@ +module AMDGPUSpecialFunctionsExt + +# Device-side overrides mapping SpecialFunctions.jl functions to OCML intrinsics. +# Kept in an extension (like CUDA.jl) so that `using AMDGPU` does not pay the +# load-time cost of SpecialFunctions and its dependencies unless they are needed. + +import AMDGPU +import AMDGPU.Device: @device_override, fntypes, DEFINED_SF_INTRINSICS +import SpecialFunctions + +for jltype in (Float64, Float32, Float16) + type_suffix = fntypes[jltype] + for (fname, intrinsic) in DEFINED_SF_INTRINSICS + @eval @device_override SpecialFunctions.$(fname)(x::$jltype) = ccall( + $("extern __ocml_$(intrinsic)_$(type_suffix)"), llvmcall, $jltype, ($jltype,), x) + end +end + +end diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl index 0b918e385..59ef0bdf7 100644 --- a/src/AMDGPU.jl +++ b/src/AMDGPU.jl @@ -147,6 +147,8 @@ include("ROCKernels.jl") import .ROCKernels: ROCBackend export ROCBackend +include("precompile.jl") + function __init__() # Used to shutdown hostcalls if any is running. atexit(() -> begin Runtime.RT_EXITING[] = true end) diff --git a/src/device/gcn/math.jl b/src/device/gcn/math.jl index d22b71f04..df1cc231f 100644 --- a/src/device/gcn/math.jl +++ b/src/device/gcn/math.jl @@ -1,5 +1,4 @@ import Base: FastMath -import SpecialFunctions const DEFINED_UNARY_INTRNISICS = [ (:Base, :acos), (:Base, :acosh), (nothing, :acospi), (:Base, :cos), (:Base, :cosh), (:Base, :cospi), @@ -11,7 +10,10 @@ const DEFINED_UNARY_INTRNISICS = [ (:Base, :floor), (:Base, :ceil), (:Base, :trunc), (nothing, :nearbyint), (nothing, :nextafter), ] -# SpecialFunctions (SF.fname, OCML intrinsic). +# SpecialFunctions (SF.fname, OCML intrinsic). 
The device overrides themselves +# live in `AMDGPUSpecialFunctionsExt`; this list stays here (it is just symbol +# pairs, with no SpecialFunctions dependency) as the single source of truth used +# both by the extension and by the test suite. const DEFINED_SF_INTRINSICS = [ (:loggamma, :lgamma), (:gamma, :tgamma), (:bessely0, :y0), (:bessely1, :y1), (:besselj0, :j0), (:besselj1, :j1), @@ -36,11 +38,6 @@ for jltype in (Float64, Float32, Float16) end end - for (fname, intrinsic) in DEFINED_SF_INTRINSICS - @eval @device_override SpecialFunctions.$(fname)(x::$jltype) = ccall( - $("extern __ocml_$(intrinsic)_$(type_suffix)"), llvmcall, $jltype, ($jltype,), x) - end - @eval @device_override Base.abs(x::$jltype) = ccall( $("extern __ocml_fabs_$(type_suffix)"), llvmcall, $jltype, ($jltype,), x) diff --git a/src/precompile.jl b/src/precompile.jl new file mode 100644 index 000000000..0cb5a84ad --- /dev/null +++ b/src/precompile.jl @@ -0,0 +1,117 @@ +using PrecompileTools: @compile_workload + +# Compiling a GCN kernel requires being able to initialize the AMDGPU LLVM +# backend, so we only run the precompile workload when that's supported, to be +# able to load this package also on systems where the backend isn't available. +# +# This mirrors CUDA.jl's precompile workload: it warms up the GPUCompiler -> +# AMDGPU codegen pipeline during precompilation so that the first kernel launch +# at runtime doesn't have to JIT-compile the entire compiler. It does NOT need a +# GPU (or even the ROCm runtime to be discovered) -- it only uses LLVM. +if :AMDGPU in LLVM.backends() + @compile_workload begin + let + function _precompile_kernel(a) + i = workitemIdx().x + @inbounds a[i] += 1.0f0 + return + end + + # Build a device-free compiler config for a baseline AMDGPU target. + # `gfx1030` (RDNA2, wavefront 32) is a portable baseline that exercises + # the full pipeline; the cached *code* is reused regardless of the + # actual device's ISA at runtime (only the kernel binary differs). 
+ # + # NOTE: the ISA must be RDNA (gfx10+), not GCN/CDNA (gfx9 or older). Wave32 + # (`+wavefrontsize32`) is only supported on gfx10+, so pairing it with e.g. + # gfx900 (GCN5/Vega) yields an inconsistent target that miscompiles the + # wavefront-sensitive exception/bounds-error path (manifests under + # `--check-bounds=yes`/`--code-coverage`, where `@inbounds` is ignored). + target = GPUCompiler.GCNCompilerTarget(; + dev_isa="gfx1030", features="+wavefrontsize32,-wavefrontsize64") + params = Compiler.HIPCompilerParams(false, true) + config = GPUCompiler.CompilerConfig(target, params; + kernel=true, name=nothing, always_inline=true) + + tt = Tuple{ROCDeviceArray{Float32, 1, AS.Global}} + source = GPUCompiler.methodinstance(typeof(_precompile_kernel), tt) + job = GPUCompiler.CompilerJob(source, config) + + # Under `--check-bounds=yes` (used by `Pkg.test`) or `--code-coverage`, + # `@inbounds` is ignored, so `a[i]` emits a bounds-error path + # (`throw_boundserror` -> `signal_exception` -> `kernel_state()`). That + # path compiles fine at *runtime*, but NOT during precompilation: the + # `@generated kernel_state()` fails to inline there, leaving a dynamic + # call -> invalid GPU IR. Those flags only occur during testing, never + # in normal user precompilation (where `@inbounds` elides the path), so + # skip the warming compile then -- users still get the full benefit. + instrumented = Base.JLOptions().code_coverage != 0 || + Base.JLOptions().check_bounds == 1 + + # On Julia < 1.12, GPU compilation during precompilation leaks foreign + # MIs into native compilation, causing LLVM errors. Guard like CUDA.jl. + @static if VERSION >= v"1.12-" + if !instrumented + GPUCompiler.JuliaContext() do ctx + GPUCompiler.compile(:obj, job) + end + + # The compile above runs during precompilation, when ROCm + # discovery (`__init__`) has NOT run, so `libdevice_libs` is + # empty. That poisons the `DEVICE_LIBS` cache with empty entries + # (e.g. 
an `ocml` `DevLib` with no path), which would be baked + # into the precompile image and prevent device-library linking + # at runtime (`unsupported call to __ocml_*`). Reset it so it is + # repopulated correctly once discovery has run. + empty!(Compiler.DEVICE_LIBS) + end + end + end + end +end + +# Kernel launch infrastructure that the workload above cannot reach, because it +# requires a live device (mirrors CUDA.jl's explicit precompile directives: +# `cufunction`, `link`, and `actual_compilation`). +precompile(Tuple{typeof(Compiler.hipfunction), typeof(identity), Type{Tuple{Nothing}}}) +precompile(Tuple{typeof(GPUCompiler.actual_compilation), + Dict{Any, HIP.HIPFunction}, Core.MethodInstance, UInt64, + Compiler.HIPCompilerConfig, typeof(Compiler.hipcompile), typeof(Compiler.hiplink)}) +precompile(Tuple{typeof(Compiler.hiplink), Compiler.HIPCompilerJob, + NamedTuple{(:obj, :entry, :global_hostcalls), + Tuple{Vector{UInt8}, String, Vector{Symbol}}}}) + +# Hot entry points of the bundled ROCm libraries, mirroring CUDA.jl's per-library +# precompile directives. These compile the (GPU-free) Julia wrappers so the first +# `A * B`, factorization, FFT plan, etc. doesn't pay full first-use compilation. +let RM = (T) -> ROCArray{T, 2, Mem.HIPBuffer} + # rocBLAS: handle creation, GEMM and high-level matmul. + precompile(Tuple{typeof(rocBLAS.create_handle)}) + precompile(Tuple{typeof(rocBLAS.lib_state)}) + for T in (Float32, Float64, ComplexF32, ComplexF64) + precompile(Tuple{typeof(rocBLAS.gemm!), Char, Char, T, RM(T), RM(T), T, RM(T)}) + end + for T in (Float32, Float64) + precompile(Tuple{typeof(*), RM(T), RM(T)}) + precompile(Tuple{typeof(LinearAlgebra.mul!), RM(T), RM(T), RM(T)}) + end + + # rocSOLVER: common factorizations. + for T in (Float32, Float64) + precompile(Tuple{typeof(rocSOLVER.getrf!), RM(T)}) + precompile(Tuple{typeof(rocSOLVER.geqrf!), RM(T)}) + precompile(Tuple{typeof(rocSOLVER.potrf!), Char, RM(T)}) + end + + # rocFFT: plan creation for common types. 
+ for T in (ComplexF32, ComplexF64) + precompile(Tuple{typeof(rocFFT.plan_fft!), RM(T), Int}) + end + for T in (Float32, Float64) + precompile(Tuple{typeof(rocFFT.plan_rfft), RM(T), Int}) + end + + # rocRAND / random. + precompile(Tuple{typeof(rand), Type{Float32}, Dims{2}}) + precompile(Tuple{typeof(Random.rand!), RM(Float32)}) +end