|
| 1 | +using PrecompileTools: @compile_workload |
| 2 | + |
# The workload below compiles a GCN kernel, which requires that the AMDGPU LLVM
# backend can be initialized. It is therefore gated on that backend being
# present, so the package still loads on systems where it is not available.
#
# Like CUDA.jl's precompile workload, the goal is to warm up the GPUCompiler ->
# AMDGPU codegen pipeline at precompile time, so that the first kernel launch at
# runtime does not have to JIT-compile the entire compiler. Only LLVM is used:
# no GPU is needed, and the ROCm runtime does not even have to be discovered.
if :AMDGPU in LLVM.backends()
    @compile_workload begin
        let
            # Minimal kernel: each work item increments one array element.
            function _precompile_kernel(a)
                idx = workitemIdx().x
                @inbounds a[idx] += 1.0f0
                return
            end

            # Device-free compiler configuration for a baseline GCN target.
            # `gfx1030` (RDNA2, wavefront 32) is a portable baseline that
            # exercises the full pipeline; the cached *code* is reused whatever
            # the ISA of the actual device is at runtime (only the kernel binary
            # differs).
            #
            # NOTE: pick an RDNA/CDNA ISA, never a pre-RDNA one. The
            # `wavefrontsize*` LLVM features exist only on gfx10+, so combining
            # them with e.g. gfx900 (GCN3) gives an inconsistent target that
            # miscompiles the wavefront-sensitive exception/bounds-error path
            # (seen under `--check-bounds=yes`/`--code-coverage`, where
            # `@inbounds` is ignored).
            gcn_target = GPUCompiler.GCNCompilerTarget(;
                dev_isa = "gfx1030",
                features = "+wavefrontsize32,-wavefrontsize64",
            )
            hip_params = Compiler.HIPCompilerParams(false, true)
            cfg = GPUCompiler.CompilerConfig(
                gcn_target, hip_params;
                kernel = true, name = nothing, always_inline = true,
            )

            argtypes = Tuple{ROCDeviceArray{Float32, 1, AS.Global}}
            mi = GPUCompiler.methodinstance(typeof(_precompile_kernel), argtypes)
            job = GPUCompiler.CompilerJob(mi, cfg)

            # With `--check-bounds=yes` (used by `Pkg.test`) or `--code-coverage`,
            # `@inbounds` is ignored and `a[idx]` emits a bounds-error path
            # (`throw_boundserror` -> `signal_exception` -> `kernel_state()`).
            # That path compiles fine at *runtime* but NOT during precompilation:
            # the `@generated kernel_state()` fails to inline there, leaving a
            # dynamic call -> invalid GPU IR. These flags only occur during
            # testing, never in normal user precompilation (where `@inbounds`
            # elides the path), so the warming compile is skipped in that case --
            # users still get the full benefit.
            opts = Base.JLOptions()
            uninstrumented = opts.code_coverage == 0 && opts.check_bounds != 1

            # On Julia < 1.12, GPU compilation during precompilation leaks
            # foreign MIs into native compilation, causing LLVM errors. Guard
            # the same way CUDA.jl does.
            @static if VERSION >= v"1.12-"
                if uninstrumented
                    GPUCompiler.JuliaContext() do _
                        GPUCompiler.compile(:obj, job)
                    end

                    # The compile above ran during precompilation, i.e. before
                    # ROCm discovery (`__init__`), so `libdevice_libs` was empty.
                    # That poisons the `DEVICE_LIBS` cache with empty entries
                    # (e.g. an `ocml` `DevLib` with no path), which would be
                    # baked into the precompile image and prevent device-library
                    # linking at runtime (`unsupported call to __ocml_*`). Clear
                    # the cache so it is repopulated correctly once discovery
                    # has run.
                    empty!(Compiler.DEVICE_LIBS)
                end
            end
        end
    end
end
| 72 | + |
# The workload above cannot reach the kernel launch infrastructure, because that
# requires a live device. Request those specializations explicitly instead
# (mirrors CUDA.jl's explicit precompile directives: `cufunction`, `link`, and
# `actual_compilation`).
precompile(Compiler.hipfunction, (typeof(identity), Type{Tuple{Nothing}}))
precompile(
    GPUCompiler.actual_compilation,
    (
        Dict{Any, HIP.HIPFunction}, Core.MethodInstance, UInt64,
        Compiler.HIPCompilerConfig,
        typeof(Compiler.hipcompile), typeof(Compiler.hiplink),
    ),
)
precompile(
    Compiler.hiplink,
    (
        Compiler.HIPCompilerJob,
        @NamedTuple{obj::Vector{UInt8}, entry::String, global_hostcalls::Vector{Symbol}},
    ),
)
| 83 | + |
# Hot entry points of the bundled ROCm libraries, mirroring CUDA.jl's
# per-library precompile directives. They compile the (GPU-free) Julia wrappers
# so that the first `A * B`, factorization, FFT plan, etc. does not pay the full
# first-use compilation cost.
let
    # Matrix type used in every signature below.
    rocmatrix(T) = ROCArray{T, 2, Mem.HIPBuffer}

    real_eltypes = (Float32, Float64)
    complex_eltypes = (ComplexF32, ComplexF64)

    # rocBLAS: handle creation, GEMM and high-level matmul.
    precompile(rocBLAS.create_handle, ())
    precompile(rocBLAS.lib_state, ())
    for T in (real_eltypes..., complex_eltypes...)
        M = rocmatrix(T)
        precompile(rocBLAS.gemm!, (Char, Char, T, M, M, T, M))
    end
    for T in real_eltypes
        M = rocmatrix(T)
        precompile(*, (M, M))
        precompile(LinearAlgebra.mul!, (M, M, M))
    end

    # rocSOLVER: common factorizations.
    for T in real_eltypes
        M = rocmatrix(T)
        precompile(rocSOLVER.getrf!, (M,))
        precompile(rocSOLVER.geqrf!, (M,))
        precompile(rocSOLVER.potrf!, (Char, M))
    end

    # rocFFT: plan creation for common types.
    for T in complex_eltypes
        precompile(rocFFT.plan_fft!, (rocmatrix(T), Int))
    end
    for T in real_eltypes
        precompile(rocFFT.plan_rfft, (rocmatrix(T), Int))
    end

    # rocRAND / random.
    precompile(rand, (Type{Float32}, Dims{2}))
    precompile(Random.rand!, (rocmatrix(Float32),))
end
0 commit comments