diff --git a/Project.toml b/Project.toml index 9afe3dc..3596254 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.15.2" [deps] MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" [weakdeps] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" @@ -22,6 +23,7 @@ CUDA = "1, ~3.1, ~3.2, ~3.3, ~3.7.1, ~3.8, ~3.9, ~3.10, ~3.11, ~3.12, ~3.13, 4, MPI = "0.20" Polyester = "0.7" julia = "1.9" +oneAPI = "1.6.1" [extras] CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9" diff --git a/ext/ImplicitGlobalGrid_ONEAPIExt.jl b/ext/ImplicitGlobalGrid_ONEAPIExt.jl new file mode 100644 index 0000000..7eb09d6 --- /dev/null +++ b/ext/ImplicitGlobalGrid_ONEAPIExt.jl @@ -0,0 +1,5 @@ +module ImplicitGlobalGrid_ONEAPIExt + include(joinpath(@__DIR__, "..", "src", "ONEAPIExt", "shared.jl")) + include(joinpath(@__DIR__, "..", "src", "ONEAPIExt", "select_device.jl")) + include(joinpath(@__DIR__, "..", "src", "ONEAPIExt", "update_halo.jl")) +end diff --git a/src/ImplicitGlobalGrid.jl b/src/ImplicitGlobalGrid.jl index f86e18b..d2979ac 100644 --- a/src/ImplicitGlobalGrid.jl +++ b/src/ImplicitGlobalGrid.jl @@ -48,6 +48,7 @@ include("shared.jl") include("defaults_shared.jl") include(joinpath("AMDGPUExt", "defaults.jl")) include(joinpath("CUDAExt", "defaults.jl")) +include(joinpath("ONEAPIExt", "defaults.jl")) include(joinpath("PolyesterExt", "memcopy_polyester_default.jl")) ## Alphabetical include of files diff --git a/src/ONEAPIExt/defaults.jl b/src/ONEAPIExt/defaults.jl new file mode 100644 index 0000000..cbb3560 --- /dev/null +++ b/src/ONEAPIExt/defaults.jl @@ -0,0 +1,22 @@ +# shared.jl + +is_onearray(A::GGArray) = false # Fallback for any GGArray; the oneAPI extension adds a method returning true for oneArray. + + +# select_device.jl + +function nb_oneapidevices end +function oneapidevice! 
end + + +# update_halo.jl + +function free_update_halo_onebuffers end # Methods for the functions declared in this file are added by the oneAPI extension (src/ONEAPIExt/*.jl). +function init_onebufs_arrays end +function init_onebufs end +function reinterpret_onebufs end +function reallocate_undersized_onebufs end +function reregister_onebufs end +function get_onesendbufs_raw end +function get_onerecvbufs_raw end +function allocate_onestreams end diff --git a/src/ONEAPIExt/select_device.jl b/src/ONEAPIExt/select_device.jl new file mode 100644 index 0000000..e381b2c --- /dev/null +++ b/src/ONEAPIExt/select_device.jl @@ -0,0 +1,2 @@ +ImplicitGlobalGrid.nb_oneapidevices() = length(oneAPI.devices()) +ImplicitGlobalGrid.oneapidevice!(device_id) = oneAPI.device!(device_id) diff --git a/src/ONEAPIExt/shared.jl b/src/ONEAPIExt/shared.jl new file mode 100644 index 0000000..47c12a2 --- /dev/null +++ b/src/ONEAPIExt/shared.jl @@ -0,0 +1,45 @@ +import ImplicitGlobalGrid +import ImplicitGlobalGrid: GGArray, GGField, GGNumber, halosize, ol, oneapiaware_MPI, sendranges, recvranges, sendbuf_flat, recvbuf_flat, write_d2x!, read_x2d!, write_d2h_async!, read_h2d_async!, register, is_onearray +import ImplicitGlobalGrid: NNEIGHBORS_PER_DIM, GG_ALLOC_GRANULARITY +using oneAPI + + +##------ +## TYPES + +const oneField{T,N} = GGField{T,N,oneArray{T,N}} + + +##--------------------------- +## HANDLING OF ONEAPI SUPPORT + +ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_ONEAPIExt}) = true +ImplicitGlobalGrid.is_functional(::Val{:oneAPI}) = oneAPI.functional() + + +##------------- +## SYNTAX SUGAR + +ImplicitGlobalGrid.is_onearray(A::oneArray) = true #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. 
+ + +##-------------------------------------------------------------------------------- +## FUNCTIONS FOR WRAPPING ARRAYS AND FIELDS AND DEFINE ARRAY PROPERTY BASE METHODS + +ImplicitGlobalGrid.wrap_field(A::oneArray, hw::Tuple) = oneField{eltype(A), ndims(A)}((A, hw)) # Wrap a oneArray together with the tuple `hw` into a oneField. + +Base.size(A::oneField) = Base.size(A.A) # The array property methods below forward to the wrapped array `A.A`. +Base.size(A::oneField, args...) = Base.size(A.A, args...) +Base.length(A::oneField) = Base.length(A.A) +Base.ndims(A::oneField) = Base.ndims(A.A) +Base.eltype(A::oneField) = Base.eltype(A.A) + + +##--------------- +## oneAPI functions + +function ImplicitGlobalGrid.register(::Type{<:oneArray},buf::Array{T}) where T <: GGNumber # Register the host array `buf` and return a oneArray wrapping the converted pointer (same size as `buf`) together with the registered buffer handle. NOTE(review): `oneAPI.Mem.register`, `oneAPI.Mem.HOSTREGISTER_DEVICEMAP` and `onePtr` follow the CUDA.jl naming; confirm that oneAPI.jl provides these. + rbuf = oneAPI.Mem.register(oneAPI.Mem.Host, pointer(buf), sizeof(buf), oneAPI.Mem.HOSTREGISTER_DEVICEMAP); + rbuf_d = convert(onePtr{T}, rbuf); + return unsafe_wrap(oneArray, rbuf_d, size(buf)), rbuf; +end diff --git a/src/ONEAPIExt/update_halo.jl b/src/ONEAPIExt/update_halo.jl new file mode 100644 index 0000000..27774c7 --- /dev/null +++ b/src/ONEAPIExt/update_halo.jl @@ -0,0 +1,260 @@ +##--------------------------------------- +## FUNCTIONS RELATED TO BUFFER ALLOCATION + +# NOTE: the oneAPI buffers live in this extension and are dealt with independently of the CUDA and AMDGPU buffers. The methods below forward the ImplicitGlobalGrid functions to the extension-local functions of the same name defined further down in this file. + +ImplicitGlobalGrid.free_update_halo_onebuffers(args...) = free_update_halo_onebuffers(args...) +ImplicitGlobalGrid.init_onebufs_arrays(args...) = init_onebufs_arrays(args...) +ImplicitGlobalGrid.init_onebufs(args...) = init_onebufs(args...) +ImplicitGlobalGrid.reinterpret_onebufs(args...) = reinterpret_onebufs(args...) +ImplicitGlobalGrid.reallocate_undersized_onebufs(args...) = reallocate_undersized_onebufs(args...) +ImplicitGlobalGrid.reregister_onebufs(args...) = reregister_onebufs(args...) +ImplicitGlobalGrid.get_onesendbufs_raw(args...) = get_onesendbufs_raw(args...) +ImplicitGlobalGrid.get_onerecvbufs_raw(args...) = get_onerecvbufs_raw(args...) 
+ImplicitGlobalGrid.gpusendbuf(n::Integer, dim::Integer, i::Integer, A::oneField{T}) where {T <: GGNumber} = gpusendbuf(n,dim,i,A) +ImplicitGlobalGrid.gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::oneField{T}) where {T <: GGNumber} = gpurecvbuf(n,dim,i,A) +ImplicitGlobalGrid.gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::oneField{T}) where {T <: GGNumber} = gpusendbuf_flat(n,dim,i,A) +ImplicitGlobalGrid.gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::oneField{T}) where {T <: GGNumber} = gpurecvbuf_flat(n,dim,i,A) + +let + global free_update_halo_onebuffers, init_onebufs_arrays, init_onebufs, reinterpret_onebufs, reregister_onebufs, reallocate_undersized_onebufs + global gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat + onesendbufs_raw = nothing + onerecvbufs_raw = nothing + onesendbufs_raw_h = nothing + onerecvbufs_raw_h = nothing + + function free_update_halo_onebuffers() + free_onebufs(onesendbufs_raw) + free_onebufs(onerecvbufs_raw) + unregister_onebufs(onesendbufs_raw_h) + unregister_onebufs(onerecvbufs_raw_h) + reset_one_buffers() + end + + function free_onebufs(bufs) + if (bufs !== nothing) + for i = 1:length(bufs) + for n = 1:length(bufs[i]) + if ImplicitGlobalGrid.is_onearray(bufs[i][n]) oneAPI.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end + end + end + end + end + + function unregister_onebufs(bufs) + if (bufs !== nothing) + for i = 1:length(bufs) + for n = 1:length(bufs[i]) + if (isa(bufs[i][n],oneAPI.Mem.HostBuffer)) oneAPI.Mem.unregister(bufs[i][n]); bufs[i][n] = []; end + end + end + end + end + + function reset_one_buffers() + onesendbufs_raw = nothing + onerecvbufs_raw = nothing + onesendbufs_raw_h = nothing + onerecvbufs_raw_h = nothing + end + + + # (oneAPI functions) + + function init_onebufs_arrays() + onesendbufs_raw = Array{Array{Any,1},1}(); + onerecvbufs_raw = Array{Array{Any,1},1}(); + onesendbufs_raw_h = Array{Array{Any,1},1}(); + onerecvbufs_raw_h = Array{Array{Any,1},1}(); + end + + function init_onebufs(T::DataType, 
fields::GGField...) + while (length(onesendbufs_raw) < length(fields)) push!(onesendbufs_raw, [oneArray{T}(undef,0), oneArray{T}(undef,0)]); end + while (length(onerecvbufs_raw) < length(fields)) push!(onerecvbufs_raw, [oneArray{T}(undef,0), oneArray{T}(undef,0)]); end + while (length(onesendbufs_raw_h) < length(fields)) push!(onesendbufs_raw_h, [[], []]); end + while (length(onerecvbufs_raw_h) < length(fields)) push!(onerecvbufs_raw_h, [[], []]); end + end + + function reinterpret_onebufs(T::DataType, i::Integer, n::Integer) + if (eltype(onesendbufs_raw[i][n]) != T) onesendbufs_raw[i][n] = reinterpret(T, onesendbufs_raw[i][n]); end + if (eltype(onerecvbufs_raw[i][n]) != T) onerecvbufs_raw[i][n] = reinterpret(T, onerecvbufs_raw[i][n]); end + end + + function reallocate_undersized_onebufs(T::DataType, i::Integer, max_halo_elems::Integer) + if (!isnothing(onesendbufs_raw) && length(onesendbufs_raw[i][1]) < max_halo_elems) + for n = 1:NNEIGHBORS_PER_DIM + reallocate_onebufs(T, i, n, max_halo_elems); GC.gc(); # Too small buffers had been replaced with larger ones; free the unused memory immediately. + end + end + end + + function reallocate_onebufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer) + onesendbufs_raw[i][n] = oneAPI.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4). So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater. + onerecvbufs_raw[i][n] = oneAPI.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); + end + + function reregister_onebufs(T::DataType, i::Integer, n::Integer, sendbufs_raw, recvbufs_raw) + if (isa(onesendbufs_raw_h[i][n],oneAPI.Mem.HostBuffer)) oneAPI.Mem.unregister(onesendbufs_raw_h[i][n]); onesendbufs_raw_h[i][n] = []; end # It is always initialized registered... 
if (cusendbufs_raw_h[i][n].bytesize > 32*sizeof(T)) + if (isa(onerecvbufs_raw_h[i][n],oneAPI.Mem.HostBuffer)) oneAPI.Mem.unregister(onerecvbufs_raw_h[i][n]); onerecvbufs_raw_h[i][n] = []; end # It is always initialized registered... if (curecvbufs_raw_h[i][n].bytesize > 32*sizeof(T)) + onesendbufs_raw[i][n], onesendbufs_raw_h[i][n] = register(oneArray,sendbufs_raw[i][n]); + onerecvbufs_raw[i][n], onerecvbufs_raw_h[i][n] = register(oneArray,recvbufs_raw[i][n]); + end + + + # (oneAPI functions) + + function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::oneField{T}) where T <: GGNumber + return view(onesendbufs_raw[i][n]::oneVector{T},1:prod(halosize(dim,A))); + end + + function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::oneField{T}) where T <: GGNumber + return view(onerecvbufs_raw[i][n]::oneVector{T},1:prod(halosize(dim,A))); + end + + + # (GPU functions) + + #TODO: see if remove T here and in other cases for CuArray, ROCArray or Array (but then it does not verify that CuArray/ROCArray is of type GGNumber) or if I should instead change GGArray to GGArrayUnion and create: GGArray = Array{T} where T <: GGNumber and GGCuArray = CuArray{T} where T <: GGNumber; This is however more difficult to read and understand for others. + function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::oneField{T}) where T <: GGNumber + return reshape(gpusendbuf_flat(n,dim,i,A), halosize(dim,A)); + end + + function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::oneField{T}) where T <: GGNumber + return reshape(gpurecvbuf_flat(n,dim,i,A), halosize(dim,A)); + end + + + # Make sendbufs_raw and recvbufs_raw accessible for unit testing. 
+ global get_onesendbufs_raw, get_onerecvbufs_raw + get_onesendbufs_raw() = deepcopy(onesendbufs_raw) + get_onerecvbufs_raw() = deepcopy(onerecvbufs_raw) +end + + +##---------------------------------------------- +## FUNCTIONS TO WRITE AND READ SEND/RECV BUFFERS + +function ImplicitGlobalGrid.allocate_onestreams(fields::GGField...) + allocate_onestreams_iwrite(fields...); + allocate_onestreams_iread(fields...); +end + +ImplicitGlobalGrid.iwrite_sendbufs!(n::Integer, dim::Integer, F::oneField{T}, i::Integer) where {T <: GGNumber} = iwrite_sendbufs!(n,dim,F,i) +ImplicitGlobalGrid.iread_recvbufs!(n::Integer, dim::Integer, F::oneField{T}, i::Integer) where {T <: GGNumber} = iread_recvbufs!(n,dim,F,i) +ImplicitGlobalGrid.wait_iwrite(n::Integer, A::oneField{T}, i::Integer) where {T <: GGNumber} = wait_iwrite(n,A,i) +ImplicitGlobalGrid.wait_iread(n::Integer, A::oneField{T}, i::Integer) where {T <: GGNumber} = wait_iread(n,A,i) + +let + global iwrite_sendbufs!, allocate_onestreams_iwrite, wait_iwrite + + onestreams = Array{oneStream}(undef, NNEIGHBORS_PER_DIM, 0) + + wait_iwrite(n::Integer, A::oneField{T}, i::Integer) where T <: GGNumber = oneAPI.synchronize(onestreams[n,i]; blocking=true); + + function allocate_onestreams_iwrite(fields::GGField...) + if length(fields) > size(onestreams,2) # Note: for simplicity, we create a stream for every field even if it is not a oneField + onestreams = [onestreams [oneStream(; flags=oneAPI.STREAM_NON_BLOCKING, priority=oneAPI.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(onestreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. NOTE(review): `oneStream`, `STREAM_NON_BLOCKING` and `priority_range` follow the CUDA.jl naming; confirm the oneAPI.jl equivalents. + end + end + + function iwrite_sendbufs!(n::Integer, dim::Integer, F::oneField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... 
+ if dim == 1 || oneapiaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). + ranges = sendranges(n, dim, F); + nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); + halosize = [r[end] - r[1] + 1 for r in ranges]; + nblocks = Tuple(ceil.(Int, halosize./nthreads)); + @oneapi groups=nblocks items=nthreads queue=onestreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); + else + write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), onestreams[n,i]); + end + end + end +end + +let + global iread_recvbufs!, allocate_onestreams_iread, wait_iread + + onestreams = Array{oneStream}(undef, NNEIGHBORS_PER_DIM, 0) + + wait_iread(n::Integer, A::oneField{T}, i::Integer) where T <: GGNumber = oneAPI.synchronize(onestreams[n,i]; blocking=true); + + function allocate_onestreams_iread(fields::GGField...) + if length(fields) > size(onestreams,2) # Note: for simplicity, we create a stream for every field even if it is not a oneField + onestreams = [onestreams [oneStream(; flags=oneAPI.STREAM_NON_BLOCKING, priority=oneAPI.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(onestreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. NOTE(review): `oneStream`, `STREAM_NON_BLOCKING` and `priority_range` follow the CUDA.jl naming; confirm the oneAPI.jl equivalents. + end + end + + function iread_recvbufs!(n::Integer, dim::Integer, F::oneField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + if dim == 1 || oneapiaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). + ranges = recvranges(n, dim, F); + nthreads = (dim==1) ? 
(1, 32, 1) : (32, 1, 1); + halosize = [r[end] - r[1] + 1 for r in ranges]; + nblocks = Tuple(ceil.(Int, halosize./nthreads)); + @oneapi groups=nblocks items=nthreads queue=onestreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); + else + read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), onestreams[n,i]); + end + end + end +end + + +# (oneAPI functions) + +# Write to the send buffer on the host or device from the array on the device (d2x). NOTE(review): `oneAPI.blockIdx`/`blockDim`/`threadIdx` follow the CUDA.jl naming; confirm against the oneAPI.jl kernel indexing intrinsics. +function ImplicitGlobalGrid.write_d2x!(gpusendbuf::oneArray{T}, A::oneArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber + ix = (oneAPI.blockIdx().x-1) * oneAPI.blockDim().x + oneAPI.threadIdx().x + sendrangex[1] - 1 + iy = (oneAPI.blockIdx().y-1) * oneAPI.blockDim().y + oneAPI.threadIdx().y + sendrangey[1] - 1 + iz = (oneAPI.blockIdx().z-1) * oneAPI.blockDim().z + oneAPI.threadIdx().z + sendrangez[1] - 1 + if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end + gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; + return nothing +end + +# Read from the receive buffer on the host or device and store on the array on the device (x2d). 
+function ImplicitGlobalGrid.read_x2d!(gpurecvbuf::oneArray{T}, A::oneArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber # Each work item computes one (ix,iy,iz), returns early if it is outside the receive ranges, and otherwise copies one element from `gpurecvbuf` into `A`. NOTE(review): `oneAPI.blockIdx`/`blockDim`/`threadIdx` follow the CUDA.jl naming; confirm against the oneAPI.jl kernel indexing intrinsics. + ix = (oneAPI.blockIdx().x-1) * oneAPI.blockDim().x + oneAPI.threadIdx().x + recvrangex[1] - 1 + iy = (oneAPI.blockIdx().y-1) * oneAPI.blockDim().y + oneAPI.threadIdx().y + recvrangey[1] - 1 + iz = (oneAPI.blockIdx().z-1) * oneAPI.blockDim().z + oneAPI.threadIdx().z + recvrangez[1] - 1 + if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end + A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; + return nothing +end + +# Write to the send buffer on the host from the array on the device (d2h). The copy covers the extent given by the lengths of the three `sendranges` and starts in `A` at the first index of each range. NOTE(review): `oneAPI.Mem.unsafe_copy3d!` and its keyword arguments follow the CUDA.jl naming; confirm that oneAPI.jl provides an equivalent. +function ImplicitGlobalGrid.write_d2h_async!(sendbuf::AbstractArray{T}, A::oneArray{T}, sendranges::Array{UnitRange{T2},1}, onestream::oneStream) where T <: GGNumber where T2 <: Integer + oneAPI.Mem.unsafe_copy3d!( + pointer(sendbuf), oneAPI.Mem.Host, pointer(A), oneAPI.Mem.Device, + length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); + srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]), + srcPitch=sizeof(T)*size(A,1), srcHeight=size(A,2), + dstPitch=sizeof(T)*length(sendranges[1]), dstHeight=length(sendranges[2]), + async=true, stream=onestream + ) +end + +# Read from the receive buffer on the host and store on the array on the device (h2d). 
+function ImplicitGlobalGrid.read_h2d_async!(recvbuf::AbstractArray{T}, A::oneArray{T}, recvranges::Array{UnitRange{T2},1}, onestream::oneStream) where T <: GGNumber where T2 <: Integer + oneAPI.Mem.unsafe_copy3d!( + pointer(A), oneAPI.Mem.Device, pointer(recvbuf), oneAPI.Mem.Host, + length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); + dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]), + srcPitch=sizeof(T)*length(recvranges[1]), srcHeight=length(recvranges[2]), + dstPitch=sizeof(T)*size(A,1), dstHeight=size(A,2), + async=true, stream=onestream + ) +end + + +##------------------------------ +## FUNCTIONS TO SEND/RECV FIELDS + +function ImplicitGlobalGrid.gpumemcopy!(dst::oneArray{T}, src::oneArray{T}) where T <: GGNumber + @inbounds oneAPI.copyto!(dst, src) +end + diff --git a/src/init_global_grid.jl b/src/init_global_grid.jl index dfcf347..35f684d 100644 --- a/src/init_global_grid.jl +++ b/src/init_global_grid.jl @@ -44,6 +44,8 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0 set_cuda_functional() set_amdgpu_loaded() set_amdgpu_functional() + set_oneapi_loaded() + set_oneapi_functional() nxyz = [nx, ny, nz]; dims = [dimx, dimy, dimz]; periods = [periodx, periody, periodz]; @@ -51,12 +53,15 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0 halowidths = [halowidths...]; cuda_enabled = false amdgpu_enabled = false + oneapi_enabled = false cudaaware_MPI = [false, false, false] amdgpuaware_MPI = [false, false, false] + oneapiaware_MPI = [false, false, false] use_polyester = [false, false, false] if haskey(ENV, "IGG_LOOPVECTORIZATION") error("Environment variable IGG_LOOPVECTORIZATION is not supported anymore. 
Use IGG_USE_POLYESTER instead.") end if haskey(ENV, "IGG_CUDAAWARE_MPI") cudaaware_MPI .= (parse(Int64, ENV["IGG_CUDAAWARE_MPI"]) > 0); end if haskey(ENV, "IGG_ROCMAWARE_MPI") amdgpuaware_MPI .= (parse(Int64, ENV["IGG_ROCMAWARE_MPI"]) > 0); end + if haskey(ENV, "IGG_ONEAPIAWARE_MPI") oneapiaware_MPI .= (parse(Int64, ENV["IGG_ONEAPIAWARE_MPI"]) > 0); end if haskey(ENV, "IGG_USE_POLYESTER") use_polyester .= (parse(Int64, ENV["IGG_USE_POLYESTER"]) > 0); end if none(cudaaware_MPI) if haskey(ENV, "IGG_CUDAAWARE_MPI_DIMX") cudaaware_MPI[1] = (parse(Int64, ENV["IGG_CUDAAWARE_MPI_DIMX"]) > 0); end @@ -73,11 +78,17 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0 if haskey(ENV, "IGG_USE_POLYESTER_DIMY") use_polyester[2] = (parse(Int64, ENV["IGG_USE_POLYESTER_DIMY"]) > 0); end if haskey(ENV, "IGG_USE_POLYESTER_DIMZ") use_polyester[3] = (parse(Int64, ENV["IGG_USE_POLYESTER_DIMZ"]) > 0); end end - if !(device_type in [DEVICE_TYPE_NONE, DEVICE_TYPE_AUTO, DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU]) error("Argument `device_type`: invalid value obtained ($device_type). Valid values are: $DEVICE_TYPE_CUDA, $DEVICE_TYPE_AMDGPU, $DEVICE_TYPE_NONE, $DEVICE_TYPE_AUTO") end - if ((device_type == DEVICE_TYPE_AUTO) && cuda_loaded() && cuda_functional() && amdgpu_loaded() && amdgpu_functional()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU extensions are loaded and functional. 
Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end + if none(oneapiaware_MPI) + if haskey(ENV, "IGG_ONEAPIAWARE_MPI_DIMX") oneapiaware_MPI[1] = (parse(Int64, ENV["IGG_ONEAPIAWARE_MPI_DIMX"]) > 0); end + if haskey(ENV, "IGG_ONEAPIAWARE_MPI_DIMY") oneapiaware_MPI[2] = (parse(Int64, ENV["IGG_ONEAPIAWARE_MPI_DIMY"]) > 0); end + if haskey(ENV, "IGG_ONEAPIAWARE_MPI_DIMZ") oneapiaware_MPI[3] = (parse(Int64, ENV["IGG_ONEAPIAWARE_MPI_DIMZ"]) > 0); end + end + if !(device_type in [DEVICE_TYPE_NONE, DEVICE_TYPE_AUTO, DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU, DEVICE_TYPE_ONEAPI]) error("Argument `device_type`: invalid value obtained ($device_type). Valid values are: $DEVICE_TYPE_CUDA, $DEVICE_TYPE_AMDGPU, $DEVICE_TYPE_ONEAPI, $DEVICE_TYPE_NONE, $DEVICE_TYPE_AUTO") end + if ((device_type == DEVICE_TYPE_AUTO) && count((cuda_loaded() && cuda_functional(), amdgpu_loaded() && amdgpu_functional(), oneapi_loaded() && oneapi_functional())) > 1) error("Automatic detection of the device type to be used not possible: more than one of the CUDA, AMDGPU and ONEAPI extensions are loaded and functional. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA, $DEVICE_TYPE_AMDGPU or $DEVICE_TYPE_ONEAPI.") end # Automatic detection is ambiguous as soon as more than one backend is usable. if (device_type != DEVICE_TYPE_NONE) if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO]) cuda_enabled = cuda_loaded() && cuda_functional() end # NOTE: cuda could be enabled/disabled depending on some additional criteria. if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_loaded() && amdgpu_functional() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria. + if (device_type in [DEVICE_TYPE_ONEAPI, DEVICE_TYPE_AUTO]) oneapi_enabled = oneapi_loaded() && oneapi_functional() end # NOTE: oneapi could be enabled/disabled depending on some additional criteria. 
end if (any(nxyz .< 1)) error("Invalid arguments: nx, ny, and nz cannot be less than 1."); end if (any(dims .< 0)) error("Invalid arguments: dimx, dimy, and dimz cannot be negative."); end @@ -105,13 +116,14 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0 neighbors[:,i] .= MPI.Cart_shift(comm_cart, i-1, disp); end nxyz_g = dims.*(nxyz.-overlaps) .+ overlaps.*(periods.==0); # E.g. for dimension x with ol=2 and periodx=0: dimx*(nx-2)+2 - set_global_grid(GlobalGrid(nxyz_g, nxyz, dims, overlaps, halowidths, nprocs, me, coords, neighbors, periods, disp, reorder, comm_cart, cuda_enabled, amdgpu_enabled, cudaaware_MPI, amdgpuaware_MPI, use_polyester, quiet)); + set_global_grid(GlobalGrid(nxyz_g, nxyz, dims, overlaps, halowidths, nprocs, me, coords, neighbors, periods, disp, reorder, comm_cart, cuda_enabled, amdgpu_enabled, oneapi_enabled, cudaaware_MPI, amdgpuaware_MPI, oneapiaware_MPI, use_polyester, quiet)); cuda_support_string = (cuda_enabled && all(cudaaware_MPI)) ? "CUDA-aware" : (cuda_enabled && any(cudaaware_MPI)) ? "CUDA(-aware)" : (cuda_enabled) ? "CUDA" : ""; amdgpu_support_string = (amdgpu_enabled && all(amdgpuaware_MPI)) ? "AMDGPU-aware" : (amdgpu_enabled && any(amdgpuaware_MPI)) ? "AMDGPU(-aware)" : (amdgpu_enabled) ? "AMDGPU" : ""; - gpu_support_string = join(filter(!isempty, [cuda_support_string, amdgpu_support_string]), ", "); + oneapi_support_string = (oneapi_enabled && all(oneapiaware_MPI)) ? "ONEAPI-aware" : (oneapi_enabled && any(oneapiaware_MPI)) ? "ONEAPI(-aware)" : (oneapi_enabled) ? "ONEAPI" : ""; + gpu_support_string = join(filter(!isempty, [cuda_support_string, amdgpu_support_string, oneapi_support_string]), ", "); support_string = isempty(gpu_support_string) ? 
"none" : gpu_support_string; if (!quiet && me==0) println("Global grid: $(nxyz_g[1])x$(nxyz_g[2])x$(nxyz_g[3]) (nprocs: $nprocs, dims: $(dims[1])x$(dims[2])x$(dims[3]); device support: $support_string)"); end - if ((cuda_enabled || amdgpu_enabled) && select_device) _select_device() end + if ((cuda_enabled || amdgpu_enabled || oneapi_enabled) && select_device) _select_device() end init_timing_functions(); return me, dims, nprocs, coords, comm_cart; # The typical use case requires only these variables; the remaining can be obtained calling get_global_grid() if needed. end diff --git a/src/select_device.jl b/src/select_device.jl index 5df62cf..9b41e90 100644 --- a/src/select_device.jl +++ b/src/select_device.jl @@ -14,14 +14,17 @@ See also: [`init_global_grid`](@ref) """ function select_device() check_initialized() - if (cuda_enabled() && amdgpu_enabled()) error("Cannot select a device because both CUDA and AMDGPU are enabled (meaning that both modules were imported before ImplicitGlobalGrid).") end - if cuda_enabled() || amdgpu_enabled() + if (count((cuda_enabled(), amdgpu_enabled(), oneapi_enabled())) > 1) error("Cannot select a device because more than one of CUDA, AMDGPU and ONEAPI is enabled (meaning that more than one of the corresponding modules was imported before ImplicitGlobalGrid).") end + if cuda_enabled() || amdgpu_enabled() || oneapi_enabled() if cuda_enabled() @assert cuda_functional() nb_devices = nb_cudevices() elseif amdgpu_enabled() @assert amdgpu_functional() nb_devices = nb_rocdevices() + elseif oneapi_enabled() + @assert oneapi_functional() + nb_devices = nb_oneapidevices() end comm_l = MPI.Comm_split_type(comm(), MPI.COMM_TYPE_SHARED, me()) if (MPI.Comm_size(comm_l) > nb_devices) error("More processes have been launched per node than there are GPUs available."); end @@ -29,6 +32,7 @@ function select_device() device_id = amdgpu_enabled() ? 
me_l+1 : me_l if cuda_enabled() cudevice!(device_id) elseif amdgpu_enabled() rocdevice!(device_id) + elseif oneapi_enabled() oneapidevice!(device_id) # NOTE(review): device_id is me_l (0-based) in the oneAPI case; confirm whether `oneAPI.device!(::Int)` expects a 1-based index. end return device_id else diff --git a/src/shared.jl b/src/shared.jl index 7bb108a..6d3f8b4 100644 --- a/src/shared.jl +++ b/src/shared.jl @@ -6,19 +6,25 @@ using Base.Threads ## HANDLING OF CUDA AND AMDGPU SUPPORT let - global cuda_loaded, cuda_functional, amdgpu_loaded, amdgpu_functional, set_cuda_loaded, set_cuda_functional, set_amdgpu_loaded, set_amdgpu_functional + global cuda_loaded, cuda_functional, amdgpu_loaded, amdgpu_functional, oneapi_loaded, oneapi_functional, set_cuda_loaded, set_cuda_functional, set_amdgpu_loaded, set_amdgpu_functional, set_oneapi_loaded, set_oneapi_functional _cuda_loaded::Bool = false _cuda_functional::Bool = false _amdgpu_loaded::Bool = false _amdgpu_functional::Bool = false + _oneapi_loaded::Bool = false + _oneapi_functional::Bool = false cuda_loaded()::Bool = _cuda_loaded cuda_functional()::Bool = _cuda_functional amdgpu_loaded()::Bool = _amdgpu_loaded amdgpu_functional()::Bool = _amdgpu_functional + oneapi_loaded()::Bool = _oneapi_loaded + oneapi_functional()::Bool = _oneapi_functional set_cuda_loaded() = (_cuda_loaded = is_loaded(Val(:ImplicitGlobalGrid_CUDAExt))) set_cuda_functional() = (_cuda_functional = is_functional(Val(:CUDA))) set_amdgpu_loaded() = (_amdgpu_loaded = is_loaded(Val(:ImplicitGlobalGrid_AMDGPUExt))) set_amdgpu_functional() = (_amdgpu_functional = is_functional(Val(:AMDGPU))) + set_oneapi_loaded() = (_oneapi_loaded = is_loaded(Val(:ImplicitGlobalGrid_ONEAPIExt))) + set_oneapi_functional() = (_oneapi_functional = is_functional(Val(:oneAPI))) # The symbol must match the `is_functional(::Val{:oneAPI})` method defined by the oneAPI extension. 
end @@ -33,7 +39,8 @@ const DEVICE_TYPE_NONE = "none" const DEVICE_TYPE_AUTO = "auto" const DEVICE_TYPE_CUDA = "CUDA" const DEVICE_TYPE_AMDGPU = "AMDGPU" -const SUPPORTED_DEVICE_TYPES = [DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU] +const DEVICE_TYPE_ONEAPI = "ONEAPI" +const SUPPORTED_DEVICE_TYPES = [DEVICE_TYPE_CUDA, 
DEVICE_TYPE_AMDGPU, DEVICE_TYPE_ONEAPI] ##------ @@ -64,8 +71,10 @@ struct GlobalGrid comm::MPI.Comm cuda_enabled::Bool amdgpu_enabled::Bool + oneapi_enabled::Bool cudaaware_MPI::Vector{Bool} amdgpuaware_MPI::Vector{Bool} + oneapiaware_MPI::Vector{Bool} use_polyester::Vector{Bool} quiet::Bool end @@ -104,19 +113,24 @@ neighbors(dim::Integer) = global_grid().neighbors[:,dim] neighbor(n::Integer, dim::Integer) = global_grid().neighbors[n,dim] cuda_enabled() = global_grid().cuda_enabled amdgpu_enabled() = global_grid().amdgpu_enabled +oneapi_enabled() = global_grid().oneapi_enabled cudaaware_MPI() = global_grid().cudaaware_MPI cudaaware_MPI(dim::Integer) = global_grid().cudaaware_MPI[dim] amdgpuaware_MPI() = global_grid().amdgpuaware_MPI amdgpuaware_MPI(dim::Integer) = global_grid().amdgpuaware_MPI[dim] +oneapiaware_MPI() = global_grid().oneapiaware_MPI +oneapiaware_MPI(dim::Integer) = global_grid().oneapiaware_MPI[dim] use_polyester() = global_grid().use_polyester use_polyester(dim::Integer) = global_grid().use_polyester[dim] has_neighbor(n::Integer, dim::Integer) = neighbor(n, dim) != MPI.PROC_NULL any_array(fields::GGField...) = any([is_array(A.A) for A in fields]) any_cuarray(fields::GGField...) = any([is_cuarray(A.A) for A in fields]) any_rocarray(fields::GGField...) = any([is_rocarray(A.A) for A in fields]) +any_onearray(fields::GGField...) = any([is_onearray(A.A) for A in fields]) all_arrays(fields::GGField...) = all([is_array(A.A) for A in fields]) all_cuarrays(fields::GGField...) = all([is_cuarray(A.A) for A in fields]) all_rocarrays(fields::GGField...) = all([is_rocarray(A.A) for A in fields]) +all_onearrays(fields::GGField...) 
= all([is_onearray(A.A) for A in fields]) is_array(A::GGArray) = typeof(A) <: Array diff --git a/src/update_halo.jl b/src/update_halo.jl index e917506..d7c1eaa 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -35,7 +35,7 @@ function update_halo!(A::Union{GGArray, GGField, GGFieldConvertible}...; dims=(N end # function _update_halo!(fields::GGField...; dims=dims) - if (!cuda_enabled() && !amdgpu_enabled() && !all_arrays(fields...)) error("not all arrays are CPU arrays, but no GPU extension is loaded.") end #NOTE: in the following, it is only required to check for `cuda_enabled()`/`amdgpu_enabled()` when the context does not imply `any_cuarray(fields...)` or `is_cuarray(A)` or the corresponding for AMDGPU. # NOTE: the case where only one of the two extensions are loaded, but an array dad would be for the other extension is passed is very unlikely and therefore not explicitly checked here (but could be added later). + if (!cuda_enabled() && !amdgpu_enabled() && !oneapi_enabled() && !all_arrays(fields...)) error("not all arrays are CPU arrays, but no GPU extension is loaded.") end #NOTE: in the following, it is only required to check for `cuda_enabled()`/`amdgpu_enabled()` when the context does not imply `any_cuarray(fields...)` or `is_cuarray(A)` or the corresponding for AMDGPU. # NOTE: the case where only one of the two extensions are loaded, but an array dad would be for the other extension is passed is very unlikely and therefore not explicitly checked here (but could be added later). allocate_bufs(fields...); if any_array(fields...) allocate_tasks(fields...); end if any_cuarray(fields...) 
allocate_custreams(fields...); end @@ -103,6 +103,7 @@ let free_update_halo_cpubuffers() if (cuda_enabled() && none(cudaaware_MPI())) free_update_halo_cubuffers() end if (amdgpu_enabled() && none(amdgpuaware_MPI())) free_update_halo_rocbuffers() end + if (oneapi_enabled() && none(oneapiaware_MPI())) free_update_halo_onebuffers() end GC.gc() #TODO: see how to modify this! end @@ -122,21 +123,25 @@ let init_bufs_arrays(); if cuda_enabled() init_cubufs_arrays(); end if amdgpu_enabled() init_rocbufs_arrays(); end + if oneapi_enabled() init_onebufs_arrays(); end end init_bufs(T, fields...); if cuda_enabled() init_cubufs(T, fields...); end if amdgpu_enabled() init_rocbufs(T, fields...); end + if oneapi_enabled() init_onebufs(T, fields...); end for i = 1:length(fields) A, halowidths = fields[i]; for n = 1:NNEIGHBORS_PER_DIM # Ensure that the buffers are interpreted to contain elements of the same type as the array. reinterpret_bufs(T, i, n); if cuda_enabled() reinterpret_cubufs(T, i, n); end if amdgpu_enabled() reinterpret_rocbufs(T, i, n); end + if oneapi_enabled() reinterpret_onebufs(T, i, n); end end max_halo_elems = maximum((size(A,1)*size(A,2)*halowidths[3], size(A,1)*size(A,3)*halowidths[2], size(A,2)*size(A,3)*halowidths[1])); reallocate_undersized_hostbufs(T, i, max_halo_elems, A); if (is_cuarray(A) && any(cudaaware_MPI())) reallocate_undersized_cubufs(T, i, max_halo_elems) end if (is_rocarray(A) && any(amdgpuaware_MPI())) reallocate_undersized_rocbufs(T, i, max_halo_elems) end + if (is_onearray(A) && any(oneapiaware_MPI())) reallocate_undersized_onebufs(T, i, max_halo_elems) end end end @@ -164,6 +169,7 @@ let reallocate_bufs(T, i, n, max_halo_elems); if (is_cuarray(A) && none(cudaaware_MPI())) reregister_cubufs(T, i, n, sendbufs_raw, recvbufs_raw); end # Host memory is page-locked (and mapped to device memory) to ensure optimal access performance (from kernel or with 3-D memcopy). 
if (is_rocarray(A) && none(amdgpuaware_MPI())) reregister_rocbufs(T, i, n, sendbufs_raw, recvbufs_raw); end # ... + if (is_onearray(A) && none(oneapiaware_MPI())) reregister_onebufs(T, i, n, sendbufs_raw, recvbufs_raw); end # ... end GC.gc(); # Too small buffers had been replaced with larger ones; free the now unused memory. end @@ -337,7 +343,7 @@ function irecv_halo!(n::Integer, dim::Integer, F::GGField, i::Integer; tag::Inte req = MPI.REQUEST_NULL; A, halowidths = F; if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... - if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A)) + if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A)) || (oneapiaware_MPI(dim) && is_onearray(A)) req = MPI.Irecv!(gpurecvbuf_flat(n,dim,i,F), neighbor(n,dim), tag, comm()); else req = MPI.Irecv!(recvbuf_flat(n,dim,i,F), neighbor(n,dim), tag, comm()); @@ -350,7 +356,7 @@ function isend_halo(n::Integer, dim::Integer, F::GGField, i::Integer; tag::Integ req = MPI.REQUEST_NULL; A, halowidths = F; if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... - if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A)) + if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A)) || (oneapiaware_MPI(dim) && is_onearray(A)) req = MPI.Isend(gpusendbuf_flat(n,dim,i,F), neighbor(n,dim), tag, comm()); else req = MPI.Isend(sendbuf_flat(n,dim,i,F), neighbor(n,dim), tag, comm()); @@ -362,7 +368,7 @@ end function sendrecv_halo_local(n::Integer, dim::Integer, F::GGField, i::Integer) A, halowidths = F; if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... 
- if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A)) + if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A) || (oneapiaware_MPI(dim) && is_onearray(A))) if n == 1 gpumemcopy!(gpurecvbuf_flat(2,dim,i,F), gpusendbuf_flat(1,dim,i,F)); elseif n == 2