Skip to content

Commit f8f5756

Browse files
author
Felipe Tome
committed
WIP: MPI works
1 parent cc700c6 commit f8f5756

27 files changed

Lines changed: 551 additions & 385 deletions

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
1212
Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
1313
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
1414
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
15-
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
1615
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
16+
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
1717
MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94"
1818
NextLA = "d37ed344-79c4-486d-9307-6d11355a15a3"
1919
OnlineStats = "a15396b6-48d5-5d58-9928-6d29437db91e"
@@ -77,8 +77,8 @@ GraphViz = "0.2"
7777
Graphs = "1"
7878
JSON3 = "1"
7979
KernelAbstractions = "0.9"
80-
MPI = "0.20.22"
8180
MacroTools = "0.5"
81+
MPI = "0.20.22"
8282
MemPool = "0.4.12"
8383
Metal = "1.1"
8484
NextLA = "0.2.2"

src/Dagger.jl

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import MemPool: DRef, FileRef, poolget, poolset
1010
import Base: collect, reduce, view
1111
import NextLA
1212
import LinearAlgebra
13-
import LinearAlgebra: Adjoint, BLAS, Diagonal, Bidiagonal, Tridiagonal, LAPACK, LU, LowerTriangular, PosDefException, Transpose, UpperTriangular, UnitLowerTriangular, UnitUpperTriangular, diagind, ishermitian, issymmetric, I, norm, dot
13+
import LinearAlgebra: Adjoint, BLAS, Diagonal, Bidiagonal, Tridiagonal, LAPACK, LU, LowerTriangular, PosDefException, Transpose, UpperTriangular, UnitLowerTriangular, UnitUpperTriangular, Cholesky, diagind, ishermitian, issymmetric, I
1414
import Random
1515
import Random: AbstractRNG
1616

@@ -53,7 +53,7 @@ import Adapt
5353
include("lib/util.jl")
5454
include("utils/dagdebug.jl")
5555

56-
# Type definitions
56+
# Type definitions (for MPI/acceleration)
5757
include("types/processor.jl")
5858
include("types/scope.jl")
5959
include("types/memory-space.jl")
@@ -71,6 +71,7 @@ include("context.jl")
7171
include("utils/processors.jl")
7272
include("scopes.jl")
7373
include("utils/scopes.jl")
74+
include("chunks.jl")
7475
include("utils/signature.jl")
7576
include("thunkid.jl")
7677
include("utils/lfucache.jl")
@@ -82,11 +83,7 @@ include("argument.jl")
8283
include("queue.jl")
8384
include("thunk.jl")
8485
include("utils/fetch.jl")
85-
include("chunks.jl")
86-
include("affinity.jl")
87-
include("tochunk.jl")
88-
include("mutable.jl")
89-
include("shard.jl")
86+
include("utils/chunks.jl")
9087
include("weakchunk.jl")
9188
include("utils/logging.jl")
9289
include("submission.jl")
@@ -101,6 +98,7 @@ include("utils/clock.jl")
10198
include("utils/system_uuid.jl")
10299
include("utils/caching.jl")
103100
include("sch/Sch.jl"); using .Sch
101+
include("tochunk.jl")
104102

105103
# Data dependency task queue
106104
include("datadeps/aliasing.jl")
@@ -138,7 +136,7 @@ include("array/mul.jl")
138136
include("array/cholesky.jl")
139137
include("array/trsm.jl")
140138
include("array/lu.jl")
141-
include("array/gmres.jl")
139+
include("array/qr.jl")
142140

143141
# GPU
144142
include("gpu.jl")
@@ -167,8 +165,9 @@ function set_distributed_package!(value)
167165
@info "Dagger.jl preference has been set, restart your Julia session for this change to take effect!"
168166
end
169167

170-
# MPI
168+
# MPI (mpi.jl loads MPI; mpi_mempool uses it)
171169
include("mpi.jl")
170+
include("mpi_mempool.jl")
172171

173172
# Precompilation
174173
import PrecompileTools: @compile_workload

src/array/alloc.jl

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -184,13 +184,24 @@ function Base.zero(x::DArray{T,N}) where {T,N}
184184
return _to_darray(a)
185185
end
186186

187-
@warn "Consider a better way to provide a unique ID for each chunk" maxlog=1
188-
function Base.view(A::AbstractArray{T,N}, p::Blocks{N}; space=default_memory_space(current_acceleration(), A)) where {T,N}
187+
# Weird LinearAlgebra dispatch in `\` needs this
188+
function LinearAlgebra._zeros(::Type{T}, B::DVector, n::Integer) where T
189+
m = max(size(B, 1), n)
190+
sz = (m,)
191+
return zeros(auto_blocks(sz), T, sz)
192+
end
193+
function LinearAlgebra._zeros(::Type{T}, B::DMatrix, n::Integer) where T
194+
m = max(size(B, 1), n)
195+
sz = (m, size(B, 2))
196+
return zeros(auto_blocks(sz), T, sz)
197+
end
198+
199+
function Base.view(A::AbstractArray{T,N}, p::Blocks{N}) where {T,N}
189200
d = ArrayDomain(Base.index_shape(A))
190201
dc = partition(p, d)
191202
# N.B. We use `tochunk` because we only want to take the view locally, and
192203
# taking views should be very fast
193-
chunks = [@with(MPI_UID => eager_next_id(), tochunk(view(A, x.indexes...), space)) for x in dc]
204+
chunks = [tochunk(view(A, x.indexes...)) for x in dc]
194205
return DArray(T, d, dc, chunks, p)
195206
end
196207
Base.view(A::AbstractArray, ::AutoBlocks) =

src/array/copy.jl

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -119,14 +119,7 @@ function darray_copyto!(B::DArray{TB,NB}, A::DArray{TA,NA}, Binds=parentindices(
119119
Arange_local = Arange_global_clamped .- CartesianIndex(Arange_start) .+ CartesianIndex{Nmax}(1)
120120

121121
# Perform local view copy
122-
space = (Bpart isa DTask ? fetch(Bpart; move_value=false, unwrap=false) : Bpart).space
123-
procs = processors(space)
124-
scope = UnionScope([ExactScope(proc) for proc in procs])
125-
check_uniform(space)
126-
for proc in procs
127-
check_uniform(proc)
128-
end
129-
Dagger.@spawn scope = scope copyto_view!(Out(Bpart), Brange_local, In(Apart), Arange_local)
122+
Dagger.@spawn copyto_view!(Out(Bpart), Brange_local, In(Apart), Arange_local)
130123
end
131124
end
132125
end

src/array/darray.jl

Lines changed: 61 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import Base: ==, fetch
1+
import Base: ==, fetch, length, isempty, size
22

33
export DArray, DVector, DMatrix, DVecOrMat, Blocks, AutoBlocks
44
export distribute
@@ -83,7 +83,8 @@ isempty(a::ArrayDomain) = length(a) == 0
8383
The domain of an array is an ArrayDomain.
8484
"""
8585
domain(x::AbstractArray) = ArrayDomain([1:l for l in size(x)])
86-
86+
# Scalar / non-array values (e.g. for Chunk of immediate data)
87+
domain(x::Any) = ArrayDomain(())
8788

8889
abstract type ArrayOp{T, N} <: AbstractArray{T, N} end
8990
Base.IndexStyle(::Type{<:ArrayOp}) = IndexCartesian()
@@ -176,46 +177,28 @@ domainchunks(d::DArray) = d.subdomains
176177
size(x::DArray) = size(domain(x))
177178
stage(ctx, c::DArray) = c
178179

179-
@warn "Dispatch uniform on acceleration" maxlog=1
180-
@warn "Take D.concat into account" maxlog=1
181-
function Base.collect(D::DArray{T,N}; tree=false, copyto=false, uniform::Bool=true) where {T,N}
182-
if isempty(D.chunks)
183-
return Array{eltype(D)}(undef, size(D)...)
180+
function Base.collect(d::DArray{T,N}; tree=false, copyto=false) where {T,N}
181+
a = fetch(d)
182+
if isempty(d.chunks)
183+
return Array{eltype(d)}(undef, size(d)...)
184184
end
185185

186-
# Return a scalar, as required by Julia's array interface
187-
if ndims(D) == 0
188-
return fetch(D.chunks[1]; unwrap=true)
186+
if ndims(d) == 0
187+
return fetch(a.chunks[1])
189188
end
190189

191-
if uniform
192-
@assert D.concat === cat "FIXME: Handle non-cat"
193-
A = Array{eltype(D)}(undef, size(D)...)
194-
DA = view(A, D.partitioning; space=CPURAMMemorySpace())
195-
196-
# Perform the equivalent of `copyto!(DA, D)`, but force local updates
197-
# FIXME: Be more parallel?
198-
for idx in eachindex(DA.chunks)
199-
dest = fetch(DA.chunks[idx]; move_value=false, unwrap=true, uniform=true)::AbstractArray
200-
src = fetch(D.chunks[idx]; move_value=true, unwrap=true, uniform=true)::AbstractArray
201-
copyto!(dest, src)
202-
end
190+
if copyto
191+
C = Array{T,N}(undef, size(a))
192+
DC = view(C, Blocks(size(a)...))
193+
copyto!(DC, a)
194+
return C
195+
end
203196

204-
return A
197+
dimcatfuncs = [(x...) -> d.concat(x..., dims=i) for i in 1:ndims(d)]
198+
if tree
199+
collect(fetch(treereduce_nd(map(x -> ((args...,) -> Dagger.@spawn x(args...)) , dimcatfuncs), a.chunks)))
205200
else
206-
if copyto
207-
C = Array{T,N}(undef, size(D))
208-
DC = view(C, Blocks(size(D)...))
209-
copyto!(DC, D)
210-
return C
211-
end
212-
213-
dimcatfuncs = [(x...) -> D.concat(x..., dims=i) for i in 1:ndims(D)]
214-
if tree
215-
collect(fetch(treereduce_nd(map(x -> ((args...,) -> Dagger.@spawn x(args...)) , dimcatfuncs), D.chunks)))
216-
else
217-
treereduce_nd(dimcatfuncs, asyncmap(fetch, D.chunks))
218-
end
201+
collect(treereduce_nd(dimcatfuncs, asyncmap(fetch, a.chunks)))
219202
end
220203
end
221204
Array{T,N}(A::DArray{S,N}) where {T,N,S} = convert(Array{T,N}, collect(A))
@@ -339,8 +322,8 @@ function Base.isequal(x::ArrayOp, y::ArrayOp)
339322
x === y
340323
end
341324

342-
Base.similar(::DArray{T,N} where T, ::Type{S}, dims::Dims{N}) where {S,N} =
343-
DArray{S,N}(undef, dims)
325+
Base.similar(D::DArray{T,N} where T, ::Type{S}, dims::Dims{N}) where {S,N} =
326+
DArray{S,N}(undef, D.partitioning, dims)
344327

345328
Base.copy(x::DArray{T,N,B,F}) where {T,N,B,F} =
346329
map(identity, x)::DArray{T,N,B,F}
@@ -406,18 +389,23 @@ function lookup_parts(A::DArray, ps::AbstractArray, subdmns::DomainBlocks{N}, d:
406389
end
407390

408391
"""
409-
Base.fetch(A::DArray; unwrap::Bool=false, kwargs...) -> DArray
392+
Base.fetch(c::DArray)
410393
411-
Returns a new `DArray` with the same data as `A`, but where all values are
412-
fully computed.
394+
If a `DArray` tree has a `Thunk` in it, make the whole thing a big thunk.
413395
"""
414-
function Base.fetch(A::DArray{T}; unwrap::Bool=false, kwargs...) where T
415-
if any(unwrappable, chunks(A))
416-
tasks = map(t->unwrappable(t) ? fetch(t; unwrap, kwargs...) : t, chunks(A))
417-
B = DArray(T, A.domain, A.subdomains, tasks, A.partitioning, A.concat)
418-
return B
396+
function Base.fetch(c::DArray{T}) where T
397+
if any(istask, chunks(c))
398+
thunks = chunks(c)
399+
sz = size(thunks)
400+
dmn = domain(c)
401+
dmnchunks = domainchunks(c)
402+
return fetch(Dagger.spawn(Options(meta=true), thunks...) do results...
403+
t = eltype(fetch(results[1]))
404+
DArray(t, dmn, dmnchunks, reshape(Any[results...], sz),
405+
c.partitioning, c.concat)
406+
end)
419407
else
420-
return A
408+
return c
421409
end
422410
end
423411

@@ -518,6 +506,7 @@ auto_blocks(A::AbstractArray{T,N}) where {T,N} = auto_blocks(size(A))
518506

519507
const AssignmentType{N} = Union{Symbol, AbstractArray{<:Int, N}, AbstractArray{<:Processor, N}}
520508

509+
distribute(A::AbstractArray, assignment::AssignmentType = :arbitrary) = distribute(A, AutoBlocks(), assignment)
521510
function distribute(A::AbstractArray{T,N}, dist::Blocks{N}, assignment::AssignmentType{N} = :arbitrary) where {T,N}
522511
procgrid = nothing
523512
availprocs = collect(Dagger.compatible_processors())
@@ -558,10 +547,8 @@ function distribute(A::AbstractArray{T,N}, dist::Blocks{N}, assignment::Assignme
558547
procgrid = assignment
559548
end
560549

561-
return _distribute(current_acceleration(), A, dist, procgrid)
550+
return _to_darray(Distribute(dist, A, procgrid))
562551
end
563-
_distribute(::DistributedAcceleration, A::AbstractArray{T,N}, dist::Blocks{N}, procgrid) where {T,N} =
564-
_to_darray(Distribute(dist, A, procgrid))
565552

566553
distribute(A::AbstractArray, ::AutoBlocks, assignment::AssignmentType = :arbitrary) = distribute(A, auto_blocks(A), assignment)
567554
function distribute(x::AbstractArray{T,N}, n::NTuple{N}, assignment::AssignmentType{N} = :arbitrary) where {T,N}
@@ -570,6 +557,7 @@ function distribute(x::AbstractArray{T,N}, n::NTuple{N}, assignment::AssignmentT
570557
end
571558
distribute(x::AbstractVector, n::Int, assignment::AssignmentType{1} = :arbitrary) = distribute(x, (n,), assignment)
572559

560+
573561
DVector(A::AbstractVector{T}, part::Blocks{1}, assignment::AssignmentType{1} = :arbitrary) where T = distribute(A, part, assignment)
574562
DMatrix(A::AbstractMatrix{T}, part::Blocks{2}, assignment::AssignmentType{2} = :arbitrary) where T = distribute(A, part, assignment)
575563
DArray(A::AbstractArray{T,N}, part::Blocks{N}, assignment::AssignmentType{N} = :arbitrary) where {T,N} = distribute(A, part, assignment)
@@ -582,26 +570,29 @@ DVector(A::AbstractVector{T}, ::AutoBlocks, assignment::AssignmentType{1} = :arb
582570
DMatrix(A::AbstractMatrix{T}, ::AutoBlocks, assignment::AssignmentType{2} = :arbitrary) where T = DMatrix(A, auto_blocks(A), assignment)
583571
DArray(A::AbstractArray, ::AutoBlocks, assignment::AssignmentType = :arbitrary) = DArray(A, auto_blocks(A), assignment)
584572

585-
@warn "Add assignment to undef initializer" maxlog=1
586-
function DArray{T,N}(::UndefInitializer, dims::NTuple{N,Int}) where {T,N}
587-
dist = auto_blocks(dims)
588-
return DArray{T,N}(undef, dist, dims...)
589-
end
590-
function DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::NTuple{N,Int}) where {T,N}
591-
domain = ArrayDomain(ntuple(i->1:dims[i], N))
573+
struct AllocateUndef{S} end
574+
(::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = Array{S,N}(undef, dims)
575+
function DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N}
576+
domain = ArrayDomain(map(x->1:x, dims))
592577
subdomains = partition(dist, domain)
593-
tasks = Array{DTask,N}(undef, size(subdomains)...)
594-
Dagger.spawn_datadeps() do
595-
for (i, x) in enumerate(subdomains)
596-
tasks[i] = Dagger.@spawn allocate_array_undef(T, size(x))
597-
end
598-
end
599-
return DArray(T, domain, subdomains, tasks, dist)
600-
end
601-
DArray{T,N}(::UndefInitializer, dims::Vararg{Int,N}) where {T,N} =
602-
DArray{T,N}(undef, auto_blocks((dims...,)), (dims...,))
603-
DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::Vararg{Int,N}) where {T,N} =
604-
DArray{T,N}(undef, dist, (dims...,))
578+
a = AllocateArray(T, AllocateUndef{T}(), false, domain, subdomains, dist, assignment)
579+
return _to_darray(a)
580+
end
581+
DArray{T,N}(::UndefInitializer, dist::Blocks{N}, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} =
582+
DArray{T,N}(undef, dist, (dims...,); assignment)
583+
DArray{T,N}(::UndefInitializer, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} =
584+
DArray{T,N}(undef, auto_blocks(dims), dims; assignment)
585+
DArray{T,N}(::UndefInitializer, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} =
586+
DArray{T,N}(undef, auto_blocks((dims...,)), (dims...,); assignment)
587+
588+
DArray{T}(::UndefInitializer, dist::Blocks{N}, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} =
589+
DArray{T,N}(undef, dist, dims; assignment)
590+
DArray{T}(::UndefInitializer, dist::Blocks{N}, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} =
591+
DArray{T,N}(undef, dist, (dims...,); assignment)
592+
DArray{T}(::UndefInitializer, dims::NTuple{N,Int}; assignment::AssignmentType{N} = :arbitrary) where {T,N} =
593+
DArray{T,N}(undef, auto_blocks(dims), dims; assignment)
594+
DArray{T}(::UndefInitializer, dims::Vararg{Int,N}; assignment::AssignmentType{N} = :arbitrary) where {T,N} =
595+
DArray{T,N}(undef, auto_blocks((dims...,)), (dims...,); assignment)
605596

606597
function Base.:(==)(x::ArrayOp{T,N}, y::AbstractArray{S,N}) where {T,S,N}
607598
collect(x) == y
@@ -622,7 +613,7 @@ end
622613
mapchunk(f, chunk) = tochunk(f(poolget(chunk.handle)))
623614
function mapchunks(f, d::DArray{T,N,F}) where {T,N,F}
624615
chunks = map(d.chunks) do chunk
625-
owner = get_parent(chunk.processor).pid
616+
owner = root_worker_id(chunk.processor)
626617
remotecall_fetch(mapchunk, owner, f, chunk)
627618
end
628619
DArray{T,N,F}(d.domain, d.subdomains, chunks, d.concat)

0 commit comments

Comments
 (0)