# core definition of the AbstractGPUArray type
# storage handling
export DataRef, unsafe_free!
# DataRef provides a helper type to manage the storage of an array.
#
# There are multiple reasons we don't just put the data directly in a GPUArray struct:
# - to share data between multiple arrays, e.g., to create views;
# - to be able to free data early and reduce pressure on the GC.
#
# To support this, wrap the data in a DataRef instead, and use it with the following methods:
# - `ref[]`: get the data;
# - `copy(ref)`: create a new reference, increasing the reference count;
# - `unsafe_free!(ref)`: decrease the reference count, and free the data if it reaches 0.
#
# The contained RefCounted struct should not be used directly.
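#
# A usage sketch (the buffer type and finalizer here are arbitrary, just for illustration):
#
#     data = Vector{UInt8}(undef, 1024)
#     ref  = DataRef(d -> empty!(d), data)   # wrap the data; refcount starts at 1
#     ref2 = copy(ref)                       # share it, e.g., with a view; refcount is now 2
#     ref[] === data                         # access the underlying storage
#     unsafe_free!(ref)                      # refcount drops to 1; data stays alive
#     unsafe_free!(ref2)                     # refcount hits 0; the finalizer is invoked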
# shared, reference-counted state.
mutable struct RefCounted{D}
obj::D
finalizer
count::Threads.Atomic{Int}
end
function retain(rc::RefCounted)
if rc.count[] == 0
throw(ArgumentError("Attempt to retain freed data."))
end
Threads.atomic_add!(rc.count, 1)
return
end
function release(rc::RefCounted, args...)
if rc.count[] == 0
throw(ArgumentError("Attempt to release freed data."))
end
refcount = Threads.atomic_add!(rc.count, -1)
if refcount == 1 && rc.finalizer !== nothing
rc.finalizer(rc.obj, args...)
end
return
end
function Base.getindex(rc::RefCounted)
if rc.count[] == 0
throw(ArgumentError("Attempt to use freed data."))
end
rc.obj
end
# per-object state, with a flag to indicate whether the object has been freed.
# this is to support multiple calls to `unsafe_free!` on the same object,
# while only lowering the reference count of the underlying data once.
mutable struct DataRef{D}
rc::RefCounted{D}
freed::Bool
cached::Bool
end
function DataRef(finalizer, ref::D) where {D}
rc = RefCounted{D}(ref, finalizer, Threads.Atomic{Int}(1))
DataRef{D}(rc, false, false)
end
DataRef(ref; kwargs...) = DataRef(nothing, ref; kwargs...)
Base.sizeof(ref::DataRef) = sizeof(ref.rc[])
function Base.getindex(ref::DataRef)
if ref.freed
throw(ArgumentError("Attempt to use a freed reference."))
end
ref.rc[]
end
function Base.copy(ref::DataRef{D}) where {D}
if ref.freed
throw(ArgumentError("Attempt to copy a freed reference."))
end
retain(ref.rc)
# copies of cached references are not managed by the cache, so
# we need to mark them as such to make sure their refcount can drop.
return DataRef{D}(ref.rc, false, false)
end
function unsafe_free!(ref::DataRef)
if ref.cached
# lifetimes of cached references are tied to the cache.
return
end
if ref.freed
# multiple frees *of the same object* are allowed.
# we should only ever call `release` once per object, though,
# as multiple releases of the underlying data are not allowed.
return
end
ref.freed = true
release(ref.rc)
return
end
# array methods
storage(x::AbstractGPUArray) = error("Not implemented") # COV_EXCL_LINE
"""
unsafe_free!(a::AbstractGPUArray)
Release the memory of an array for reuse by future allocations. This operation is
performed automatically by the GC when an array goes out of scope, but can be called
earlier to reduce pressure on the memory allocator.
"""
unsafe_free!(x::AbstractGPUArray) = unsafe_free!(storage(x))
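# Typical use is to free an array as soon as it's known to be dead (sketch; `CuArray`
# stands in for any backend's concrete array type):
#
#     a = CuArray{Float32}(undef, 1024)
#     b = a .+ 1
#     unsafe_free!(a)   # `a` is no longer needed; release its memory without waiting for GC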
# input/output
## serialization
using Serialization: AbstractSerializer, serialize_type
function Serialization.serialize(s::AbstractSerializer, @nospecialize(t::AbstractGPUArray))
serialize_type(s, typeof(t))
serialize(s, Array(t))
end
function Serialization.deserialize(s::AbstractSerializer, ::Type{T}) where T <: AbstractGPUArray
A = deserialize(s)
T(A)
end
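# Round-trip sketch (requires `using Serialization`; `CuArray` is just an example type):
#
#     io = IOBuffer()
#     serialize(io, CuArray([1, 2, 3]))   # the data is serialized as a CPU `Array`
#     seekstart(io)
#     deserialize(io)                     # reconstructed as a GPU array from the CPU copy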
## showing
struct ToArray end
Adapt.adapt_storage(::ToArray, xs::AbstractGPUArray) = convert(Array, xs)
# display: show is called on the materialised CPU copy, so no need to
# specialize the forwarders per element type / wrapper.
Base.print_array(io::IO, @nospecialize(X::AnyGPUArray)) =
Base.print_array(io, adapt(ToArray(), X))
# show
Base._show_nonempty(io::IO, @nospecialize(X::AnyGPUArray), prefix::String) =
Base._show_nonempty(io, adapt(ToArray(), X), prefix)
Base._show_empty(io::IO, @nospecialize(X::AnyGPUArray)) =
Base._show_empty(io, adapt(ToArray(), X))
Base.show_vector(io::IO, @nospecialize(v::AnyGPUArray), args...) =
Base.show_vector(io, adapt(ToArray(), v), args...)
## collect to CPU (discarding wrapper type)
collect_to_cpu(xs::AbstractArray) = collect(adapt(ToArray(), xs))
Base.collect(X::AnyGPUArray) = collect_to_cpu(X)
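# For example (sketch; `CuArray` stands in for any backend's array type):
#
#     collect(CuArray([1 2; 3 4]))            # plain 2×2 host Array
#     collect(view(CuArray([1, 2, 3]), 2:3))  # wrapped GPU arrays are also materialized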
# memory copying
# expects the GPU array type to have linear `copyto!` methods (i.e. accepting an integer
# offset and length) from and to CPU arrays and between GPU arrays.
for (D, S) in ((AnyGPUArray, Array),
(Array, AnyGPUArray),
(AnyGPUArray, AnyGPUArray))
@eval begin
function Base.copyto!(dest::$D{<:Any, N}, rdest::UnitRange,
src::$S{<:Any, N}, ssrc::UnitRange) where {N}
drange = CartesianIndices((rdest,))
srange = CartesianIndices((ssrc,))
copyto!(dest, drange, src, srange)
end
Base.copyto!(dest::$D, src::$S) = copyto!(dest, 1, src, 1, length(src))
end
end
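# The definitions above only reduce range-based and whole-array copies to the linear form;
# a backend still provides those linear methods itself, roughly like this (sketch;
# `MyGPUArray` is a hypothetical concrete subtype of `AbstractGPUArray`):
#
#     Base.copyto!(dest::MyGPUArray, doffs::Integer, src::Array, soffs::Integer, n::Integer)
#     Base.copyto!(dest::Array, doffs::Integer, src::MyGPUArray, soffs::Integer, n::Integer)
#     Base.copyto!(dest::MyGPUArray, doffs::Integer, src::MyGPUArray, soffs::Integer, n::Integer)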
# kernel-based variant for copying between wrapped GPU arrays
@kernel function linear_copy_kernel!(dest, dstart, src, sstart, n)
i = @index(Global, Linear)
if i <= n
@inbounds dest[dstart+i-1] = src[sstart+i-1]
end
end
function Base.copyto!(dest::AnyGPUArray, dstart::Integer,
src::AnyGPUArray, sstart::Integer, n::Integer)
n == 0 && return dest
n < 0 && throw(ArgumentError(string("tried to copy n=", n, " elements, but n should be nonnegative")))
destinds, srcinds = LinearIndices(dest), LinearIndices(src)
(checkbounds(Bool, destinds, dstart) && checkbounds(Bool, destinds, dstart+n-1)) || throw(BoundsError(dest, dstart:dstart+n-1))
(checkbounds(Bool, srcinds, sstart) && checkbounds(Bool, srcinds, sstart+n-1)) || throw(BoundsError(src, sstart:sstart+n-1))
kernel = linear_copy_kernel!(get_backend(dest))
kernel(dest, dstart, src, sstart, n; ndrange=n)
return dest
end
# variants that materialize the GPU wrapper before copying from or to the CPU
function Base.copyto!(dest::Array, dstart::Integer,
src::WrappedGPUArray, sstart::Integer, n::Integer)
n == 0 && return dest
temp = similar(parent(src), n)
copyto!(temp, 1, src, sstart, n)
copyto!(dest, dstart, temp, 1, n)
return dest
end
function Base.copyto!(dest::WrappedGPUArray, dstart::Integer,
src::Array, sstart::Integer, n::Integer)
n == 0 && return dest
temp = similar(parent(dest), n)
copyto!(temp, 1, src, sstart, n)
copyto!(dest, dstart, temp, 1, n)
return dest
end
# variants that convert values on the CPU when there's a type mismatch
#
# we prefer to convert on the CPU, where there's typically more memory and less memory
# pressure, making these very lightweight conversions cheap to perform
function Base.copyto!(dest::Array{T}, dstart::Integer,
src::AnyGPUArray{U}, sstart::Integer,
n::Integer) where {T,U}
n == 0 && return dest
temp = Vector{U}(undef, n)
copyto!(temp, 1, src, sstart, n)
copyto!(dest, dstart, temp, 1, n)
return dest
end
function Base.copyto!(dest::AnyGPUArray{T}, dstart::Integer,
src::Array{U}, sstart::Integer, n::Integer) where {T,U}
n == 0 && return dest
temp = Vector{T}(undef, n)
copyto!(temp, 1, src, sstart, n)
copyto!(dest, dstart, temp, 1, n)
return dest
end
## generalized blocks of heterogeneous memory
@kernel function cartesian_copy_kernel!(dest, dest_offsets, src, src_offsets)
I = @index(Global, Cartesian)
@inbounds dest[I + dest_offsets] = src[I + src_offsets]
end
function Base.copyto!(dest::AnyGPUArray{<:Any, N}, destcrange::CartesianIndices{N},
src::AnyGPUArray{<:Any, N}, srccrange::CartesianIndices{N}) where {N}
shape = size(destcrange)
if shape != size(srccrange)
throw(ArgumentError("Ranges don't match their size. Found: $shape, $(size(srccrange))"))
end
len = length(destcrange)
len == 0 && return dest
# linear copy if we can
if N == 1
d_offset = first(destcrange)[1]
s_offset = first(srccrange)[1]
return copyto!(dest, d_offset, src, s_offset, len)
end
dest_offsets = first(destcrange) - oneunit(CartesianIndex{N})
src_offsets = first(srccrange) - oneunit(CartesianIndex{N})
kernel = cartesian_copy_kernel!(get_backend(dest))
kernel(dest, dest_offsets, src, src_offsets; ndrange=shape)
dest
end
for (dstTyp, srcTyp) in (AbstractGPUArray=>Array, Array=>AbstractGPUArray)
@eval function Base.copyto!(dst::$dstTyp{T,N}, dstrange::CartesianIndices{N},
src::$srcTyp{T,N}, srcrange::CartesianIndices{N}) where {T,N}
isempty(dstrange) && return dst
if size(dstrange) != size(srcrange)
throw(ArgumentError("source and destination must have same size (got $(size(srcrange)) and $(size(dstrange)))"))
end
len = length(dstrange)
len == 0 && return dst
# linear copy if we can
if N == 1
d_offset = first(dstrange)[1]
s_offset = first(srcrange)[1]
return copyto!(dst, d_offset, src, s_offset, len)
end
# figure out how many dimensions of the Cartesian ranges map onto contiguous memory
# in both source and destination. we will copy these one by one as linear ranges.
contiguous_dims = 1
for dim in 2:N
# a slice is broken up if the previous dimension didn't cover the entire range
if axes(src, dim-1) == axes(srcrange, dim-1) &&
axes(dst, dim-1) == axes(dstrange, dim-1)
contiguous_dims = dim
else
break
end
end
m = prod(size(dstrange)[1:contiguous_dims]) # inner, contiguous length
n = prod(size(dstrange)[contiguous_dims+1:end]) # outer non-contiguous length
@assert m*n == length(srcrange) == length(dstrange)
# copy linear slices
for i in 1:m:m*n
srcoff = LinearIndices(src)[srcrange[i]]
dstoff = LinearIndices(dst)[dstrange[i]]
# TODO: Use asynchronous memory copies
copyto!(dst, dstoff, src, srcoff, m)
end
dst
end
end
## other
Base.copy(x::AbstractGPUArray) = error("Not implemented") # COV_EXCL_LINE
Base.deepcopy_internal(@nospecialize(x::AbstractGPUArray), ::IdDict) = copy(x)
# filtering
# TODO: filter!
# revert of JuliaLang/julia#31929
Base.filter(f, As::AbstractGPUArray) = As[map(f, As)::AbstractGPUArray{Bool}]
# appending
function Base.append!(a::AbstractGPUVector, items::AbstractVector)
n = length(items)
resize!(a, length(a) + n)
copyto!(a, length(a) - n + 1, items, firstindex(items), n)
return a
end
# this is needed because copyto! of most GPU arrays
# doesn't currently support Tuple sources
function Base.append!(a::AbstractGPUVector, @nospecialize(items::Tuple))
append!(a, collect(items))
return a
end
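# Usage sketch (`CuArray` stands in for any backend's vector type):
#
#     a = CuArray([1, 2])
#     append!(a, [3, 4])    # resizes `a` and copies the new elements onto the device
#     append!(a, (5, 6))    # the tuple is first collected into a CPU Vector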