-
Notifications
You must be signed in to change notification settings - Fork 34
Expand file tree
/
Copy pathpool.jl
More file actions
86 lines (70 loc) · 3.16 KB
/
pool.jl
File metadata and controls
86 lines (70 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Bookkeeping for proactive garbage collection of GPU memory.
#
# Julia's GC only reacts to CPU memory pressure, so stale GPU buffers whose
# finalizers have not run yet can exhaust device memory. Following the approach
# used by AMDGPU.jl, we keep a running total of allocated GPU bytes (device and
# shared buffers) and trigger a collection before hitting OOM.

# Running total, in bytes, of live device + shared buffer allocations.
const _allocated_bytes = Threads.Atomic{Int64}(zero(Int64))

# Cached total device memory size in bytes; 0 means "not queried yet".
const _total_mem_cache = Threads.Atomic{Int64}(zero(Int64))
# Return the total memory size (in bytes) used as the GC threshold base.
#
# The value is read from `_total_mem_cache` when that is already positive.
# Otherwise `dev` is queried through `oneL0.memory_properties`, which must yield
# exactly one entry (`only` throws if it does not), and the result is stored with
# a compare-and-swap so that a concurrent caller that won the race is not
# overwritten.
#
# Note: the cache is a single process-wide value and is not keyed by `dev`; once
# it is populated, later calls return it without consulting `dev`.
function _get_total_mem(dev)
    known = _total_mem_cache[]
    if known <= 0
        props = only(oneL0.memory_properties(dev))
        # Only install our value if the cache is still unset (0).
        Threads.atomic_cas!(_total_mem_cache, Int64(0), Int64(props.totalSize))
        known = _total_mem_cache[]
    end
    return known
end
# Run the Julia GC ahead of an allocation of `bytes` on `dev` when tracked GPU
# memory usage is getting high.
#
# Nothing happens while the tracked total is zero or negative. Otherwise the
# projected usage (tracked total + `bytes`) is compared with the total memory
# reported by `_get_total_mem(dev)`:
#   * above 80%: flush deferred reclaim callbacks, then run a full collection;
#   * above 40%: run an incremental collection.
#
# NOTE(review): if `_get_total_mem` ever returns 0, both thresholds are 0 and any
# positive projected usage takes the full-collection branch — confirm the driver
# never reports a zero total.
function _maybe_gc(dev, bytes)
    in_use = _allocated_bytes[]
    if in_use <= 0
        return
    end

    capacity = _get_total_mem(dev)
    projected = in_use + bytes

    if projected > capacity * 0.8
        # Flush deferred resource releases (e.g., MKL sparse handles) left over from
        # previous GC cycles first; they were deferred earlier, so releasing them now
        # is safe. This must happen BEFORE the GC to avoid racing with new finalizers.
        oneL0._run_reclaim_callbacks()
        # Full GC so that old-generation objects whose finalizers free GPU memory
        # are collected as well.
        return GC.gc(true)
    elseif projected > capacity * 0.4
        return GC.gc(false)
    end
    return nothing
end
# Allocate a device buffer of `bytes` bytes with the given `alignment` in context
# `ctx` on device `dev`.
#
# A zero-byte request returns a `DeviceBuffer` wrapping `ZE_NULL` without calling
# the allocator or touching the allocation counter. Otherwise `_maybe_gc` is
# given a chance to run first, the buffer is allocated and passed to
# `make_resident`, and `bytes` is added to `_allocated_bytes`.
function allocate(::Type{oneL0.DeviceBuffer}, ctx, dev, bytes::Int, alignment::Int)
    if bytes == 0
        return oneL0.DeviceBuffer(ZE_NULL, bytes, ctx, dev)
    end

    _maybe_gc(dev, bytes)

    buffer = device_alloc(ctx, dev, bytes, alignment)
    make_resident(ctx, dev, buffer)

    # Account for the allocation only after it has succeeded.
    Threads.atomic_add!(_allocated_bytes, Int64(bytes))
    return buffer
end
# Allocate a shared buffer of `bytes` bytes with the given `alignment` in context
# `ctx`, associated with device `dev`.
#
# A zero-byte request returns a `SharedBuffer` wrapping `ZE_NULL` without calling
# the allocator or touching the allocation counter. Otherwise `_maybe_gc` is
# given a chance to run first, the buffer is allocated and passed to
# `make_resident`, and `bytes` is added to `_allocated_bytes`.
function allocate(::Type{oneL0.SharedBuffer}, ctx, dev, bytes::Int, alignment::Int)
    if bytes == 0
        return oneL0.SharedBuffer(ZE_NULL, bytes, ctx, dev)
    end

    # TODO: support cross-device shared buffers (by setting `dev=nothing`)
    _maybe_gc(dev, bytes)

    buffer = shared_alloc(ctx, dev, bytes, alignment)
    make_resident(ctx, dev, buffer)

    # Account for the allocation only after it has succeeded.
    Threads.atomic_add!(_allocated_bytes, Int64(bytes))
    return buffer
end
# Allocate a host buffer of `bytes` bytes with the given `alignment` in context
# `ctx`.
#
# A zero-byte request returns a `HostBuffer` wrapping `ZE_NULL` without calling
# the allocator. `dev` is accepted for signature parity with the other
# `allocate` methods but is not used here, and host allocations are not added to
# `_allocated_bytes`.
function allocate(::Type{oneL0.HostBuffer}, ctx, dev, bytes::Int, alignment::Int)
    if bytes == 0
        return oneL0.HostBuffer(ZE_NULL, bytes, ctx)
    end
    return host_alloc(ctx, bytes, alignment)
end
# Free `buf` and update the GPU allocation counter.
#
# Zero-sized buffers are ignored. For device and shared buffers the buffer size
# is subtracted from `_allocated_bytes` (host buffers are not tracked). The
# memory is then freed with the blocking-free policy. Always returns `nothing`.
function release(buf::oneL0.AbstractBuffer)
    nbytes = sizeof(buf)
    if nbytes == 0
        return
    end

    # Only device and shared allocations are counted towards GPU memory usage.
    if buf isa Union{oneL0.DeviceBuffer, oneL0.SharedBuffer}
        Threads.atomic_sub!(_allocated_bytes, Int64(nbytes))
    end

    # XXX: is it necessary to evict memory if we are going to free it?
    #      Eviction is racy because it is not queue-ordered, and we do not want
    #      to synchronize inside what could have been a GC-driven finalizer. If
    #      it turns out to be needed, port the stream/queue tracking from CUDA.jl
    #      so that only the queue associated with the buffer is synchronized.
    #if buf isa oneL0.DeviceBuffer || buf isa oneL0.SharedBuffer
    #    ctx = oneL0.context(buf)
    #    dev = oneL0.device(buf)
    #    evict(ctx, dev, buf)
    #end

    free(buf; policy=oneL0.ZE_DRIVER_MEMORY_FREE_POLICY_EXT_FLAG_BLOCKING_FREE)

    # TODO: queue-ordered free from non-finalizer tasks once we have
    #       `zeMemFreeAsync(ptr, queue)`

    return
end