Skip to content

Commit f0f03fe

Browse files
committed
Memory tracking for GC
1 parent 1dc24f8 commit f0f03fe

1 file changed

Lines changed: 36 additions & 0 deletions

File tree

src/pool.jl

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,38 @@
1+
# Track total allocated GPU memory (device + shared buffers) for proactive GC.
2+
# This mirrors AMDGPU.jl's approach: trigger GC before OOM so that finalizers
3+
# can free stale GPU buffers that Julia's GC hasn't collected yet (Julia's GC
4+
# only sees CPU memory pressure, not GPU memory pressure).
5+
const _allocated_bytes = Threads.Atomic{Int64}(0)
6+
const _total_mem_cache = Threads.Atomic{Int64}(0)
7+
8+
# Return the total device memory size (in bytes, as an `Int64`) used as the
# reference point for GC thresholds.
#
# The value is read from `oneL0.memory_properties(dev)` the first time it is needed
# and then kept in the global `_total_mem_cache`. Because that cache is a single
# process-wide slot, once it holds a positive value it is returned for every
# subsequent call without consulting `dev` again.
#
# `only` throws unless the device reports exactly one memory-properties entry.
function _get_total_mem(dev)
    known = _total_mem_cache[]
    if known > 0
        return known
    end

    props = only(oneL0.memory_properties(dev))

    # Publish the size only if the slot is still unset (0); when another thread
    # got there first its value is kept, and the re-read below returns whatever
    # ended up in the cache.
    Threads.atomic_cas!(_total_mem_cache, Int64(0), Int64(props.totalSize))
    return _total_mem_cache[]
end
15+
16+
# Run Julia's garbage collector ahead of an allocation of `bytes` bytes on `dev`
# when the tracked GPU memory usage is getting high, so that finalizers get a
# chance to free stale buffers.
#
# The projected usage (`_allocated_bytes` plus `bytes`) is compared against the
# total memory reported by `_get_total_mem(dev)`:
#   - above 80%: full collection, `GC.gc(true)`
#   - above 40%: incremental collection, `GC.gc(false)`
#   - otherwise: nothing happens
# Nothing is done either while the tracked counter is zero or negative.
function _maybe_gc(dev, bytes)
    in_use = _allocated_bytes[]
    in_use > 0 || return

    capacity = _get_total_mem(dev)
    projected = in_use + bytes

    if projected > capacity * 0.8
        # A full sweep is required here: buffers that were promoted to the old
        # generation are not reclaimed by a minor collection, and it is their
        # finalizers that release GPU memory.
        GC.gc(true)
    elseif projected > capacity * 0.4
        GC.gc(false)
    end
end
28+
129
# Allocate a `oneL0.DeviceBuffer` of `bytes` bytes with the given `alignment` in
# context `ctx` on device `dev`, and return it.
#
# A request for zero bytes returns a buffer wrapping `ZE_NULL` without allocating,
# triggering GC, or touching the memory counter.
function allocate(::Type{oneL0.DeviceBuffer}, ctx, dev, bytes::Int, alignment::Int)
    if bytes == 0
        return oneL0.DeviceBuffer(ZE_NULL, bytes, ctx, dev)
    end

    # Let the GC run pending buffer finalizers first if tracked usage is high.
    _maybe_gc(dev, bytes)

    buffer = device_alloc(ctx, dev, bytes, alignment)
    make_resident(ctx, dev, buffer)

    # Count the request only once `device_alloc` and `make_resident` have returned.
    Threads.atomic_add!(_allocated_bytes, Int64(bytes))

    return buffer
end
@@ -12,8 +42,10 @@ function allocate(::Type{oneL0.SharedBuffer}, ctx, dev, bytes::Int, alignment::I
1242

1343
# TODO: support cross-device shared buffers (by setting `dev=nothing`)
1444

45+
_maybe_gc(dev, bytes)
1546
buf = shared_alloc(ctx, dev, bytes, alignment)
1647
make_resident(ctx, dev, buf)
48+
Threads.atomic_add!(_allocated_bytes, Int64(bytes))
1749

1850
return buf
1951
end
@@ -26,6 +58,10 @@ end
2658
function release(buf::oneL0.AbstractBuffer)
2759
sizeof(buf) == 0 && return
2860

61+
if buf isa oneL0.DeviceBuffer || buf isa oneL0.SharedBuffer
62+
Threads.atomic_sub!(_allocated_bytes, Int64(sizeof(buf)))
63+
end
64+
2965
# XXX: is it necessary to evict memory if we are going to free it?
3066
# this is racy, because eviction is not queue-ordered, and
3167
# we don't want to synchronize inside what could have been a

0 commit comments

Comments
 (0)