1+ # Track total allocated GPU memory (device + shared buffers) for proactive GC.
2+ # This mirrors AMDGPU.jl's approach: trigger GC before OOM so that finalizers
3+ # can free stale GPU buffers that Julia's GC hasn't collected yet (Julia's GC
4+ # only sees CPU memory pressure, not GPU memory pressure).
# Running total, in bytes, of the device and shared buffers handed out by
# `allocate` and not yet passed to `release`.
const _allocated_bytes = Threads.Atomic{Int64}(0)
# Memoized total memory size in bytes; a value of 0 means "not queried yet".
const _total_mem_cache = Threads.Atomic{Int64}(0)
7+
# Return the total memory size in bytes used for the GC thresholds, calling
# `oneL0.memory_properties(dev)` only while the cache is still empty.
#
# NOTE: `_total_mem_cache` is a single process-wide value that is not keyed by
# `dev`; once it holds a positive size, `dev` is ignored and that stored size is
# returned for every device.
function _get_total_mem(dev)
    known = _total_mem_cache[]
    if known > 0
        return known
    end

    # `only` throws unless the query yields exactly one memory-properties entry.
    props = only(oneL0.memory_properties(dev))

    # Store the size only if the cache is still unset, then re-read it so that
    # concurrent first callers all return whichever value was stored first.
    Threads.atomic_cas!(_total_mem_cache, Int64(0), Int64(props.totalSize))
    return _total_mem_cache[]
end
15+
# Run the Julia garbage collector ahead of an allocation of `bytes` bytes when the
# tracked GPU memory usage would cross a fraction of the total memory size:
#   - above 80%: full collection
#   - above 40%: incremental collection
# Does nothing while no tracked memory is outstanding. Always returns `nothing`.
function _maybe_gc(dev, bytes)
    in_use = _allocated_bytes[]
    in_use > 0 || return

    capacity = _get_total_mem(dev)
    projected = in_use + bytes

    if projected > capacity * 0.8
        # A full collection is needed to reach old-generation objects whose
        # finalizers free GPU memory; GC.gc(false) only performs a minor
        # collection, which won't reclaim promoted objects.
        GC.gc(true)
    elseif projected > capacity * 0.4
        GC.gc(false)
    end
    return nothing
end
28+
# Allocate a device buffer of `bytes` bytes with the requested `alignment` on
# `dev` within `ctx`, make it resident, and add its size to the global
# `_allocated_bytes` counter. A zero-byte request returns a buffer wrapping
# `ZE_NULL` without allocating or touching the counter.
function allocate(::Type{oneL0.DeviceBuffer}, ctx, dev, bytes::Int, alignment::Int)
    if bytes == 0
        return oneL0.DeviceBuffer(ZE_NULL, bytes, ctx, dev)
    end

    # Give the GC a chance to run buffer finalizers before asking for more memory.
    _maybe_gc(dev, bytes)

    buffer = device_alloc(ctx, dev, bytes, alignment)
    make_resident(ctx, dev, buffer)

    # Count the buffer only after allocation and residency both succeeded.
    Threads.atomic_add!(_allocated_bytes, Int64(bytes))

    return buffer
end
@@ -12,8 +42,10 @@ function allocate(::Type{oneL0.SharedBuffer}, ctx, dev, bytes::Int, alignment::I
1242
1343 # TODO : support cross-device shared buffers (by setting `dev=nothing`)
1444
45+ _maybe_gc (dev, bytes)
1546 buf = shared_alloc (ctx, dev, bytes, alignment)
1647 make_resident (ctx, dev, buf)
48+ Threads. atomic_add! (_allocated_bytes, Int64 (bytes))
1749
1850 return buf
1951end
2658function release (buf:: oneL0.AbstractBuffer )
2759 sizeof (buf) == 0 && return
2860
61+ if buf isa oneL0. DeviceBuffer || buf isa oneL0. SharedBuffer
62+ Threads. atomic_sub! (_allocated_bytes, Int64 (sizeof (buf)))
63+ end
64+
2965 # XXX : is it necessary to evice memory if we are going to free it?
3066 # this is racy, because eviction is not queue-ordered, and
3167 # we don't want to synchronize inside what could have been a
0 commit comments