Skip to content

Commit ed03fb0

Browse files
committed
Flush deferred sparse releases during memory reclaim
1 parent cfb8cd1 commit ed03fb0

3 files changed

Lines changed: 33 additions & 1 deletion

File tree

lib/level-zero/utils.jl

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,23 @@
11
isdebug(group) = Base.CoreLogging.current_logger_for_env(Base.CoreLogging.Debug, group, oneL0) !== nothing
22

3+
# Registered callbacks invoked during memory reclamation (e.g., flushing deferred MKL
4+
# sparse handle releases). Extensions like oneMKL can register cleanup functions here
5+
# so they run when Level Zero reports OOM or when proactive GC fires.
6+
const _reclaim_callbacks = Function[]
7+
8+
"""
    register_reclaim_callback!(f::Function)

Append `f` to the list of callbacks that `_run_reclaim_callbacks` invokes, and return
that list. `f` is called with no arguments.
"""
function register_reclaim_callback!(f::Function)
    push!(_reclaim_callbacks, f)
    return _reclaim_callbacks
end
11+
12+
"""
    _run_reclaim_callbacks()

Invoke every callback in `_reclaim_callbacks`, in registration order, and return
`nothing`.

Reclamation is best-effort: a callback that throws does not prevent the remaining
callbacks from running and does not abort the caller. The failure is reported through
the logger rather than being discarded. An `InterruptException` is rethrown so that a
user interrupt is not swallowed.
"""
function _run_reclaim_callbacks()
    for cb in _reclaim_callbacks
        try
            cb()
        catch err
            # Never eat a user interrupt (Ctrl-C).
            err isa InterruptException && rethrow()
            # Keep going: the remaining callbacks may still be able to free memory.
            # `maxlog` bounds the output when a broken callback fails on every call.
            @warn "Memory reclaim callback failed" callback = cb exception =
                (err, catch_backtrace()) maxlog = 10
        end
    end
    return nothing
end
20+
321
function retry_reclaim(f, isfailed)
422
ret = f()
523

@@ -11,6 +29,12 @@ function retry_reclaim(f, isfailed)
1129
GC.gc(false)
1230
elseif phase == 2
1331
GC.gc(true)
32+
elseif phase == 3
33+
# After GC, finalizers may have deferred resource releases (e.g., MKL
34+
# sparse handles). Flush them now, then GC again to free the memory
35+
# those releases made available.
36+
_run_reclaim_callbacks()
37+
GC.gc(true)
1438
else
1539
break
1640
end

lib/mkl/oneMKL.jl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ include("linalg.jl")
3131
include("interfaces.jl")
3232
include("fft.jl")
3333

34+
# Register deferred sparse handle flush as a memory reclaim callback so that OOM
35+
# recovery (retry_reclaim) and proactive GC (_maybe_gc) can free MKL internal buffers
36+
# associated with sparse matrix handles that were deferred from finalizer threads.
37+
# NOTE(review): this appears to run at module top level (between the `include`s and
# `version()`), so the callback would be registered at load/precompile time — confirm the
# registration is still present at runtime, or move it into `__init__`.
oneL0.register_reclaim_callback!(flush_deferred_sparse_releases)
38+
3439
function version()
3540
major = Ref{Int64}()
3641
minor = Ref{Int64}()

src/pool.jl

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,11 @@ function _maybe_gc(dev, bytes)
1818
allocated <= 0 && return
1919
total_mem = _get_total_mem(dev)
2020
if allocated + bytes > total_mem * 0.8
21+
# Flush deferred resource releases (e.g., MKL sparse handles) from previous GC
22+
# cycles first — these are safe to release now because they were deferred earlier.
23+
# Do this BEFORE GC to avoid racing with new finalizers.
24+
oneL0._run_reclaim_callbacks()
2125
# Full GC to collect old-generation objects whose finalizers free GPU memory.
22-
# GC.gc(false) only does minor collection which won't reclaim promoted objects.
2326
GC.gc(true)
2427
elseif allocated + bytes > total_mem * 0.4
2528
GC.gc(false)

0 commit comments

Comments
 (0)