File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 11isdebug (group) = Base. CoreLogging. current_logger_for_env (Base. CoreLogging. Debug, group, oneL0) != = nothing
22
3+ # Registered callbacks invoked during memory reclamation (e.g., flushing deferred MKL
4+ # sparse handle releases). Extensions like oneMKL can register cleanup functions here
5+ # so they run when Level Zero reports OOM or when proactive GC fires.
6+ const _reclaim_callbacks = Function[]
7+
"""
    register_reclaim_callback!(f::Function)

Append `f` to `_reclaim_callbacks`, the list of cleanup functions that
`_run_reclaim_callbacks` invokes with no arguments.

Returns the updated callback list (the result of `push!`).
"""
function register_reclaim_callback!(f::Function)
    return push!(_reclaim_callbacks, f)
end
11+
"""
    _run_reclaim_callbacks()

Call every function stored in `_reclaim_callbacks`, in iteration order, with no
arguments.

Running the callbacks is best-effort: an exception thrown by one callback is logged
as a warning and neither stops the remaining callbacks nor propagates to the caller.

Returns `nothing`.
"""
function _run_reclaim_callbacks()
    for cb in _reclaim_callbacks
        try
            cb()
        catch err
            # Keep going so a single faulty callback cannot block the others, but
            # leave a trace instead of silently discarding the failure.
            @warn(
                "Memory reclaim callback failed",
                callback = cb,
                exception = (err, catch_backtrace()),
            )
        end
    end
    return nothing
end
20+
321function retry_reclaim (f, isfailed)
422 ret = f ()
523
@@ -11,6 +29,12 @@ function retry_reclaim(f, isfailed)
1129 GC. gc (false )
1230 elseif phase == 2
1331 GC. gc (true )
32+ elseif phase == 3
33+ # After GC, finalizers may have deferred resource releases (e.g., MKL
34+ # sparse handles). Flush them now, then GC again to free the memory
35+ # those releases made available.
36+ _run_reclaim_callbacks ()
37+ GC. gc (true )
1438 else
1539 break
1640 end
Original file line number Diff line number Diff line change @@ -31,6 +31,11 @@ include("linalg.jl")
3131include (" interfaces.jl" )
3232include (" fft.jl" )
3333
34+ # Register deferred sparse handle flush as a memory reclaim callback so that OOM
35+ # recovery (retry_reclaim) and proactive GC (_maybe_gc) can free MKL internal buffers
36+ # associated with sparse matrix handles that were deferred from finalizer threads.
37+ oneL0. register_reclaim_callback! (flush_deferred_sparse_releases)
38+
3439function version ()
3540 major = Ref {Int64} ()
3641 minor = Ref {Int64} ()
Original file line number Diff line number Diff line change @@ -18,8 +18,11 @@ function _maybe_gc(dev, bytes)
1818 allocated <= 0 && return
1919 total_mem = _get_total_mem (dev)
2020 if allocated + bytes > total_mem * 0.8
21+ # Flush deferred resource releases (e.g., MKL sparse handles) from previous GC
22+ # cycles first — these are safe to release now because they were deferred earlier.
23+ # Do this BEFORE GC to avoid racing with new finalizers.
24+ oneL0. _run_reclaim_callbacks ()
2125 # Full GC to collect old-generation objects whose finalizers free GPU memory.
22- # GC.gc(false) only does minor collection which won't reclaim promoted objects.
2326 GC. gc (true )
2427 elseif allocated + bytes > total_mem * 0.4
2528 GC. gc (false )
You can’t perform that action at this time.
0 commit comments