1717
1818use crate :: memory_pool:: {
1919 MemoryConsumer , MemoryLimit , MemoryPool , MemoryReclaimer , MemoryReservation ,
20- human_readable_size,
20+ human_readable_size, reclaimer_state ,
2121} ;
2222use datafusion_common:: HashMap ;
2323use datafusion_common:: { DataFusionError , Result , resources_datafusion_err} ;
@@ -29,7 +29,7 @@ use std::pin::Pin;
2929use std:: sync:: Arc ;
3030use std:: {
3131 num:: NonZeroUsize ,
32- sync:: atomic:: { AtomicBool , AtomicUsize , Ordering } ,
32+ sync:: atomic:: { AtomicU8 , AtomicUsize , Ordering } ,
3333} ;
3434use tokio:: sync:: Semaphore ;
3535
@@ -332,13 +332,55 @@ struct TrackedConsumer {
332332 /// Hook the pool walks on `try_grow_async` failure. `None` when
333333 /// the consumer registered without a reclaimer.
334334 reclaimer : Option < Arc < dyn MemoryReclaimer > > ,
335- /// Set while a reclaim walk is currently calling this consumer's
336- /// [`MemoryReclaimer::reclaim`]. Concurrent walks check-and-swap
337- /// before invoking the reclaimer so that at most one in-flight
338- /// reclaim per consumer happens at a time, even when the pool
339- /// allows multiple walks concurrently. Skipped candidates fall
340- /// through to the next victim in their list.
341- reclaim_in_flight : AtomicBool ,
335+ /// Shared tri-state flag controlling whether this reclaimer is
336+ /// pickable by the walk. Values are defined in
337+ /// [`reclaimer_state`]. The pool flips `AVAILABLE` ↔ `IN_FLIGHT`
338+ /// for cross-walk dedup; the reclaimer's owner may sticky-set
339+ /// `DISABLED` once it can no longer free memory (e.g. on entering
340+ /// the non-reclaimable merge phase). If the reclaimer didn't
341+ /// expose its own flag via [`MemoryReclaimer::reclaimer_state`],
342+ /// the pool allocates a private `AVAILABLE`-initialised flag used
343+ /// only for in-flight dedup.
344+ reclaimer_state : Arc < AtomicU8 > ,
345+ }
346+
/// RAII guard for the [`IN_FLIGHT`] slot of a [`TrackedConsumer`]'s
/// `reclaimer_state` flag.
///
/// Dropping the guard restores `AVAILABLE` only if the flag is still
/// `IN_FLIGHT`; a sticky `DISABLED` is left alone so a victim can
/// retire mid-walk without being re-armed.
///
/// [`IN_FLIGHT`]: reclaimer_state::IN_FLIGHT
#[derive(Debug)]
#[must_use = "dropping the guard immediately releases the IN_FLIGHT slot"]
struct ReclaimerStateGuard {
    /// Shared state flag that this guard releases when dropped.
    flag: Arc<AtomicU8>,
}
356+
357+ impl Drop for ReclaimerStateGuard {
358+ fn drop ( & mut self ) {
359+ let _ = self . flag . compare_exchange (
360+ reclaimer_state:: IN_FLIGHT ,
361+ reclaimer_state:: AVAILABLE ,
362+ Ordering :: AcqRel ,
363+ Ordering :: Relaxed ,
364+ ) ;
365+ }
366+ }
367+
368+ impl ReclaimerStateGuard {
369+ /// Try to transition the flag from `AVAILABLE` to `IN_FLIGHT`.
370+ /// Fails on contention (another walker won the CAS) or on a
371+ /// sticky `DISABLED`.
372+ fn try_acquire ( flag : & Arc < AtomicU8 > ) -> Option < Self > {
373+ flag. compare_exchange (
374+ reclaimer_state:: AVAILABLE ,
375+ reclaimer_state:: IN_FLIGHT ,
376+ Ordering :: AcqRel ,
377+ Ordering :: Relaxed ,
378+ )
379+ . ok ( )
380+ . map ( |_| Self {
381+ flag : Arc :: clone ( flag) ,
382+ } )
383+ }
342384}
343385
344386impl TrackedConsumer {
@@ -587,7 +629,16 @@ impl<I: MemoryPool> MemoryPool for TrackConsumersPool<I> {
587629 reserved : Default :: default ( ) ,
588630 peak : Default :: default ( ) ,
589631 reclaimer : consumer. reclaimer ( ) . cloned ( ) ,
590- reclaim_in_flight : AtomicBool :: new ( false ) ,
632+ // Source the tri-state flag from the reclaimer if it
633+ // exposes one (so the operator can sticky-set
634+ // DISABLED). Otherwise allocate a private AVAILABLE
635+ // flag used only for in-flight dedup across walks.
636+ reclaimer_state : consumer
637+ . reclaimer ( )
638+ . and_then ( |r| r. reclaimer_state ( ) )
639+ . unwrap_or_else ( || {
640+ Arc :: new ( AtomicU8 :: new ( reclaimer_state:: AVAILABLE ) )
641+ } ) ,
591642 } ,
592643 ) ;
593644
@@ -777,13 +828,22 @@ impl<I: MemoryPool> TrackConsumersPool<I> {
777828 min_reserved : usize ,
778829 ) -> usize {
779830 // Snapshot candidates while holding the read lock; release
780- // before any await. We carry the `cid` and name along so we
781- // can flip the per-victim in-flight flag without re-locking
782- // and log meaningfully.
783- let ( mut candidates, total_eligible) = {
831+ // before any await. Carry the shared `reclaimer_state` flag
832+ // along so we can do the CAS without re-locking, and skip
833+ // sticky-DISABLED reclaimers at filter time instead of
834+ // wasting a `reclaim()` call that would return `Ok(0)`.
835+ let ( mut candidates, total_eligible, skipped_disabled) = {
784836 let guard = self . tracked_consumers . read ( ) ;
785837 let mut total = 0usize ;
786- let cands: Vec < ( usize , String , usize , Arc < dyn MemoryReclaimer > , i32 ) > = guard
838+ let mut skipped_disabled = 0usize ;
839+ let cands: Vec < (
840+ usize ,
841+ String ,
842+ usize ,
843+ Arc < dyn MemoryReclaimer > ,
844+ i32 ,
845+ Arc < AtomicU8 > ,
846+ ) > = guard
787847 . iter ( )
788848 . filter_map ( |( cid, tc) | {
789849 if Some ( * cid) == exclude_consumer_id {
@@ -794,17 +854,30 @@ impl<I: MemoryPool> TrackConsumersPool<I> {
794854 if reserved <= min_reserved {
795855 return None ;
796856 }
857+ // Skip sticky-DISABLED reclaimers (operator has
858+ // retired itself). IN_FLIGHT is also non-AVAILABLE
859+ // but we still record those at the per-candidate
860+ // CAS site below for the `skipped_in_flight`
861+ // counter — they may transition back to AVAILABLE
862+ // by the time we reach them in the iteration.
863+ if tc. reclaimer_state . load ( Ordering :: Acquire )
864+ == reclaimer_state:: DISABLED
865+ {
866+ skipped_disabled += 1 ;
867+ return None ;
868+ }
797869 total = total. saturating_add ( reserved) ;
798870 Some ( (
799871 * cid,
800872 tc. name . clone ( ) ,
801873 reserved,
802874 Arc :: clone ( reclaimer) ,
803875 reclaimer. priority ( ) ,
876+ Arc :: clone ( & tc. reclaimer_state ) ,
804877 ) )
805878 } )
806879 . collect ( ) ;
807- ( cands, total)
880+ ( cands, total, skipped_disabled )
808881 } ;
809882 // Order: priority DESC, then reserved-size DESC. No cap —
810883 // walk every eligible candidate; the in-flight flag handles
@@ -813,43 +886,33 @@ impl<I: MemoryPool> TrackConsumersPool<I> {
813886
814887 debug ! (
815888 "[reclaim-walk] excl={:?} min_reserved={} target={} candidates={} \
816- eligible_bytes={} pool_reserved={}",
889+ eligible_bytes={} skipped_disabled={} pool_reserved={}",
817890 exclude_consumer_id,
818891 min_reserved,
819892 target,
820893 candidates. len( ) ,
821894 total_eligible,
895+ skipped_disabled,
822896 self . inner. reserved( ) ,
823897 ) ;
824898
825899 let mut total_freed: usize = 0 ;
826900 let mut still_needed = target;
827901 let mut skipped_in_flight = 0usize ;
828- for ( cid, name, _size, reclaimer, _prio) in candidates {
902+ for ( cid, name, _size, reclaimer, _prio, flag ) in candidates {
829903 if still_needed == 0 {
830904 break ;
831905 }
832- // Atomic check-and-set: if another walk is already
833- // reclaiming this victim, skip and try the next one. We
834- // re-fetch the entry by cid (held read lock briefly).
835- let claimed = {
836- let guard = self . tracked_consumers . read ( ) ;
837- guard
838- . get ( & cid)
839- . is_some_and ( |tc| !tc. reclaim_in_flight . swap ( true , Ordering :: AcqRel ) )
840- } ;
841- if !claimed {
906+ // CAS AVAILABLE → IN_FLIGHT. Fails if (a) another walker
907+ // got there first (in-flight) or (b) the owner sticky-set
908+ // DISABLED between snapshot and now. Either way, skip.
909+ // Guard's Drop restores AVAILABLE on scope exit (leaving
910+ // DISABLED alone).
911+ let Some ( _g) = ReclaimerStateGuard :: try_acquire ( & flag) else {
842912 skipped_in_flight += 1 ;
843913 continue ;
844- }
914+ } ;
845915 let result = reclaimer. reclaim ( still_needed) . await ;
846- // Release the in-flight flag whether reclaim succeeded or
847- // not. If the consumer was deregistered mid-walk
848- // `get(&cid)` returns None; that's fine — the flag was on
849- // a now-dropped value.
850- if let Some ( tc) = self . tracked_consumers . read ( ) . get ( & cid) {
851- tc. reclaim_in_flight . store ( false , Ordering :: Release ) ;
852- }
853916 match result {
854917 Ok ( freed) => {
855918 debug ! (
@@ -875,26 +938,33 @@ impl<I: MemoryPool> TrackConsumersPool<I> {
875938 }
876939
877940 /// Diagnostic: format ALL tracked consumers with non-zero current
878- /// reservations, grouped by whether they have a reclaimer or not.
941+ /// reservations, split by reclaimer state so dumps distinguish
942+ /// truly-reclaimable bytes from phantom (sticky-DISABLED) ones.
879943 /// Useful at OOM time to see the full pool composition beyond
880944 /// what [`Self::report_top`] shows.
881945 pub fn report_all_consumers ( & self ) -> String {
882946 let guard = self . tracked_consumers . read ( ) ;
883- let mut with_reclaimer: Vec < ( usize , String , usize , usize ) > = Vec :: new ( ) ;
947+ let mut live: Vec < ( usize , String , usize , usize ) > = Vec :: new ( ) ;
948+ let mut disabled: Vec < ( usize , String , usize , usize ) > = Vec :: new ( ) ;
884949 let mut without_reclaimer: Vec < ( usize , String , usize , usize ) > = Vec :: new ( ) ;
885950 for ( cid, tc) in guard. iter ( ) {
886951 let reserved = tc. reserved ( ) ;
887952 if reserved == 0 {
888953 continue ;
889954 }
890955 let entry = ( * cid, tc. name . clone ( ) , reserved, tc. peak ( ) ) ;
891- if tc. reclaimer . is_some ( ) {
892- with_reclaimer. push ( entry) ;
893- } else {
956+ if tc. reclaimer . is_none ( ) {
894957 without_reclaimer. push ( entry) ;
958+ } else if tc. reclaimer_state . load ( Ordering :: Acquire )
959+ == reclaimer_state:: DISABLED
960+ {
961+ disabled. push ( entry) ;
962+ } else {
963+ live. push ( entry) ;
895964 }
896965 }
897- with_reclaimer. sort_by_key ( |e| std:: cmp:: Reverse ( e. 2 ) ) ;
966+ live. sort_by_key ( |e| std:: cmp:: Reverse ( e. 2 ) ) ;
967+ disabled. sort_by_key ( |e| std:: cmp:: Reverse ( e. 2 ) ) ;
898968 without_reclaimer. sort_by_key ( |e| std:: cmp:: Reverse ( e. 2 ) ) ;
899969 let fmt = |v : & [ ( usize , String , usize , usize ) ] | -> ( String , usize ) {
900970 let sum = v. iter ( ) . map ( |e| e. 2 ) . sum :: < usize > ( ) ;
@@ -911,15 +981,22 @@ impl<I: MemoryPool> TrackConsumersPool<I> {
911981 . join ( "\n " ) ;
912982 ( body, sum)
913983 } ;
914- let ( with_body, with_sum) = fmt ( & with_reclaimer) ;
984+ let ( live_body, live_sum) = fmt ( & live) ;
985+ let ( disabled_body, disabled_sum) = fmt ( & disabled) ;
915986 let ( without_body, without_sum) = fmt ( & without_reclaimer) ;
916987 format ! (
917- "[consumer-dump] pool_reserved={} reclaimable_sum={} ({} consumers):\n {}\n \
988+ "[consumer-dump] pool_reserved={}\n \
989+ reclaimable_sum={} ({} consumers):\n {}\n \
990+ disabled_reclaimer_sum={} ({} consumers — sticky-retired, \
991+ pool can no longer free these):\n {}\n \
918992 non-reclaimable_sum={} ({} consumers):\n {}",
919993 human_readable_size( self . inner. reserved( ) ) ,
920- human_readable_size( with_sum) ,
921- with_reclaimer. len( ) ,
922- with_body,
994+ human_readable_size( live_sum) ,
995+ live. len( ) ,
996+ live_body,
997+ human_readable_size( disabled_sum) ,
998+ disabled. len( ) ,
999+ disabled_body,
9231000 human_readable_size( without_sum) ,
9241001 without_reclaimer. len( ) ,
9251002 without_body,
0 commit comments