88
99//! Unified jemalloc-based memory guard for pool override decisions.
1010//!
11- //! Provides a single entry point (`should_override`) that both the admission
12- //! layer (`query_budget.rs`) and the operator layer (`memory.rs`) call before
13- //! reducing partitions or triggering spill respectively.
11+ //! All RSS checks go through [`cached_resident_bytes()`] — a single source of
12+ //! truth refreshed at most once per 100ms. This avoids expensive jemalloc
13+ //! `epoch.advance()` calls on the hot path while keeping the memory picture
14+ //! consistent across all decision layers (hard guard, override, cancel, admission).
1415//!
1516//! Thresholds are configurable at runtime via `set_thresholds`.
1617
17- use std:: sync:: atomic:: { AtomicU64 , Ordering } ;
18+ use std:: sync:: atomic:: { AtomicI64 , AtomicU64 , Ordering } ;
19+ use std:: time:: Instant ;
20+
21+ // --- Cached RSS ---
22+
23+ const RESIDENT_CACHE_INTERVAL_MS : u64 = 100 ;
24+ static CACHED_RESIDENT : AtomicI64 = AtomicI64 :: new ( 0 ) ;
25+ static LAST_CHECK_MS : AtomicU64 = AtomicU64 :: new ( 0 ) ;
26+ static EPOCH_BASE : std:: sync:: OnceLock < Instant > = std:: sync:: OnceLock :: new ( ) ;
27+
28+ /// Returns jemalloc resident bytes, cached for up to 100ms.
29+ ///
30+ /// Only one thread per interval pays the ~1-5µs `epoch.advance()` cost;
31+ /// all others get the cached value in <1ns (one atomic load).
32+ /// Used by all memory decision points: hard guard, override, cancel, admission.
33+ pub fn cached_resident_bytes ( ) -> i64 {
34+ let base = EPOCH_BASE . get_or_init ( Instant :: now) ;
35+ let now_ms = base. elapsed ( ) . as_millis ( ) as u64 ;
36+ let last = LAST_CHECK_MS . load ( Ordering :: Relaxed ) ;
37+ if now_ms. saturating_sub ( last) >= RESIDENT_CACHE_INTERVAL_MS {
38+ if LAST_CHECK_MS . compare_exchange ( last, now_ms, Ordering :: Relaxed , Ordering :: Relaxed ) . is_ok ( ) {
39+ let r = native_bridge_common:: allocator:: resident_bytes ( ) ;
40+ CACHED_RESIDENT . store ( r, Ordering :: Relaxed ) ;
41+ return r;
42+ }
43+ }
44+ CACHED_RESIDENT . load ( Ordering :: Relaxed )
45+ }
46+
47+ // --- Thresholds ---
1848
1949/// Minimum pool size (bytes) for jemalloc override to activate.
2050/// Below this, the pool is assumed to be a unit test / benchmark with
2151/// artificial limits — never override.
2252const MIN_POOL_FOR_OVERRIDE : usize = 16 * 1024 * 1024 ; // 16MB
2353
2454// Configurable thresholds stored as fixed-point (×1000) in atomics.
25- // Defaults: admission=70%, operator=85%, kill =95%.
55+ // Defaults: admission=70%, operator=85%, critical =95%.
2656static ADMISSION_THRESHOLD_X1000 : AtomicU64 = AtomicU64 :: new ( 700 ) ;
2757static OPERATOR_THRESHOLD_X1000 : AtomicU64 = AtomicU64 :: new ( 850 ) ;
28- static KILL_THRESHOLD_X1000 : AtomicU64 = AtomicU64 :: new ( 950 ) ;
58+ static CRITICAL_THRESHOLD_X1000 : AtomicU64 = AtomicU64 :: new ( 950 ) ;
2959
3060/// Which layer is asking for the override check.
3161#[ derive( Debug , Clone , Copy ) ]
@@ -49,17 +79,18 @@ pub struct MemoryThresholds {
4979 pub admission : f64 ,
5080 /// Threshold for operator decisions (trigger spill). Default: 0.85
5181 pub operator : f64 ,
52- /// Threshold for query kill (cancel in-flight query). Default: 0.95
53- /// When RSS exceeds this, spill can't save the node — cancel the query.
54- pub kill : f64 ,
82+ /// Critical memory threshold. Default: 0.95
83+ /// When RSS exceeds this: the hard guard forces spill (pre-CAS path), and
84+ /// the cancel path terminates the query (post-CAS-fail path, last resort).
85+ pub critical : f64 ,
5586}
5687
5788impl Default for MemoryThresholds {
5889 fn default ( ) -> Self {
5990 Self {
6091 admission : 0.70 ,
6192 operator : 0.85 ,
62- kill : 0.95 ,
93+ critical : 0.95 ,
6394 }
6495 }
6596}
@@ -75,8 +106,8 @@ pub fn set_thresholds(thresholds: MemoryThresholds) {
75106 ( thresholds. operator * 1000.0 ) as u64 ,
76107 Ordering :: Release ,
77108 ) ;
78- KILL_THRESHOLD_X1000 . store (
79- ( thresholds. kill * 1000.0 ) as u64 ,
109+ CRITICAL_THRESHOLD_X1000 . store (
110+ ( thresholds. critical * 1000.0 ) as u64 ,
80111 Ordering :: Release ,
81112 ) ;
82113}
@@ -86,25 +117,28 @@ pub fn get_thresholds() -> MemoryThresholds {
86117 MemoryThresholds {
87118 admission : ADMISSION_THRESHOLD_X1000 . load ( Ordering :: Acquire ) as f64 / 1000.0 ,
88119 operator : OPERATOR_THRESHOLD_X1000 . load ( Ordering :: Acquire ) as f64 / 1000.0 ,
89- kill : KILL_THRESHOLD_X1000 . load ( Ordering :: Acquire ) as f64 / 1000.0 ,
120+ critical : CRITICAL_THRESHOLD_X1000 . load ( Ordering :: Acquire ) as f64 / 1000.0 ,
90121 }
91122}
92123
93- /// Returns `true` if RSS exceeds the kill threshold — the query should be
94- /// cancelled rather than spilled. Spill can't help at this pressure level;
95- /// protecting the node is more important than completing the query.
124+ /// Returns `true` if RSS exceeds the critical threshold — the query should be
125+ /// cancelled. This is the last-resort path (post-CAS-fail, post-override-denied):
126+ /// the pool rejected, jemalloc confirms pressure, and spill alone can't recover
127+ /// fast enough. Cancel the query to protect the node.
96128///
97- /// Only called from the operator spill path (after pool rejection + override blocked).
98- pub fn should_kill_query ( pool_limit_bytes : usize ) -> bool {
129+ /// The same critical threshold is used by the hard guard (pre-CAS) to force spill
130+ /// earlier — that path is recoverable. This path fires only when spill was already
131+ /// attempted or cannot help.
132+ pub fn should_cancel_query ( pool_limit_bytes : usize ) -> bool {
99133 if pool_limit_bytes < MIN_POOL_FOR_OVERRIDE {
100134 return false ;
101135 }
102- let resident = native_bridge_common :: allocator :: resident_bytes ( ) ;
136+ let resident = cached_resident_bytes ( ) ;
103137 if resident <= 0 {
104138 return false ;
105139 }
106- let kill_bytes = ( pool_limit_bytes as u64 ) . saturating_mul ( KILL_THRESHOLD_X1000 . load ( Ordering :: Acquire ) ) / 1000 ;
107- resident >= kill_bytes as i64
140+ let critical_bytes = ( pool_limit_bytes as u64 ) . saturating_mul ( CRITICAL_THRESHOLD_X1000 . load ( Ordering :: Acquire ) ) / 1000 ;
141+ resident >= critical_bytes as i64
108142}
109143
110144/// Check whether jemalloc says physical memory has headroom, meaning the
@@ -123,12 +157,11 @@ pub fn should_kill_query(pool_limit_bytes: usize) -> bool {
123157/// - `pool_limit_bytes`: the pool's configured limit
124158/// - `context`: which layer is asking (determines threshold)
125159pub fn should_override ( pool_limit_bytes : usize , context : OverrideContext ) -> bool {
126- // Skip for tiny pools (unit tests, benchmarks with artificial limits)
127160 if pool_limit_bytes < MIN_POOL_FOR_OVERRIDE {
128161 return false ;
129162 }
130163
131- let resident = native_bridge_common :: allocator :: resident_bytes ( ) ;
164+ let resident = cached_resident_bytes ( ) ;
132165 if resident <= 0 {
133166 return false ;
134167 }
@@ -156,7 +189,7 @@ pub fn is_memory_pressured(pool_limit_bytes: usize) -> bool {
156189 return false ;
157190 }
158191
159- let resident = native_bridge_common :: allocator :: resident_bytes ( ) ;
192+ let resident = cached_resident_bytes ( ) ;
160193 if resident <= 0 {
161194 return false ;
162195 }
@@ -239,20 +272,20 @@ mod tests {
239272 let t = MemoryThresholds :: default ( ) ;
240273 assert ! ( ( t. admission - 0.70 ) . abs( ) < 0.001 ) ;
241274 assert ! ( ( t. operator - 0.85 ) . abs( ) < 0.001 ) ;
242- assert ! ( ( t. kill - 0.95 ) . abs( ) < 0.001 ) ;
275+ assert ! ( ( t. critical - 0.95 ) . abs( ) < 0.001 ) ;
243276 }
244277
245278 #[ test]
246279 fn set_and_get_thresholds ( ) {
247280 set_thresholds ( MemoryThresholds {
248281 admission : 0.60 ,
249282 operator : 0.90 ,
250- kill : 0.97 ,
283+ critical : 0.97 ,
251284 } ) ;
252285 let t = get_thresholds ( ) ;
253286 assert ! ( ( t. admission - 0.60 ) . abs( ) < 0.001 ) ;
254287 assert ! ( ( t. operator - 0.90 ) . abs( ) < 0.001 ) ;
255- assert ! ( ( t. kill - 0.97 ) . abs( ) < 0.001 ) ;
288+ assert ! ( ( t. critical - 0.97 ) . abs( ) < 0.001 ) ;
256289 // Restore defaults
257290 set_thresholds ( MemoryThresholds :: default ( ) ) ;
258291 }
@@ -303,6 +336,55 @@ mod tests {
303336 assert ! ( !is_memory_pressured( 1_000_000 ) ) ; // 1MB — below MIN_POOL_FOR_OVERRIDE
304337 }
305338
339+ #[ test]
340+ fn cached_resident_bytes_returns_positive ( ) {
341+ // jemalloc is active in the test process — resident_bytes must be > 0
342+ let resident = cached_resident_bytes ( ) ;
343+ assert ! ( resident > 0 , "cached_resident_bytes() should return > 0 when jemalloc is active, got {}" , resident) ;
344+ }
345+
346+ #[ test]
347+ fn cached_resident_bytes_is_stable_within_interval ( ) {
348+ // Two calls within <100ms should return the same cached value
349+ // (only one thread per interval refreshes the cache).
350+ let first = cached_resident_bytes ( ) ;
351+ let second = cached_resident_bytes ( ) ;
352+ assert_eq ! (
353+ first, second,
354+ "Two immediate calls should return the same cached value"
355+ ) ;
356+ }
357+
358+ #[ test]
359+ fn should_cancel_query_false_for_small_pools ( ) {
360+ // Pools below MIN_POOL_FOR_OVERRIDE (16MB) always return false
361+ assert ! ( !should_cancel_query( 1_000_000 ) ) ; // 1MB
362+ assert ! ( !should_cancel_query( 8 * 1024 * 1024 ) ) ; // 8MB
363+ assert ! ( !should_cancel_query( 15 * 1024 * 1024 ) ) ; // 15MB
364+ }
365+
366+ #[ test]
367+ fn should_cancel_query_true_when_rss_exceeds_limit ( ) {
368+ // With a 20MB pool limit (above MIN_POOL_FOR_OVERRIDE), the current test
369+ // process RSS should exceed 95% of 20MB = 19MB. A Rust test process
370+ // typically uses 50-200MB RSS.
371+ let small_pool = 20 * 1024 * 1024 ; // 20MB
372+ let resident = native_bridge_common:: allocator:: resident_bytes ( ) ;
373+ if resident <= 0 {
374+ return ; // jemalloc not available in this test env
375+ }
376+ // Only assert if RSS actually exceeds the critical threshold
377+ let critical_bytes = ( small_pool as f64 * 0.95 ) as i64 ;
378+ if resident >= critical_bytes {
379+ assert ! (
380+ should_cancel_query( small_pool) ,
381+ "should_cancel_query should return true when RSS ({}) exceeds 95% of pool ({})" ,
382+ resident,
383+ small_pool
384+ ) ;
385+ }
386+ }
387+
306388 #[ test]
307389 fn override_respects_operator_vs_admission_threshold ( ) {
308390 // Operator threshold (85%) is more permissive than admission (70%).
0 commit comments