|
10 | 10 |
|
11 | 11 | import org.apache.logging.log4j.LogManager; |
12 | 12 | import org.apache.logging.log4j.Logger; |
| 13 | +import org.opensearch.common.settings.Settings; |
13 | 14 | import org.opensearch.common.unit.TimeValue; |
14 | 15 | import org.opensearch.monitor.os.OsProbe; |
15 | 16 | import org.opensearch.threadpool.ThreadPool; |
16 | 17 |
|
| 18 | +import java.lang.management.ManagementFactory; |
| 19 | +import java.util.concurrent.atomic.AtomicBoolean; |
| 20 | +import java.util.function.LongSupplier; |
| 21 | + |
17 | 22 | /** |
18 | | - * AverageNativeMemoryUsageTracker tracks the average native (physical) memory usage on the node |
19 | | - * by polling the OS-level memory stats every (pollingInterval) and keeping track of the rolling |
20 | | - * average over a defined time window (windowDuration). |
| 23 | + * AverageNativeMemoryUsageTracker reports this OpenSearch process's off-heap native memory |
| 24 | + * utilization as a percentage of the budget configured for the native analytics engines, |
| 25 | + * averaged over a rolling window. |
| 26 | + * |
| 27 | + * <p>On Linux, each polling cycle computes |
| 28 | + * {@code usage = max(0, RssAnon - HeapCommitted)} where {@code RssAnon} comes from |
| 29 | + * {@link OsProbe#getProcessRssAnon()} and {@code HeapCommitted} comes from the JVM memory MX |
| 30 | + * bean. The denominator is a 20% padded sum of the two plugin budgets: |
| 31 | + * {@code cap = 1.2 * (datafusion.memory_pool_limit_bytes + resolved(parquet.max_native_allocation))}. |
| 32 | + * When either plugin setting is absent, zero, negative, or unparseable, its contribution is 0; |
| 33 | + * when both contribute 0 the cap is 0 and {@code getUsage()} returns 0 without dividing. |
21 | 34 | * |
22 | | - * On Linux, it uses available memory from /proc/meminfo (MemAvailable) which accounts for |
23 | | - * reclaimable page cache and slab memory, giving a more accurate picture of actual memory |
24 | | - * pressure. On other platforms, it falls back to free physical memory. |
| 35 | + * <p>Activation is already gated to Linux in {@link NodeResourceUsageTracker}; on non-Linux |
| 36 | + * platforms the polling loop is not started and {@link OsProbe#getProcessRssAnon()} returns |
| 37 | + * {@code -1} without touching the filesystem. |
25 | 38 | */ |
26 | 39 | public class AverageNativeMemoryUsageTracker extends AbstractAverageUsageTracker { |
27 | 40 |
|
28 | 41 | private static final Logger LOGGER = LogManager.getLogger(AverageNativeMemoryUsageTracker.class); |
29 | 42 |
|
30 | | - public AverageNativeMemoryUsageTracker(ThreadPool threadPool, TimeValue pollingInterval, TimeValue windowDuration) { |
| 43 | + static final String DATAFUSION_MEMORY_POOL_LIMIT_KEY = "datafusion.memory_pool_limit_bytes"; |
| 44 | + static final String PARQUET_MAX_NATIVE_ALLOCATION_KEY = "parquet.max_native_allocation"; |
| 45 | + static final double HEADROOM_FACTOR = 1.2; |
| 46 | + |
| 47 | + private final LongSupplier rssAnonSupplier; |
| 48 | + private final LongSupplier heapCommittedSupplier; |
| 49 | + private final LongSupplier nativeMemoryCapSupplier; |
| 50 | + |
| 51 | + /** |
| 52 | + * Production constructor. Wires the RSS reader to {@link OsProbe#getProcessRssAnon()}, the |
| 53 | + * heap-committed supplier to the JVM memory MX bean, and the native-memory cap supplier to |
| 54 | + * {@link #computeNativeMemoryCap(Settings)} so dynamic updates to |
| 55 | + * {@code datafusion.memory_pool_limit_bytes} are observed on the next polling cycle. |
| 56 | + */ |
| 57 | + public AverageNativeMemoryUsageTracker(ThreadPool threadPool, TimeValue pollingInterval, TimeValue windowDuration, Settings settings) { |
31 | 58 | super(threadPool, pollingInterval, windowDuration); |
| 59 | + this.rssAnonSupplier = () -> OsProbe.getInstance().getProcessRssAnon(); |
| 60 | + this.heapCommittedSupplier = () -> ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getCommitted(); |
| 61 | + this.nativeMemoryCapSupplier = () -> computeNativeMemoryCap(settings); |
32 | 62 | } |
33 | 63 |
|
34 | 64 | /** |
35 | | - * Get current native memory usage percentage using OS-level physical memory stats. |
36 | | - * Prefers available memory (MemAvailable) over free memory (MemFree) as it provides |
37 | | - * a more accurate measure of actual memory pressure on the node. |
| 65 | + * Package-private test constructor. Accepts three {@link LongSupplier}s in place of the |
| 66 | + * production defaults so tests can drive {@link #getUsage()} deterministically without |
| 67 | + * reading {@code /proc/self/status}, the JVM memory MX bean, or plugin settings. |
38 | 68 | */ |
| 69 | + AverageNativeMemoryUsageTracker( |
| 70 | + ThreadPool threadPool, |
| 71 | + TimeValue pollingInterval, |
| 72 | + TimeValue windowDuration, |
| 73 | + LongSupplier rssAnonSupplier, |
| 74 | + LongSupplier heapCommittedSupplier, |
| 75 | + LongSupplier nativeMemoryCapSupplier |
| 76 | + ) { |
| 77 | + super(threadPool, pollingInterval, windowDuration); |
| 78 | + this.rssAnonSupplier = rssAnonSupplier; |
| 79 | + this.heapCommittedSupplier = heapCommittedSupplier; |
| 80 | + this.nativeMemoryCapSupplier = nativeMemoryCapSupplier; |
| 81 | + } |
| 82 | + |
39 | 83 | @Override |
40 | 84 | public long getUsage() { |
41 | | - OsProbe osProbe = OsProbe.getInstance(); |
42 | | - long totalMemory = osProbe.getTotalPhysicalMemorySize(); |
43 | | - if (totalMemory <= 0) { |
44 | | - LOGGER.warn("Unable to retrieve total physical memory size"); |
45 | | - return 0; |
46 | | - } |
47 | | - long availableMemory = osProbe.getAvailableMemorySize(); |
48 | | - long unusedMemory; |
49 | | - if (availableMemory >= 0) { |
50 | | - // Use available memory (includes reclaimable cache) for a more accurate picture |
51 | | - unusedMemory = availableMemory; |
52 | | - } else { |
53 | | - LOGGER.warn("unable to retrieve available memory"); |
54 | | - return 0; |
55 | | - } |
56 | | - long usedMemory = totalMemory - unusedMemory; |
57 | | - long usage = usedMemory * 100 / totalMemory; |
58 | | - LOGGER.debug("Recording native memory usage: {}%", usage); |
59 | | - return usage; |
| 85 | + |
| 86 | + long rssAnon = rssAnonSupplier.getAsLong(); |
| 87 | + if (rssAnon < 0L) { |
| 88 | + LOGGER.debug("Native memory poll skipped: RssAnon unavailable from /proc/self/status"); |
| 89 | + return 0L; |
| 90 | + } |
| 91 | + |
| 92 | + long heapCommitted = heapCommittedSupplier.getAsLong(); |
| 93 | + long usage = Math.max(0L, rssAnon - heapCommitted); |
| 94 | + |
| 95 | + long cap = nativeMemoryCapSupplier.getAsLong(); |
| 96 | + if (cap <= 0L) { |
| 97 | + LOGGER.debug("Native memory poll: rssAnon={} heapCommitted={} usage={} cap=0 -> 0%", rssAnon, heapCommitted, usage); |
| 98 | + return 0L; |
| 99 | + } |
| 100 | + |
| 101 | + long percent = (usage * 100L) / cap; |
| 102 | + if (percent > 100L) { |
| 103 | + percent = 100L; |
| 104 | + } |
| 105 | + if (percent < 0L) { |
| 106 | + percent = 0L; |
| 107 | + } |
| 108 | + |
| 109 | + LOGGER.debug("Native memory poll: rssAnon={} heapCommitted={} usage={} cap={} pct={}", rssAnon, heapCommitted, usage, cap, percent); |
| 110 | + return percent; |
| 111 | + } |
| 112 | + |
| 113 | + |
| 114 | + /** |
| 115 | + * Computes {@code Non_Heap_Base = max(0, totalPhysicalMemory - Runtime.maxMemory())}, the |
| 116 | + * reference value against which {@code parquet.max_native_allocation} percentages resolve. |
| 117 | + */ |
| 118 | + long computeNonHeapBase() { |
| 119 | + return Math.max(0L, OsProbe.getInstance().getTotalPhysicalMemorySize() - Runtime.getRuntime().maxMemory()); |
| 120 | + } |
| 121 | + |
| 122 | + /** |
| 123 | + * Resolves the DataFusion native-pool contribution. Absent, zero, negative, or unparseable |
| 124 | + * values all collapse to {@code 0L}. |
| 125 | + */ |
| 126 | + long resolveDataFusionContribution(Settings settings) { |
| 127 | + try { |
| 128 | + long value = settings.getAsLong(DATAFUSION_MEMORY_POOL_LIMIT_KEY, 0L); |
| 129 | + return value > 0L ? value : 0L; |
| 130 | + } catch (IllegalArgumentException unparseable) { |
| 131 | + return 0L; |
| 132 | + } |
| 133 | + } |
| 134 | + |
| 135 | + /** |
| 136 | + * Resolves the Parquet native-allocation contribution from the percentage-string setting. |
| 137 | + * Absent, empty, missing trailing {@code %}, unparseable, NaN, or non-positive values all |
| 138 | + * collapse to {@code 0L}. Percentages above 100 are defensively clamped to 100. |
| 139 | + */ |
| 140 | + long resolveParquetContribution(Settings settings, long nonHeapBase) { |
| 141 | + if (nonHeapBase <= 0L) { |
| 142 | + return 0L; |
| 143 | + } |
| 144 | + String raw = settings.get(PARQUET_MAX_NATIVE_ALLOCATION_KEY); |
| 145 | + if (raw == null) { |
| 146 | + return 0L; |
| 147 | + } |
| 148 | + String trimmed = raw.trim(); |
| 149 | + if (trimmed.isEmpty() || trimmed.endsWith("%") == false) { |
| 150 | + return 0L; |
| 151 | + } |
| 152 | + String pctStr = trimmed.substring(0, trimmed.length() - 1).trim(); |
| 153 | + double pct; |
| 154 | + try { |
| 155 | + pct = Double.parseDouble(pctStr); |
| 156 | + } catch (NumberFormatException nfe) { |
| 157 | + return 0L; |
| 158 | + } |
| 159 | + if (Double.isNaN(pct) || pct <= 0.0) { |
| 160 | + return 0L; |
| 161 | + } |
| 162 | + if (pct > 100.0) { |
| 163 | + pct = 100.0; |
| 164 | + } |
| 165 | + return (long) Math.floor(nonHeapBase * pct / 100.0); |
| 166 | + } |
| 167 | + |
| 168 | + /** |
| 169 | + * Resolves both plugin settings to byte values and returns the 20% headroom-padded sum. |
| 170 | + * Returns {@code 0L} when neither plugin setting contributes a positive value, so |
| 171 | + * {@link #getUsage()} can short-circuit without dividing. |
| 172 | + */ |
| 173 | + long computeNativeMemoryCap(Settings settings) { |
| 174 | + long df = resolveDataFusionContribution(settings); |
| 175 | + long base = computeNonHeapBase(); |
| 176 | + long pq = resolveParquetContribution(settings, base); |
| 177 | + long sum = df + pq; |
| 178 | + if (sum <= 0L) { |
| 179 | + return 0L; |
| 180 | + } |
| 181 | + return (long) Math.floor(HEADROOM_FACTOR * (double) sum); |
60 | 182 | } |
61 | 183 | } |
0 commit comments