Skip to content

Commit 55d780d

Browse files
author
Pradeep L
committed
Add node-level native memory tracker budget
Replace the datafusion/parquet-derived cap in AverageNativeMemoryUsageTracker with two explicit node-scope dynamic settings:
- node.native_memory.limit (ByteSizeValue, default 0b)
- node.native_memory.buffer_percent (int 0-100, default 0)

The tracker now divides observed native-memory use (max(0, RssAnon - HeapCommitted)) by the effective budget, limit - (limit * bufferPercent / 100). When the limit is unset or the buffer fully consumes it, getUsage() returns 0 without dividing.

Also: allow read access to /proc/self/status in the Java security policy and systemd unit so OsProbe.getProcessRssAnon() works in hardened installs (these entries replace the previous /proc/meminfo ones).

Signed-off-by: Pradeep L <spradeel@amazon.com>
1 parent c2e2b55 commit 55d780d

6 files changed

Lines changed: 88 additions & 99 deletions

File tree

distribution/packages/src/common/systemd/opensearch.service

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,8 @@ ReadOnlyPaths=-/etc/os-release -/usr/lib/os-release -/etc/system-release
145145
## Allow read access to Linux IO stats
146146
ReadOnlyPaths=/proc/self/mountinfo /proc/diskstats
147147

148-
## Allow read access to Linux memory stats
149-
ReadOnlyPaths=/proc/meminfo
148+
## Allow read access to native memory stats (RssAnon via /proc/self/status)
149+
ReadOnlyPaths=/proc/self/status
150150

151151
## Allow read access to control group stats
152152
ReadOnlyPaths=/proc/self/cgroup /sys/fs/cgroup/cpu /sys/fs/cgroup/cpu/-

server/src/main/java/org/opensearch/common/settings/ClusterSettings.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,8 @@ public void apply(Settings value, Settings current, Settings previous) {
758758
ResourceTrackerSettings.GLOBAL_JVM_USAGE_AC_WINDOW_DURATION_SETTING,
759759
ResourceTrackerSettings.GLOBAL_IO_USAGE_AC_WINDOW_DURATION_SETTING,
760760
ResourceTrackerSettings.GLOBAL_NATIVE_MEMORY_USAGE_AC_WINDOW_DURATION_SETTING,
761+
ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING,
762+
ResourceTrackerSettings.NODE_NATIVE_MEMORY_BUFFER_PERCENT_SETTING,
761763

762764
// Settings related to Searchable Snapshots
763765
Node.NODE_SEARCH_CACHE_SIZE_SETTING,

server/src/main/java/org/opensearch/monitor/os/OsProbe.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ public long getProcessRssAnon() {
253253
try {
254254
return readRssAnonFromProcSelfStatus();
255255
} catch (IOException e) {
256-
logger.debug("failed to read /proc/self/status", e);
256+
logger.warn("failed to read /proc/self/status", e);
257257
return -1L;
258258
}
259259
}
@@ -282,15 +282,15 @@ long readRssAnonFromProcSelfStatus() throws IOException {
282282
}
283283
return kb * 1024L;
284284
} catch (NumberFormatException nfe) {
285-
logger.debug("malformed RssAnon value in /proc/self/status", nfe);
285+
logger.warn("malformed RssAnon value in /proc/self/status", nfe);
286286
return -1L;
287287
}
288288
}
289-
logger.debug("RssAnon line has unexpected shape: [{}]", line);
289+
logger.warn("RssAnon line has unexpected shape: [{}]", line);
290290
return -1L;
291291
}
292292
}
293-
logger.debug("RssAnon line not found in /proc/self/status");
293+
logger.warn("RssAnon line not found in /proc/self/status");
294294
return -1L;
295295
}
296296
}

server/src/main/java/org/opensearch/node/resource/tracker/AverageNativeMemoryUsageTracker.java

Lines changed: 52 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -12,25 +12,26 @@
1212
import org.apache.logging.log4j.Logger;
1313
import org.opensearch.common.settings.Settings;
1414
import org.opensearch.common.unit.TimeValue;
15+
import org.opensearch.core.common.unit.ByteSizeValue;
1516
import org.opensearch.monitor.os.OsProbe;
1617
import org.opensearch.threadpool.ThreadPool;
1718

1819
import java.lang.management.ManagementFactory;
19-
import java.util.concurrent.atomic.AtomicBoolean;
2020
import java.util.function.LongSupplier;
2121

2222
/**
2323
* AverageNativeMemoryUsageTracker reports this OpenSearch process's off-heap native memory
24-
* utilization as a percentage of the budget configured for the native analytics engines,
25-
* averaged over a rolling window.
24+
* utilization as a percentage of a configured node-level budget, averaged over a rolling window.
2625
*
2726
* <p>On Linux, each polling cycle computes
2827
* {@code usage = max(0, RssAnon - HeapCommitted)} where {@code RssAnon} comes from
2928
* {@link OsProbe#getProcessRssAnon()} and {@code HeapCommitted} comes from the JVM memory MX
30-
* bean. The denominator is a 20% padded sum of the two plugin budgets:
31-
* {@code cap = 1.2 * (datafusion.memory_pool_limit_bytes + resolved(parquet.max_native_allocation))}.
32-
* When either plugin setting is absent, zero, negative, or unparseable, its contribution is 0;
33-
* when both contribute 0 the cap is 0 and {@code getUsage()} returns 0 without dividing.
29+
* bean. The denominator is the effective native memory budget:
30+
* {@code effective = limit - (limit * bufferPercent / 100)}, resolved per poll from
31+
* {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING} and
32+
* {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_BUFFER_PERCENT_SETTING}. When the limit is
33+
* absent or non-positive, or the buffer fully consumes the limit, {@code getUsage()} returns
34+
* {@code 0} without dividing.
3435
*
3536
* <p>Activation is already gated to Linux in {@link NodeResourceUsageTracker}; on non-Linux
3637
* platforms the polling loop is not started and {@link OsProbe#getProcessRssAnon()} returns
@@ -40,144 +41,104 @@ public class AverageNativeMemoryUsageTracker extends AbstractAverageUsageTracker
4041

4142
private static final Logger LOGGER = LogManager.getLogger(AverageNativeMemoryUsageTracker.class);
4243

43-
static final String DATAFUSION_MEMORY_POOL_LIMIT_KEY = "datafusion.memory_pool_limit_bytes";
44-
static final String PARQUET_MAX_NATIVE_ALLOCATION_KEY = "parquet.max_native_allocation";
45-
static final double HEADROOM_FACTOR = 1.2;
46-
4744
private final LongSupplier rssAnonSupplier;
4845
private final LongSupplier heapCommittedSupplier;
49-
private final LongSupplier nativeMemoryCapSupplier;
46+
private final LongSupplier effectiveNativeMemorySupplier;
5047

5148
/**
5249
* Production constructor. Wires the RSS reader to {@link OsProbe#getProcessRssAnon()}, the
53-
* heap-committed supplier to the JVM memory MX bean, and the native-memory cap supplier to
54-
* {@link #computeNativeMemoryCap(Settings)} so dynamic updates to
55-
* {@code datafusion.memory_pool_limit_bytes} are observed on the next polling cycle.
50+
* heap-committed supplier to the JVM memory MX bean, and the effective-native-memory supplier
51+
* to {@link #computeEffectiveNativeMemory(Settings)} so dynamic updates to the two node
52+
* settings are observed on the next polling cycle.
5653
*/
5754
public AverageNativeMemoryUsageTracker(ThreadPool threadPool, TimeValue pollingInterval, TimeValue windowDuration, Settings settings) {
5855
super(threadPool, pollingInterval, windowDuration);
5956
this.rssAnonSupplier = () -> OsProbe.getInstance().getProcessRssAnon();
6057
this.heapCommittedSupplier = () -> ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getCommitted();
61-
this.nativeMemoryCapSupplier = () -> computeNativeMemoryCap(settings);
58+
this.effectiveNativeMemorySupplier = () -> computeEffectiveNativeMemory(settings);
6259
}
6360

6461
/**
6562
* Package-private test constructor. Accepts three {@link LongSupplier}s in place of the
6663
* production defaults so tests can drive {@link #getUsage()} deterministically without
67-
* reading {@code /proc/self/status}, the JVM memory MX bean, or plugin settings.
64+
* reading {@code /proc/self/status}, the JVM memory MX bean, or node settings. The third
65+
* supplier provides the effective native memory (limit minus buffer) directly.
6866
*/
6967
AverageNativeMemoryUsageTracker(
7068
ThreadPool threadPool,
7169
TimeValue pollingInterval,
7270
TimeValue windowDuration,
7371
LongSupplier rssAnonSupplier,
7472
LongSupplier heapCommittedSupplier,
75-
LongSupplier nativeMemoryCapSupplier
73+
LongSupplier effectiveNativeMemorySupplier
7674
) {
7775
super(threadPool, pollingInterval, windowDuration);
7876
this.rssAnonSupplier = rssAnonSupplier;
7977
this.heapCommittedSupplier = heapCommittedSupplier;
80-
this.nativeMemoryCapSupplier = nativeMemoryCapSupplier;
78+
this.effectiveNativeMemorySupplier = effectiveNativeMemorySupplier;
8179
}
8280

8381
@Override
8482
public long getUsage() {
8583

8684
long rssAnon = rssAnonSupplier.getAsLong();
8785
if (rssAnon < 0L) {
88-
LOGGER.debug("Native memory poll skipped: RssAnon unavailable from /proc/self/status");
86+
LOGGER.warn("Native memory poll skipped: RssAnon unavailable from /proc/self/status");
8987
return 0L;
9088
}
9189

9290
long heapCommitted = heapCommittedSupplier.getAsLong();
93-
long usage = Math.max(0L, rssAnon - heapCommitted);
94-
95-
long cap = nativeMemoryCapSupplier.getAsLong();
96-
if (cap <= 0L) {
97-
LOGGER.debug("Native memory poll: rssAnon={} heapCommitted={} usage={} cap=0 -> 0%", rssAnon, heapCommitted, usage);
91+
long nativeUsed = Math.max(0L, rssAnon - heapCommitted);
92+
93+
long effectiveNativeMemory = effectiveNativeMemorySupplier.getAsLong();
94+
if (effectiveNativeMemory <= 0L) {
95+
LOGGER.warn(
96+
"Native memory poll: rssAnon={} heapCommitted={} nativeUsed={} effectiveNativeMemory=0 -> 0%",
97+
rssAnon,
98+
heapCommitted,
99+
nativeUsed
100+
);
98101
return 0L;
99102
}
100103

101-
long percent = (usage * 100L) / cap;
104+
double utilization = (double) nativeUsed / effectiveNativeMemory * 100.0;
105+
long percent = (long) utilization;
102106
if (percent > 100L) {
103107
percent = 100L;
104108
}
105109
if (percent < 0L) {
106110
percent = 0L;
107111
}
108112

109-
LOGGER.debug("Native memory poll: rssAnon={} heapCommitted={} usage={} cap={} pct={}", rssAnon, heapCommitted, usage, cap, percent);
110-
return percent;
111-
}
112-
113-
114-
/**
115-
* Computes {@code Non_Heap_Base = max(0, totalPhysicalMemory - Runtime.maxMemory())}, the
116-
* reference value against which {@code parquet.max_native_allocation} percentages resolve.
117-
*/
118-
long computeNonHeapBase() {
119-
return Math.max(0L, OsProbe.getInstance().getTotalPhysicalMemorySize() - Runtime.getRuntime().maxMemory());
120-
}
121-
122-
/**
123-
* Resolves the DataFusion native-pool contribution. Absent, zero, negative, or unparseable
124-
* values all collapse to {@code 0L}.
125-
*/
126-
long resolveDataFusionContribution(Settings settings) {
127-
try {
128-
long value = settings.getAsLong(DATAFUSION_MEMORY_POOL_LIMIT_KEY, 0L);
129-
return value > 0L ? value : 0L;
130-
} catch (IllegalArgumentException unparseable) {
131-
return 0L;
113+
if (LOGGER.isDebugEnabled()) {
114+
LOGGER.debug(
115+
"Native memory poll: rssAnon={} heapCommitted={} nativeUsed={} effectiveNativeMemory={} pct={}",
116+
rssAnon,
117+
heapCommitted,
118+
nativeUsed,
119+
effectiveNativeMemory,
120+
percent
121+
);
132122
}
133-
}
134123

135-
/**
136-
* Resolves the Parquet native-allocation contribution from the percentage-string setting.
137-
* Absent, empty, missing trailing {@code %}, unparseable, NaN, or non-positive values all
138-
* collapse to {@code 0L}. Percentages above 100 are defensively clamped to 100.
139-
*/
140-
long resolveParquetContribution(Settings settings, long nonHeapBase) {
141-
if (nonHeapBase <= 0L) {
142-
return 0L;
143-
}
144-
String raw = settings.get(PARQUET_MAX_NATIVE_ALLOCATION_KEY);
145-
if (raw == null) {
146-
return 0L;
147-
}
148-
String trimmed = raw.trim();
149-
if (trimmed.isEmpty() || trimmed.endsWith("%") == false) {
150-
return 0L;
151-
}
152-
String pctStr = trimmed.substring(0, trimmed.length() - 1).trim();
153-
double pct;
154-
try {
155-
pct = Double.parseDouble(pctStr);
156-
} catch (NumberFormatException nfe) {
157-
return 0L;
158-
}
159-
if (Double.isNaN(pct) || pct <= 0.0) {
160-
return 0L;
161-
}
162-
if (pct > 100.0) {
163-
pct = 100.0;
164-
}
165-
return (long) Math.floor(nonHeapBase * pct / 100.0);
124+
return percent;
166125
}
167126

168127
/**
169-
* Resolves both plugin settings to byte values and returns the 20% headroom-padded sum.
170-
* Returns {@code 0L} when neither plugin setting contributes a positive value, so
171-
* {@link #getUsage()} can short-circuit without dividing.
128+
* Resolves the configured native-memory limit and buffer percentage from the node settings and
129+
* returns the effective native-memory budget in bytes: {@code limit - (limit * buffer / 100)}.
130+
* Returns {@code 0L} when the limit is absent or non-positive, or when the buffer fully
131+
* consumes the limit, so {@link #getUsage()} can short-circuit without dividing.
172132
*/
173-
long computeNativeMemoryCap(Settings settings) {
174-
long df = resolveDataFusionContribution(settings);
175-
long base = computeNonHeapBase();
176-
long pq = resolveParquetContribution(settings, base);
177-
long sum = df + pq;
178-
if (sum <= 0L) {
133+
long computeEffectiveNativeMemory(Settings settings) {
134+
ByteSizeValue limitValue = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings);
135+
long limit = limitValue.getBytes();
136+
if (limit <= 0L) {
179137
return 0L;
180138
}
181-
return (long) Math.floor(HEADROOM_FACTOR * (double) sum);
139+
int bufferPercent = ResourceTrackerSettings.NODE_NATIVE_MEMORY_BUFFER_PERCENT_SETTING.get(settings);
140+
long buffer = limit * bufferPercent / 100L;
141+
long effective = limit - buffer;
142+
return Math.max(0L, effective);
182143
}
183144
}

server/src/main/java/org/opensearch/node/resource/tracker/ResourceTrackerSettings.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import org.opensearch.common.settings.Setting;
1212
import org.opensearch.common.settings.Settings;
1313
import org.opensearch.common.unit.TimeValue;
14+
import org.opensearch.core.common.unit.ByteSizeValue;
1415

1516
/**
1617
* Settings related to resource usage trackers such as polling interval, window duration etc
@@ -86,6 +87,30 @@ private static class Defaults {
8687
Setting.Property.NodeScope
8788
);
8889

90+
/**
91+
* Absolute native-memory budget for this node, in bytes. When the value is {@link ByteSizeValue#ZERO}
92+
* (default) the tracker treats the budget as unconfigured and reports {@code 0%}.
93+
*/
94+
public static final Setting<ByteSizeValue> NODE_NATIVE_MEMORY_LIMIT_SETTING = Setting.byteSizeSetting(
95+
"node.native_memory.limit",
96+
ByteSizeValue.ZERO,
97+
Setting.Property.Dynamic,
98+
Setting.Property.NodeScope
99+
);
100+
101+
/**
102+
* Percentage of the native-memory limit that is reserved as buffer (not usable). The effective
103+
* native memory the tracker divides against is {@code limit - (limit * bufferPercent / 100)}.
104+
*/
105+
public static final Setting<Integer> NODE_NATIVE_MEMORY_BUFFER_PERCENT_SETTING = Setting.intSetting(
106+
"node.native_memory.buffer_percent",
107+
0,
108+
0,
109+
100,
110+
Setting.Property.Dynamic,
111+
Setting.Property.NodeScope
112+
);
113+
89114
private volatile TimeValue cpuWindowDuration;
90115
private volatile TimeValue cpuPollingInterval;
91116
private volatile TimeValue memoryWindowDuration;

server/src/main/resources/org/opensearch/bootstrap/security.policy

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -235,8 +235,9 @@ grant {
235235
permission java.io.FilePermission "/proc/self/mountinfo", "read";
236236
permission java.io.FilePermission "/proc/diskstats", "read";
237237

238-
// memory stats on Linux
239-
permission java.io.FilePermission "/proc/meminfo", "read";
238+
// native memory stats on Linux (RssAnon from /proc/self/status)
239+
permission java.io.FilePermission "/proc/self/status", "read";
240+
240241

241242
// control group stats on Linux
242243
// TODO: update later when wildcard is supported in policy

0 commit comments

Comments
 (0)