opensearch-project
diff --git a/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocator.java b/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocator.java
Lines changed: 3 additions & 19 deletions
diff --git a/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocatorListener.java b/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocatorListener.java
Lines changed: 0 additions & 39 deletions
diff --git a/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBasePlugin.java b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBasePlugin.java
Lines changed: 101 additions & 38 deletions
diff --git a/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowNativeAllocator.java b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowNativeAllocator.java
Lines changed: 4 additions & 44 deletions
@@ -36,7 +36,9 @@ public interface NativeAllocator extends Closeable {
     PoolHandle getOrCreatePool(String poolName, long limit);
 
     /**
-     * Updates the limit of an existing pool.
+     * Updates the limit of an existing pool. Children of the pool allocator
+     * inherit the change automatically via Arrow's parent-cap check at
+     * allocation time — no notification SPI is needed.
      *
      * @param poolName logical pool name
      * @param newLimit new maximum bytes for the pool
@@ -55,24 +57,6 @@ public interface NativeAllocator extends Closeable {
      */
     NativeAllocatorPoolStats stats();
 
-    /**
-     * Registers a listener that is invoked after a pool's limit changes.
-     *
-     * <p>Listener invocation is synchronous on the caller thread. See
-     * {@link NativeAllocatorListener} for threading constraints.
-     *
-     * @param listener the listener to register
-     */
-    void addListener(NativeAllocatorListener listener);
-
-    /**
-     * Unregisters a previously registered listener. No-op if the listener
-     * was not registered.
-     *
-     * @param listener the listener to remove
-     */
-    void removeListener(NativeAllocatorListener listener);
-
     /**
      * Opaque handle to a memory pool. Plugins downcast to the concrete type
      * (e.g., Arrow's {@code BufferAllocator}) in the implementation layer.
 
@@ -11,8 +11,6 @@
 import org.opensearch.arrow.spi.NativeAllocatorPoolConfig;
 import org.opensearch.cluster.metadata.IndexNameExpressionResolver;
 import org.opensearch.cluster.service.ClusterService;
-import org.opensearch.common.inject.AbstractModule;
-import org.opensearch.common.inject.Module;
 import org.opensearch.common.settings.ClusterSettings;
 import org.opensearch.common.settings.Setting;
 import org.opensearch.common.settings.Settings;
@@ -55,11 +53,13 @@ public ArrowBasePlugin() {}
     /**
      * Maximum bytes for the root Arrow allocator.
      *
-     * <p>When unset, the default is derived from the admission-control budget
-     * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING} reduced by
-     * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_BUFFER_PERCENT_SETTING} —
-     * the same budget AC throttles on. If AC is unconfigured (limit = 0), the
-     * default is {@link Long#MAX_VALUE}, preserving pre-AC behaviour.
+     * <p>When unset, the default is 20% of
+     * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}; see
+     * {@link #deriveRootLimitDefault}. The Arrow framework gets a small fraction of the
+     * native budget because the dominant consumer of native memory in analytics workloads
+     * is the DataFusion Rust runtime (~75% of {@code node.native_memory.limit}), not Arrow.
+     * If AC is unconfigured (limit = 0), the default is {@link Long#MAX_VALUE}, preserving
+     * pre-AC behaviour.
      */
     public static final Setting<Long> ROOT_LIMIT_SETTING = new Setting<>(
         NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT,
@@ -78,17 +78,22 @@ public ArrowBasePlugin() {}
     );
 
     /**
-     * Computes the default for {@link #ROOT_LIMIT_SETTING} from the AC native-memory budget.
-     * Returns the bytes-as-string representation expected by the Setting parser.
+     * Computes the default for {@link #ROOT_LIMIT_SETTING} as 20% of
+     * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}. The Arrow framework's
+     * hard cap covers only Arrow allocations — DataFusion's Rust runtime is a sibling of
+     * Arrow root and gets the larger share of the native budget (see
+     * {@code DataFusionPlugin#deriveMemoryPoolLimitDefault}).
+     *
+     * <p>Returns the bytes-as-string representation expected by the {@link Setting} parser.
+     * If the AC limit is unset or non-positive ({@code <= 0}), the default is
+     * {@link Long#MAX_VALUE} — unbounded — preserving pre-AC behaviour.
      */
     static String deriveRootLimitDefault(Settings settings) {
-        ByteSizeValue acLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings);
-        if (acLimit.getBytes() <= 0) {
+        ByteSizeValue nativeLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings);
+        if (nativeLimit.getBytes() <= 0) {
             return Long.toString(Long.MAX_VALUE);
         }
-        int bufferPercent = ResourceTrackerSettings.NODE_NATIVE_MEMORY_BUFFER_PERCENT_SETTING.get(settings);
-        long usable = acLimit.getBytes() - (acLimit.getBytes() * bufferPercent / 100L);
-        return Long.toString(Math.max(0L, usable));
+        return Long.toString(nativeLimit.getBytes() / 5); // 20% == 1/5; avoids the long overflow of getBytes() * 20
     }
 
     /** Minimum guaranteed bytes for the Flight pool. */
@@ -100,11 +105,24 @@ static String deriveRootLimitDefault(Settings settings) {
         Setting.Property.Dynamic
     );
 
-    /** Maximum bytes the Flight pool can burst to. */
-    public static final Setting<Long> FLIGHT_MAX_SETTING = Setting.longSetting(
+    /**
+     * Maximum bytes the Flight pool can burst to. Default is 5% of
+     * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}; see
+     * {@link #derivePoolMaxDefault}. Falls back to {@link Long#MAX_VALUE} when AC is
+     * unconfigured. Matches the partitioning model documented in PR #21732.
+     */
+    public static final Setting<Long> FLIGHT_MAX_SETTING = new Setting<>(
         NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX,
-        Long.MAX_VALUE,
-        0L,
+        s -> derivePoolMaxDefault(s, 5),
+        s -> {
+            long v = Long.parseLong(s);
+            if (v < 0) {
+                throw new IllegalArgumentException(
+                    "Setting [" + NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX + "] must be >= 0, got " + v
+                );
+            }
+            return v;
+        },
         Setting.Property.NodeScope,
         Setting.Property.Dynamic
     );
@@ -118,11 +136,25 @@ static String deriveRootLimitDefault(Settings settings) {
         Setting.Property.Dynamic
     );
 
-    /** Maximum bytes the ingest pool can burst to. */
-    public static final Setting<Long> INGEST_MAX_SETTING = Setting.longSetting(
+    /**
+     * Maximum bytes the ingest pool can burst to. Default is 8% of
+     * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}; see
+     * {@link #derivePoolMaxDefault}. Falls back to {@link Long#MAX_VALUE} when AC is
+     * unconfigured. Ingest gets a larger fraction than Flight/Query because parquet VSR
+     * allocators dominate write-path memory usage — see partitioning model in PR #21732.
+     */
+    public static final Setting<Long> INGEST_MAX_SETTING = new Setting<>(
         NativeAllocatorPoolConfig.SETTING_INGEST_MAX,
-        Long.MAX_VALUE,
-        0L,
+        s -> derivePoolMaxDefault(s, 8),
+        s -> {
+            long v = Long.parseLong(s);
+            if (v < 0) {
+                throw new IllegalArgumentException(
+                    "Setting [" + NativeAllocatorPoolConfig.SETTING_INGEST_MAX + "] must be >= 0, got " + v
+                );
+            }
+            return v;
+        },
         Setting.Property.NodeScope,
         Setting.Property.Dynamic
     );
@@ -141,24 +173,65 @@ static String deriveRootLimitDefault(Settings settings) {
     );
 
     /**
-     * Maximum bytes the query pool can allocate. Enforced by Arrow's child-allocator
-     * limit — analytics-engine's per-query allocators are children of this pool, so the
-     * sum of in-flight per-query allocations is capped here.
+     * Maximum bytes the query pool can allocate. Default is 5% of
+     * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}; see
+     * {@link #derivePoolMaxDefault}. Falls back to {@link Long#MAX_VALUE} when AC is
+     * unconfigured. Enforced by Arrow's child-allocator limit — analytics-engine's
+     * per-query allocators are children of this pool, so the sum of in-flight per-query
+     * allocations is capped here.
      *
      * <p>Note: each individual analytics query is also bounded by
      * {@code analytics.exec.QueryContext} per-query limit (currently the constant
      * {@code DEFAULT_PER_QUERY_MEMORY_LIMIT = 256 MB}). Lowering {@code QUERY_MAX}
      * below {@code 256 MB × concurrent-queries} can starve queries even when each
      * individual query is within its per-query limit.
      */
-    public static final Setting<Long> QUERY_MAX_SETTING = Setting.longSetting(
+    public static final Setting<Long> QUERY_MAX_SETTING = new Setting<>(
         NativeAllocatorPoolConfig.SETTING_QUERY_MAX,
-        Long.MAX_VALUE,
-        0L,
+        s -> derivePoolMaxDefault(s, 5),
+        s -> {
+            long v = Long.parseLong(s);
+            if (v < 0) {
+                throw new IllegalArgumentException(
+                    "Setting [" + NativeAllocatorPoolConfig.SETTING_QUERY_MAX + "] must be >= 0, got " + v
+                );
+            }
+            return v;
+        },
         Setting.Property.NodeScope,
         Setting.Property.Dynamic
     );
 
+    /**
+     * Computes the default for a pool max as a percentage of
+     * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING} (the operator's
+     * declared off-heap budget), falling back to {@link Long#MAX_VALUE} when that
+     * limit is unconfigured ({@code <= 0}).
+     *
+     * <p>Pools are anchored to {@code node.native_memory.limit} rather than to
+     * {@link #ROOT_LIMIT_SETTING} so the diagrammed partitioning (PR #21732) holds:
+     * sum of pool maxes (5+8+5 = 18% of native_memory.limit) fits within the framework
+     * root cap (20% of native_memory.limit) by default. Operator overrides of
+     * {@code root.limit} that drop it below {@code sum(pool.max)} are caught by the
+     * grouped validator.
+     *
+     * <p>The percentage is taken straight from {@code node.native_memory.limit}, not from
+     * {@code limit - buffer_percent}. {@code buffer_percent} is an admission-control
+     * throttle margin, not a framework budget reduction.
+     *
+     * @param settings node settings
+     * @param percent  whole-number percentage (e.g. {@code 5} for 5%) of {@code node.native_memory.limit}
+     * @return the default in bytes, as the decimal string expected by the {@link Setting} parser
+     */
+    static String derivePoolMaxDefault(Settings settings, int percent) {
+        long limitBytes = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings).getBytes();
+        if (limitBytes <= 0) {
+            return Long.toString(Long.MAX_VALUE);
+        }
+        // Scale the whole hundreds and the remainder separately so limitBytes * percent cannot overflow long.
+        return Long.toString(Math.max(0L, limitBytes / 100 * percent + limitBytes % 100 * percent / 100));
+    }
+
     /** Interval in seconds between pool rebalance cycles. 0 disables rebalancing. */
     public static final Setting<Long> REBALANCE_INTERVAL_SETTING = Setting.longSetting(
         "native.allocator.rebalance.interval_seconds",
@@ -206,16 +279,6 @@ public Collection<Object> createComponents(
         return components;
     }
 
-    @Override
-    public Collection<Module> createGuiceModules() {
-        return List.of(new AbstractModule() {
-            @Override
-            protected void configure() {
-                bind(ArrowNativeAllocator.class).toInstance(allocator);
-            }
-        });
-    }
-
     /**
      * Registers cluster-settings update consumers that propagate dynamic setting changes
      * into the live {@link ArrowNativeAllocator}. Package-private so unit tests can exercise
 
@@ -10,10 +10,7 @@
 
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.memory.RootAllocator;
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
 import org.opensearch.arrow.spi.NativeAllocator;
-import org.opensearch.arrow.spi.NativeAllocatorListener;
 import org.opensearch.arrow.spi.NativeAllocatorPoolConfig;
 import org.opensearch.arrow.spi.NativeAllocatorPoolStats;
 
@@ -24,7 +21,6 @@
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.concurrent.ScheduledExecutorService;
 import java.util.concurrent.ScheduledFuture;
 import java.util.concurrent.TimeUnit;
@@ -49,13 +45,10 @@
  */
 public class ArrowNativeAllocator implements NativeAllocator {
 
-    private static final Logger logger = LogManager.getLogger(ArrowNativeAllocator.class);
-
     private final RootAllocator root;
     private final ConcurrentMap<String, ArrowPoolHandle> pools = new ConcurrentHashMap<>();
     private final ConcurrentMap<String, Long> poolMins = new ConcurrentHashMap<>();
     private final ConcurrentMap<String, Long> poolMaxes = new ConcurrentHashMap<>();
-    private final CopyOnWriteArrayList<NativeAllocatorListener> listeners = new CopyOnWriteArrayList<>();
     private final ScheduledExecutorService rebalancer;
     private volatile ScheduledFuture<?> rebalanceTask;
     /**
@@ -140,37 +133,7 @@ public void setPoolLimit(String poolName, long newLimit) {
             throw new IllegalStateException("Pool '" + poolName + "' does not exist");
         }
         poolMaxes.put(poolName, newLimit);
-        long previous = handle.allocator.getLimit();
         handle.allocator.setLimit(newLimit);
-        if (newLimit != previous) {
-            fireListeners(poolName, newLimit);
-        }
-    }
-
-    @Override
-    public void addListener(NativeAllocatorListener listener) {
-        listeners.addIfAbsent(listener);
-    }
-
-    @Override
-    public void removeListener(NativeAllocatorListener listener) {
-        listeners.remove(listener);
-    }
-
-    /**
-     * Notifies all registered listeners of a pool limit change. Each listener
-     * is invoked synchronously on the caller thread; exceptions thrown by one
-     * listener are logged and isolated so they do not block the others or the
-     * caller.
-     */
-    private void fireListeners(String poolName, long newLimit) {
-        for (NativeAllocatorListener listener : listeners) {
-            try {
-                listener.onPoolLimitChanged(poolName, newLimit);
-            } catch (Exception e) {
-                logger.warn("NativeAllocatorListener threw on pool [{}] limit update", poolName, e);
-            }
-        }
     }
 
     /**
@@ -183,8 +146,10 @@ private void fireListeners(String poolName, long newLimit) {
      * <p>Live propagation rules:
      * <ul>
      *   <li>If {@code newMin} exceeds the pool's current limit, the limit is raised to
-     *       {@code newMin} (capped at the configured pool max), and listeners fire so
-     *       downstream consumers (e.g. the DataFusion Rust runtime) see the new ceiling.
+     *       {@code newMin} (capped at the configured pool max). Children of the pool
+     *       allocator inherit the change automatically via Arrow's parent-cap check at
+     *       allocation time, so dynamic resizes reach in-flight workloads without an
+     *       explicit notification SPI.
      *   <li>If {@code newMin} is below the current limit, the limit is left alone —
      *       the rebalancer is the only path that shrinks live limits, so a min change
      *       on its own never reduces capacity in flight.
@@ -204,7 +169,6 @@ public void setPoolMin(String poolName, long newMin) {
         long target = Math.min(newMin, max);
         if (target > current) {
             handle.allocator.setLimit(target);
-            fireListeners(poolName, target);
         }
     }
 
@@ -292,11 +256,7 @@ void rebalance() {
             // Never exceed root
             effectiveLimit = Math.min(effectiveLimit, rootLimit);
 
-            long previous = alloc.getLimit();
             alloc.setLimit(effectiveLimit);
-            if (effectiveLimit != previous) {
-                fireListeners(name, effectiveLimit);
-            }
         }
     }