opensearch-project
diff --git a/‎libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocator.java‎
Lines changed: 3 additions & 19 deletions b/‎libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocator.java‎
Lines changed: 3 additions & 19 deletions
diff --git a/‎libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocatorListener.java‎
Lines changed: 0 additions & 39 deletions b/‎libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocatorListener.java‎
Lines changed: 0 additions & 39 deletions
diff --git a/‎plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBasePlugin.java‎
Lines changed: 106 additions & 43 deletions b/‎plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBasePlugin.java‎
Lines changed: 106 additions & 43 deletions
@@ -36,7 +36,9 @@ public interface NativeAllocator extends Closeable {
     PoolHandle getOrCreatePool(String poolName, long limit);
 
     /**
-     * Updates the limit of an existing pool.
+     * Updates the limit of an existing pool. Children of the pool allocator
+     * inherit the change automatically via Arrow's parent-cap check at
+     * allocation time — no notification SPI is needed.
      *
      * @param poolName logical pool name
      * @param newLimit new maximum bytes for the pool
@@ -55,24 +57,6 @@ public interface NativeAllocator extends Closeable {
      */
     NativeAllocatorPoolStats stats();
 
-    /**
-     * Registers a listener that is invoked after a pool's limit changes.
-     *
-     * <p>Listener invocation is synchronous on the caller thread. See
-     * {@link NativeAllocatorListener} for threading constraints.
-     *
-     * @param listener the listener to register
-     */
-    void addListener(NativeAllocatorListener listener);
-
-    /**
-     * Unregisters a previously registered listener. No-op if the listener
-     * was not registered.
-     *
-     * @param listener the listener to remove
-     */
-    void removeListener(NativeAllocatorListener listener);
-
     /**
      * Opaque handle to a memory pool. Plugins downcast to the concrete type
      * (e.g., Arrow's {@code BufferAllocator}) in the implementation layer.
 
@@ -11,8 +11,6 @@
 import org.opensearch.arrow.spi.NativeAllocatorPoolConfig;
 import org.opensearch.cluster.metadata.IndexNameExpressionResolver;
 import org.opensearch.cluster.service.ClusterService;
-import org.opensearch.common.inject.AbstractModule;
-import org.opensearch.common.inject.Module;
 import org.opensearch.common.settings.ClusterSettings;
 import org.opensearch.common.settings.Setting;
 import org.opensearch.common.settings.Settings;
@@ -55,21 +53,21 @@ public ArrowBasePlugin() {}
     /**
      * Maximum bytes for the root Arrow allocator.
      *
-     * <p>When unset, the default is derived from the admission-control budget
-     * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING} reduced by
-     * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_BUFFER_PERCENT_SETTING} —
-     * the same budget AC throttles on. If AC is unconfigured (limit = 0), the
-     * default is {@link Long#MAX_VALUE}, preserving pre-AC behaviour.
+     * <p>When unset, the default is 20% of
+     * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}; see
+     * {@link #deriveRootLimitDefault}. The Arrow framework gets a small fraction of the
+     * native budget because the dominant consumer of native memory in analytics workloads
+     * is the DataFusion Rust runtime (~75% of {@code node.native_memory.limit}), not Arrow.
+     * If AC is unconfigured (limit = 0), the default is {@link Long#MAX_VALUE}, preserving
+     * pre-AC behaviour.
      */
     public static final Setting<Long> ROOT_LIMIT_SETTING = new Setting<>(
         NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT,
         ArrowBasePlugin::deriveRootLimitDefault,
         s -> {
             long v = Long.parseLong(s);
             if (v < 0) {
-                throw new IllegalArgumentException(
-                    "Setting [" + NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT + "] must be >= 0, got " + v
-                );
+                throw new IllegalArgumentException("Setting [" + NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT + "] must be >= 0, got " + v);
             }
             return v;
         },
@@ -78,17 +76,22 @@ public ArrowBasePlugin() {}
     );
 
     /**
-     * Computes the default for {@link #ROOT_LIMIT_SETTING} from the AC native-memory budget.
-     * Returns the bytes-as-string representation expected by the Setting parser.
+     * Computes the default for {@link #ROOT_LIMIT_SETTING} as 20% of
+     * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}. The Arrow framework's hard cap
+     * covers only Arrow allocations — DataFusion's Rust runtime is a sibling of Arrow root and gets the
+     * larger share of the native budget (see {@code DataFusionPlugin#deriveMemoryPoolLimitDefault}).
+     *
+     * <p>Returns the bytes-as-string representation expected by the {@link Setting} parser.
+     * If the AC limit is unset (zero or negative), the default is {@link Long#MAX_VALUE} —
+     * unbounded — preserving pre-AC behaviour.
      */
     static String deriveRootLimitDefault(Settings settings) {
-        ByteSizeValue acLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings);
-        if (acLimit.getBytes() <= 0) {
+        ByteSizeValue nativeLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings);
+        if (nativeLimit.getBytes() <= 0) {
             return Long.toString(Long.MAX_VALUE);
         }
-        int bufferPercent = ResourceTrackerSettings.NODE_NATIVE_MEMORY_BUFFER_PERCENT_SETTING.get(settings);
-        long usable = acLimit.getBytes() - (acLimit.getBytes() * bufferPercent / 100L);
-        return Long.toString(Math.max(0L, usable));
+        // 20% == 1/5; dividing gives floor(bytes * 20 / 100) exactly without risking long overflow in the product.
+        return Long.toString(nativeLimit.getBytes() / 5);
     }
 
     /** Minimum guaranteed bytes for the Flight pool. */
@@ -100,11 +103,22 @@ static String deriveRootLimitDefault(Settings settings) {
         Setting.Property.Dynamic
     );
 
-    /** Maximum bytes the Flight pool can burst to. */
-    public static final Setting<Long> FLIGHT_MAX_SETTING = Setting.longSetting(
+    /**
+     * Upper bound, in bytes, on what the Flight pool may burst to. When not set explicitly it
+     * defaults to 5% of {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}
+     * (computed by {@link #derivePoolMaxDefault}), or to {@link Long#MAX_VALUE} when AC is
+     * unconfigured. Matches the partitioning model documented in PR #21732.
+     */
+    public static final Setting<Long> FLIGHT_MAX_SETTING = new Setting<>(
         NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX,
-        Long.MAX_VALUE,
-        0L,
+        nodeSettings -> derivePoolMaxDefault(nodeSettings, 5),
+        raw -> {
+            final long val = Long.parseLong(raw);
+            if (val >= 0) {
+                return val;
+            }
+            throw new IllegalArgumentException("Setting [" + NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX + "] must be >= 0, got " + val);
+        },
         Setting.Property.NodeScope,
         Setting.Property.Dynamic
     );
@@ -118,11 +132,23 @@ static String deriveRootLimitDefault(Settings settings) {
         Setting.Property.Dynamic
     );
 
-    /** Maximum bytes the ingest pool can burst to. */
-    public static final Setting<Long> INGEST_MAX_SETTING = Setting.longSetting(
+    /**
+     * Upper bound, in bytes, on what the ingest pool may burst to. When not set explicitly it
+     * defaults to 8% of {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}
+     * (computed by {@link #derivePoolMaxDefault}), or to {@link Long#MAX_VALUE} when AC is
+     * unconfigured. Ingest gets a larger fraction than Flight/Query because parquet VSR
+     * allocators dominate write-path memory usage — see partitioning model in PR #21732.
+     */
+    public static final Setting<Long> INGEST_MAX_SETTING = new Setting<>(
         NativeAllocatorPoolConfig.SETTING_INGEST_MAX,
-        Long.MAX_VALUE,
-        0L,
+        nodeSettings -> derivePoolMaxDefault(nodeSettings, 8),
+        raw -> {
+            final long val = Long.parseLong(raw);
+            if (val >= 0) {
+                return val;
+            }
+            throw new IllegalArgumentException("Setting [" + NativeAllocatorPoolConfig.SETTING_INGEST_MAX + "] must be >= 0, got " + val);
+        },
         Setting.Property.NodeScope,
         Setting.Property.Dynamic
     );
@@ -141,24 +167,63 @@ static String deriveRootLimitDefault(Settings settings) {
     );
 
     /**
-     * Maximum bytes the query pool can allocate. Enforced by Arrow's child-allocator
-     * limit — analytics-engine's per-query allocators are children of this pool, so the
-     * sum of in-flight per-query allocations is capped here.
+     * Ceiling, in bytes, on the query pool's allocations. When not set explicitly it defaults
+     * to 5% of {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING} (computed by
+     * {@link #derivePoolMaxDefault}), or to {@link Long#MAX_VALUE} when AC is unconfigured.
+     * Enforced by Arrow's child-allocator limit — analytics-engine's per-query allocators
+     * are children of this pool, so the sum of in-flight per-query allocations is capped
+     * here.
      *
      * <p>Note: each individual analytics query is also bounded by
      * {@code analytics.exec.QueryContext} per-query limit (currently the constant
      * {@code DEFAULT_PER_QUERY_MEMORY_LIMIT = 256 MB}). Lowering {@code QUERY_MAX}
      * below {@code 256 MB × concurrent-queries} can starve queries even when each
      * individual query is within its per-query limit.
      */
-    public static final Setting<Long> QUERY_MAX_SETTING = Setting.longSetting(
+    public static final Setting<Long> QUERY_MAX_SETTING = new Setting<>(
         NativeAllocatorPoolConfig.SETTING_QUERY_MAX,
-        Long.MAX_VALUE,
-        0L,
+        nodeSettings -> derivePoolMaxDefault(nodeSettings, 5),
+        raw -> {
+            final long val = Long.parseLong(raw);
+            if (val >= 0) {
+                return val;
+            }
+            throw new IllegalArgumentException("Setting [" + NativeAllocatorPoolConfig.SETTING_QUERY_MAX + "] must be >= 0, got " + val);
+        },
         Setting.Property.NodeScope,
         Setting.Property.Dynamic
     );
 
+    /**
+     * Computes the default for a pool max as a percentage of
+     * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING} (the operator's
+     * declared off-heap budget), falling back to {@link Long#MAX_VALUE} when AC is
+     * unconfigured. Returns the bytes-as-string representation expected by the
+     * {@link Setting} parser.
+     *
+     * <p>Pools are anchored to {@code node.native_memory.limit} rather than to
+     * {@link #ROOT_LIMIT_SETTING} so the diagrammed partitioning (PR #21732) holds:
+     * sum of pool maxes (5+8+5 = 18% of native_memory.limit) fits within the framework
+     * root cap (20% of native_memory.limit) by default. Operator overrides of
+     * {@code root.limit} that drop it below {@code sum(pool.max)} are caught by the
+     * grouped validator.
+     *
+     * <p>The fraction is taken straight from {@code node.native_memory.limit}, not from {@code limit - buffer_percent}:
+     * {@code buffer_percent} is an admission-control throttle margin, not a framework budget reduction.
+     *
+     * @param settings node settings
+     * @param percent  percentage (expected 0-100) of {@code node.native_memory.limit} the pool max defaults to
+     * @return the default pool max in bytes as a decimal string, never negative
+     */
+    static String derivePoolMaxDefault(Settings settings, int percent) {
+        long limitBytes = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings).getBytes();
+        if (limitBytes <= 0) {
+            return Long.toString(Long.MAX_VALUE);
+        }
+        // Divide before multiplying: limitBytes * percent can wrap for exabyte-scale limits; this form is the exact floor.
+        return Long.toString(Math.max(0L, limitBytes / 100 * percent + limitBytes % 100 * percent / 100));
+    }
+
     /** Interval in seconds between pool rebalance cycles. 0 disables rebalancing. */
     public static final Setting<Long> REBALANCE_INTERVAL_SETTING = Setting.longSetting(
         "native.allocator.rebalance.interval_seconds",
@@ -194,8 +259,16 @@ public Collection<Object> createComponents(
         // dynamic updates via the grouped consumer below.
         validateUpdate(settings);
 
-        allocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_FLIGHT, FLIGHT_MIN_SETTING.get(settings), FLIGHT_MAX_SETTING.get(settings));
-        allocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, INGEST_MIN_SETTING.get(settings), INGEST_MAX_SETTING.get(settings));
+        allocator.getOrCreatePool(
+            NativeAllocatorPoolConfig.POOL_FLIGHT,
+            FLIGHT_MIN_SETTING.get(settings),
+            FLIGHT_MAX_SETTING.get(settings)
+        );
+        allocator.getOrCreatePool(
+            NativeAllocatorPoolConfig.POOL_INGEST,
+            INGEST_MIN_SETTING.get(settings),
+            INGEST_MAX_SETTING.get(settings)
+        );
         allocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_QUERY, QUERY_MIN_SETTING.get(settings), QUERY_MAX_SETTING.get(settings));
 
         ClusterSettings cs = clusterService.getClusterSettings();
@@ -206,16 +279,6 @@ public Collection<Object> createComponents(
         return components;
     }
 
-    @Override
-    public Collection<Module> createGuiceModules() {
-        return List.of(new AbstractModule() {
-            @Override
-            protected void configure() {
-                bind(ArrowNativeAllocator.class).toInstance(allocator);
-            }
-        });
-    }
-
     /**
      * Registers cluster-settings update consumers that propagate dynamic setting changes
      * into the live {@link ArrowNativeAllocator}. Package-private so unit tests can exercise