opensearch-project
diff --git a/sandbox/libs/dataformat-native/rust/Cargo.toml b/sandbox/libs/dataformat-native/rust/Cargo.toml
Lines changed: 1 addition & 0 deletions
diff --git a/sandbox/plugins/composite-engine/src/internalClusterTest/java/org/opensearch/composite/CompositeMergeIT.java b/sandbox/plugins/composite-engine/src/internalClusterTest/java/org/opensearch/composite/CompositeMergeIT.java
Lines changed: 211 additions & 67 deletions
diff --git a/sandbox/plugins/parquet-data-format/benchmarks/src/main/java/org/opensearch/parquet/benchmark/VSRRotationBenchmark.java b/sandbox/plugins/parquet-data-format/benchmarks/src/main/java/org/opensearch/parquet/benchmark/VSRRotationBenchmark.java
Lines changed: 8 additions & 1 deletion
diff --git a/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetSettings.java b/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetSettings.java
Lines changed: 137 additions & 17 deletions
diff --git a/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/bridge/NativeParquetWriter.java b/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/bridge/NativeParquetWriter.java
Lines changed: 5 additions & 3 deletions
@@ -58,6 +58,7 @@ once_cell = "1.21.3"
 crc32fast = "1.4"
 parking_lot = "0.12.5"
 lazy_static = "1.4.0"
+rayon = "1.10"
 thiserror = "1.0"
 async-trait = "0.1"
 bytes = "1"
 
@@ -10,7 +10,10 @@
 
 import org.apache.arrow.vector.types.pojo.Field;
 import org.apache.arrow.vector.types.pojo.Schema;
+import org.opensearch.Version;
+import org.opensearch.cluster.metadata.IndexMetadata;
 import org.opensearch.common.settings.Settings;
+import org.opensearch.index.IndexSettings;
 import org.opensearch.index.mapper.KeywordFieldMapper;
 import org.opensearch.index.mapper.MappedFieldType;
 import org.opensearch.index.mapper.NumberFieldMapper;
@@ -80,6 +83,7 @@ public class VSRRotationBenchmark {
     private List<MappedFieldType> fieldTypes;
     private VSRManager vsrManager;
     private String filePath;
+    private IndexSettings indexSettings;
 
     @Setup(Level.Trial)
     public void setupTrial() {
@@ -123,7 +127,10 @@ public void setupTrial() {
     public void setup() throws IOException {
         bufferPool = new ArrowBufferPool(Settings.EMPTY);
         filePath = Path.of(System.getProperty("java.io.tmpdir"), "benchmark_vsr_" + System.nanoTime() + ".parquet").toString();
-        vsrManager = new VSRManager(filePath, schema, bufferPool, maxRowsPerVSR, threadPool, runAsync);
+        Settings idxSettings = Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT).build();
+        IndexMetadata indexMetadata = IndexMetadata.builder("benchmark-index").settings(idxSettings).build();
+        indexSettings = new IndexSettings(indexMetadata, Settings.EMPTY);
+        vsrManager = new VSRManager(filePath, indexSettings, schema, bufferPool, maxRowsPerVSR, threadPool, runAsync);
     }
 
     @Benchmark
 
@@ -9,49 +9,169 @@
 package org.opensearch.parquet;
 
 import org.opensearch.common.settings.Setting;
+import org.opensearch.common.settings.Settings;
+import org.opensearch.core.common.unit.ByteSizeUnit;
+import org.opensearch.core.common.unit.ByteSizeValue;
 
 import java.util.List;
 
 /**
- * Node-scoped settings for the Parquet data format plugin.
- *
- * <p>All settings are registered with OpenSearch via
- * {@link ParquetDataFormatPlugin#getSettings()} and can be configured in
- * {@code opensearch.yml} or via cluster settings API.
- *
- * <ul>
- *   <li>{@link #MAX_NATIVE_ALLOCATION} — Maximum native memory allocation for Arrow buffers,
- *       expressed as a percentage of available non-heap system memory (default {@code "10%"}).</li>
- *   <li>{@link #MAX_ROWS_PER_VSR} — Row count threshold that triggers VectorSchemaRoot rotation
- *       during document ingestion (default {@code 50000}).</li>
- * </ul>
+ * Index-scoped and node-scoped settings for the Parquet data format plugin.
  */
 public final class ParquetSettings {
 
     private ParquetSettings() {}
 
-    /** Default maximum native memory allocation as a percentage of available non-heap memory. */
     public static final String DEFAULT_MAX_NATIVE_ALLOCATION = "10%";
-    /** Default maximum number of rows per VectorSchemaRoot before rotation. */
     public static final int DEFAULT_MAX_ROWS_PER_VSR = 50000;
 
-    /** Maximum native memory allocation for Arrow buffers, as a percentage of non-heap memory. */
+    /** Group setting matching all index-scoped keys under the {@code index.parquet.} prefix (node-scoped {@code parquet.*} keys are separate). */
+    public static final Setting<Settings> PARQUET_SETTINGS = Setting.groupSetting("index.parquet.", Setting.Property.IndexScope);
+
+    /** Data page size limit in bytes (default 1MB). */
+    public static final Setting<ByteSizeValue> PAGE_SIZE_BYTES = Setting.byteSizeSetting(
+        "index.parquet.page_size_bytes",
+        new ByteSizeValue(1, ByteSizeUnit.MB),
+        Setting.Property.IndexScope
+    );
+
+    /** Maximum number of rows per data page (default 20000). */
+    public static final Setting<Integer> PAGE_ROW_LIMIT = Setting.intSetting(
+        "index.parquet.page_row_limit",
+        20000,
+        1,
+        Setting.Property.IndexScope
+    );
+
+    /** Dictionary page size limit in bytes (default 2MB). */
+    public static final Setting<ByteSizeValue> DICT_SIZE_BYTES = Setting.byteSizeSetting(
+        "index.parquet.dict_size_bytes",
+        new ByteSizeValue(2, ByteSizeUnit.MB),
+        Setting.Property.IndexScope
+    );
+
+    /** Compression codec for Parquet files, e.g. ZSTD, SNAPPY, LZ4_RAW (default LZ4_RAW). */
+    public static final Setting<String> COMPRESSION_TYPE = Setting.simpleString(
+        "index.parquet.compression_type",
+        "LZ4_RAW",
+        Setting.Property.IndexScope
+    );
+
+    /** Compression level for the chosen codec (default 2, range 1–9). */
+    public static final Setting<Integer> COMPRESSION_LEVEL = Setting.intSetting(
+        "index.parquet.compression_level",
+        2,
+        1,
+        9,
+        Setting.Property.IndexScope
+    );
+
+    /** Whether bloom filters are enabled for Parquet columns (default true). */
+    public static final Setting<Boolean> BLOOM_FILTER_ENABLED = Setting.boolSetting(
+        "index.parquet.bloom_filter_enabled",
+        true,
+        Setting.Property.IndexScope
+    );
+
+    /** Bloom filter false positive probability (default 0.1, range 0.0–1.0). */
+    public static final Setting<Double> BLOOM_FILTER_FPP = Setting.doubleSetting(
+        "index.parquet.bloom_filter_fpp",
+        0.1,
+        0.0,
+        1.0,
+        Setting.Property.IndexScope
+    );
+
+    /** Expected number of distinct values (NDV) hint for bloom filters (default 100000). */
+    public static final Setting<Long> BLOOM_FILTER_NDV = Setting.longSetting(
+        "index.parquet.bloom_filter_ndv",
+        100_000L,
+        1L,
+        Setting.Property.IndexScope
+    );
+
+    /** Maximum native memory allocation for Arrow buffers, as a percentage of non-heap memory (default 10%). */
     public static final Setting<String> MAX_NATIVE_ALLOCATION = Setting.simpleString(
         "parquet.max_native_allocation",
         DEFAULT_MAX_NATIVE_ALLOCATION,
         Setting.Property.NodeScope
     );
 
-    /** Maximum number of rows per VectorSchemaRoot before rotation is triggered. */
+    /** Maximum rows per VectorSchemaRoot before rotation is triggered (default 50000). */
     public static final Setting<Integer> MAX_ROWS_PER_VSR = Setting.intSetting(
         "parquet.max_rows_per_vsr",
         DEFAULT_MAX_ROWS_PER_VSR,
         1,
         Setting.Property.NodeScope
     );
 
+    /** File size threshold for in-memory sort vs streaming merge sort (default 32MB). */
+    public static final Setting<ByteSizeValue> SORT_IN_MEMORY_THRESHOLD = Setting.byteSizeSetting(
+        "index.parquet.sort_in_memory_threshold",
+        new ByteSizeValue(32, ByteSizeUnit.MB),
+        Setting.Property.IndexScope
+    );
+
+    /** Batch size for streaming merge sort (default 8192 rows). */
+    public static final Setting<Integer> SORT_BATCH_SIZE = Setting.intSetting(
+        "index.parquet.sort_batch_size",
+        8192,
+        1,
+        Setting.Property.IndexScope
+    );
+
+    /** Maximum number of rows per row group (default 1000000). */
+    public static final Setting<Integer> ROW_GROUP_MAX_ROWS = Setting.intSetting(
+        "index.parquet.row_group_max_rows",
+        1_000_000,
+        1,
+        Setting.Property.IndexScope
+    );
+
+    /** Batch size for reading records during merge (default 100000 rows). */
+    public static final Setting<Integer> MERGE_BATCH_SIZE = Setting.intSetting(
+        "index.parquet.merge_batch_size",
+        100_000,
+        1,
+        Setting.Property.IndexScope
+    );
+
+    /** Number of Rayon threads for parallel column encoding during merge (default num_cores/8, min 1). */
+    public static final Setting<Integer> MERGE_RAYON_THREADS = Setting.intSetting(
+        "parquet.merge_rayon_threads",
+        Math.max(1, Runtime.getRuntime().availableProcessors() / 8),
+        1,
+        Setting.Property.NodeScope
+    );
+
+    /** Number of Tokio IO threads for async disk writes during merge (default num_cores/8, min 1). */
+    public static final Setting<Integer> MERGE_IO_THREADS = Setting.intSetting(
+        "parquet.merge_io_threads",
+        Math.max(1, Runtime.getRuntime().availableProcessors() / 8),
+        1,
+        Setting.Property.NodeScope
+    );
+
     /** Returns all settings defined by the Parquet plugin. */
     public static List<Setting<?>> getSettings() {
-        return List.of(MAX_NATIVE_ALLOCATION, MAX_ROWS_PER_VSR);
+        return List.of(
+            PARQUET_SETTINGS,
+            PAGE_SIZE_BYTES,
+            PAGE_ROW_LIMIT,
+            DICT_SIZE_BYTES,
+            COMPRESSION_TYPE,
+            COMPRESSION_LEVEL,
+            BLOOM_FILTER_ENABLED,
+            BLOOM_FILTER_FPP,
+            BLOOM_FILTER_NDV,
+            MAX_NATIVE_ALLOCATION,
+            MAX_ROWS_PER_VSR,
+            SORT_IN_MEMORY_THRESHOLD,
+            SORT_BATCH_SIZE,
+            ROW_GROUP_MAX_ROWS,
+            MERGE_BATCH_SIZE,
+            MERGE_RAYON_THREADS,
+            MERGE_IO_THREADS
+        );
     }
 }
@@ -18,7 +18,7 @@
  *
  * <p>Wraps the stateless JNI methods in {@link RustBridge} with a file-scoped lifecycle:
  * <ol>
- *   <li>{@code new NativeParquetWriter(filePath, schemaAddress)} — creates the native writer</li>
+ *   <li>{@code new NativeParquetWriter(filePath, indexName, schemaAddress, sortConfig)} — creates the native writer</li>
  *   <li>{@link #write(long, long)} — sends one or more Arrow batches (repeatable)</li>
  *   <li>{@link #flush()} — finalizes the Parquet file and returns metadata</li>
  *   <li>{@link #sync()} — fsyncs the file to durable storage (calls flush if needed)</li>
@@ -37,12 +37,14 @@ public class NativeParquetWriter {
      * Creates a new NativeParquetWriter.
      *
      * @param filePath      the path to the Parquet file to write
+     * @param indexName     the index name for settings lookup
      * @param schemaAddress the native memory address of the Arrow schema
+     * @param sortConfig    the sort configuration for the Parquet file
      * @throws IOException if the native writer creation fails
      */
-    public NativeParquetWriter(String filePath, long schemaAddress) throws IOException {
+    public NativeParquetWriter(String filePath, String indexName, long schemaAddress, ParquetSortConfig sortConfig) throws IOException {
         this.filePath = filePath;
-        RustBridge.createWriter(filePath, schemaAddress);
+        RustBridge.createWriter(filePath, indexName, schemaAddress, sortConfig);
     }
 
     /**