feat: add ParquetIndexingConfig with sort_fields and window_duration_secs

g-talbot · claude · g-talbot · commit 1fbece545664 · 2026-04-30T09:40:13.000-04:00
Adds `parquet_indexing` section to `IndexingSettings` for per-index
Parquet pipeline configuration:

- `sort_fields`: sort schema override (Husky-style pipe-delimited
  syntax with /V2 suffix). Controls row ordering, query pruning,
  compression locality, and compaction scope. When omitted, uses
  the product-type default.
- `window_duration_secs`: time window for split partitioning
  (default 900s / 15 min). Must divide 3600.

Updates docs/configuration/index-config.md with:
- "Parquet indexing settings" section explaining both parameters
- Full sort schema syntax reference (column types, direction
  overrides, `&` LSM cutoff marker)
- Examples showing minimal, custom, and advanced configurations

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/docs/configuration/index-config.md b/docs/configuration/index-config.md
@@ -596,6 +596,7 @@ This section describes indexing settings for a given index.
 | `split_num_docs_target` | Target number of docs per split.   | `10000000` |
 | `merge_policy` | Describes the strategy used to trigger split merge operations for logs/traces (see [Merge policies](#merge-policies) section below). |
 | `parquet_merge_policy` | Describes the merge policy for Parquet (metrics/sketches) splits (see [Parquet merge policy](#parquet-merge-policy) section below). |
+| `parquet_indexing` | Parquet-specific indexing settings: sort schema, window duration (see [Parquet indexing settings](#parquet-indexing-settings) section below). |
 | `resources.heap_size`      | Indexer heap size per source per index.   | `2000000000` |
 | `docstore_compression_level` | Level of compression used by zstd for the docstore. Lower values may increase ingest speed, at the cost of index size | `8` |
 | `docstore_blocksize` | Size of blocks in the docstore, in bytes. Lower values may improve doc retrieval speed, at the cost of index size | `1000000` |
@@ -688,6 +689,57 @@ indexing_settings:
         type: "no_merge"
 ```
 
+### Parquet indexing settings
+
+*For indexes using the Parquet indexing pipeline (metrics, sketches).*
+
+These settings control how the Parquet pipeline sorts, windows, and writes incoming data. They affect both ingest-time performance and downstream query/compaction efficiency.
+
+```yaml
+version: 0.7
+index_id: "my-metrics-index"
+# ...
+indexing_settings:
+  parquet_indexing:
+    sort_fields: "metric_name|service|env|host|timeseries_id|timestamp_secs/V2"
+    window_duration_secs: 900
+```
+
+| Variable      | Description   | Default value |
+| ------------- | ------------- | ------------- |
+| `sort_fields` | Sort schema for row ordering in Parquet files (see syntax below). When omitted, the product-type default is used. | `metric_name\|service\|env\|datacenter\|region\|host\|timeseries_id\|timestamp_secs/V2` |
+| `window_duration_secs` | Time window duration in seconds for split partitioning. Must evenly divide 3600. Larger values = fewer splits but coarser time pruning. | `900` (15 minutes) |
+
+#### Sort schema syntax
+
+The sort schema uses pipe-delimited column names with a `/V2` version suffix:
+
+```text
+column1|column2|...|timestamp_secs/V2
+```
+
+**Column types** are inferred from name suffixes:
+- `__s` → string (e.g., `custom_tag__s`)
+- `__i` → int64 (e.g., `priority__i`)
+- Well-known names like `metric_name`, `service`, `env`, `host`, `timestamp_secs`, and `timeseries_id` have built-in type mappings and don't need suffixes.
+
+**Sort direction** defaults to ascending for most columns and descending for timestamp columns. Override with `+` (ascending) or `-` (descending) as a prefix or suffix on the column name:
+
+```text
+# Explicit descending timestamp
+metric_name|host|-timestamp_secs/V2
+
+# Ascending host (default), descending timestamp (default)
+metric_name|host|timestamp_secs/V2
+```
+
+**How the sort schema affects behavior:**
+- **Query pruning**: queries filtering on leading columns (e.g., `metric_name`) can skip entire splits whose row key ranges don't match.
+- **Compression**: grouping similar values together (e.g., all rows for the same metric name) improves columnar compression ratios.
+- **Compaction scope**: splits with different sort schemas are never merged together. Changing the sort schema on an existing index creates a new compaction scope — old splits are not re-sorted.
+
+**The `&` marker** (advanced) sets the LSM comparison cutoff: columns after `&` are used for sort order but not for compaction locality decisions. For example, `metric_name|&host|timestamp_secs/V2` sorts by `metric_name`, then `host`, then `timestamp_secs`, but only `metric_name` determines which splits can be merged.
+
 #### Parquet merge policy
 
 *For indexes using the Parquet indexing pipeline (metrics, sketches).*
diff --git a/quickwit/quickwit-config/src/index_config/mod.rs b/quickwit/quickwit-config/src/index_config/mod.rs
@@ -123,10 +123,86 @@ pub struct IndexingSettings {
     /// indexes that use the Parquet indexing pipeline.
     #[serde(default)]
     pub parquet_merge_policy: ParquetMergePolicyConfig,
+    /// Parquet-specific indexing settings (sort schema, window duration).
+    /// Only used by indexes that use the Parquet pipeline.
+    #[serde(default)]
+    pub parquet_indexing: ParquetIndexingConfig,
     #[serde(default)]
     pub resources: IndexingResources,
 }
 
+/// Configuration for the Parquet indexing pipeline (metrics, sketches).
+///
+/// Controls how incoming data is sorted and partitioned into time windows
+/// before it is written to Parquet split files. These settings affect both
+/// ingest-time performance and downstream query/compaction efficiency.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Hash, utoipa::ToSchema)]
+#[serde(deny_unknown_fields)]
+pub struct ParquetIndexingConfig {
+    /// Sort schema defining the physical sort order of rows in Parquet files.
+    ///
+    /// Uses Husky-style pipe-delimited syntax with a `/V2` version suffix.
+    /// Columns sort ascending by default, except timestamp columns (descending);
+    /// use a `+`/`-` prefix or suffix to override. Types are inferred from suffixes
+    /// (`__s` = string, `__i` = int64, `_secs` = uint64 timestamp).
+    ///
+    /// The sort order determines:
+    /// - **Query pruning**: queries that filter on leading sort columns can
+    ///   skip entire splits whose row key ranges don't match.
+    /// - **Compression**: columns with good locality (e.g., metric_name first)
+    ///   compress better in Parquet's columnar format.
+    /// - **Compaction scope**: splits with different sort schemas are never
+    ///   merged together.
+    ///
+    /// When `None`, the product-type default is used (see below).
+    ///
+    /// # Default (metrics/sketches)
+    /// ```text
+    /// metric_name|service|env|datacenter|region|host|timeseries_id|timestamp_secs/V2
+    /// ```
+    ///
+    /// # Examples
+    /// ```text
+    /// # Minimal: just metric name and timestamp
+    /// metric_name|timestamp_secs/V2
+    ///
+    /// # Custom tags in sort order
+    /// metric_name|service|cluster|host|timestamp_secs/V2
+    ///
+    /// # Explicit descending timestamp
+    /// metric_name|host|-timestamp_secs/V2
+    /// ```
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub sort_fields: Option<String>,
+
+    /// Time window duration in seconds for split partitioning (default: 900).
+    ///
+    /// Incoming data is partitioned into time windows of this duration.
+    /// Splits within the same window may be compacted together; splits in
+    /// different windows are never merged. Must evenly divide 3600 (one hour).
+    ///
+    /// Larger values produce fewer, larger splits (better for bulk queries) but
+    /// coarser time-based pruning; smaller values give finer pruning but more splits.
+    // NOTE(review): divisibility by 3600 is not enforced here; confirm it is validated.
+    #[serde(default = "ParquetIndexingConfig::default_window_duration_secs")]
+    pub window_duration_secs: u32,
+}
+
+impl ParquetIndexingConfig {
+    fn default_window_duration_secs() -> u32 {
+        900 // seconds, i.e. 15 minutes
+    }
+}
+
+impl Default for ParquetIndexingConfig {
+    fn default() -> Self {
+        Self {
+            sort_fields: None, // no sort schema override
+            window_duration_secs: Self::default_window_duration_secs(), // same fn serde uses
+        }
+    }
+}
+
 impl IndexingSettings {
     pub fn commit_timeout(&self) -> Duration {
         Duration::from_secs(self.commit_timeout_secs as u64)
@@ -166,6 +242,7 @@ impl Default for IndexingSettings {
             split_num_docs_target: Self::default_split_num_docs_target(),
             merge_policy: MergePolicyConfig::default(),
             parquet_merge_policy: ParquetMergePolicyConfig::default(),
+            parquet_indexing: ParquetIndexingConfig::default(),
             resources: IndexingResources::default(),
         }
     }
diff --git a/quickwit/quickwit-config/src/lib.rs b/quickwit/quickwit-config/src/lib.rs
@@ -45,9 +45,9 @@ pub use cluster_config::ClusterConfig;
 // See #2048
 use index_config::serialize::{IndexConfigV0_8, VersionedIndexConfig};
 pub use index_config::{
-    IndexConfig, IndexingResources, IndexingSettings, IngestSettings, RetentionPolicy,
-    SearchSettings, build_doc_mapper, load_index_config_from_user_config, load_index_config_update,
-    prepare_doc_mapping_update,
+    IndexConfig, IndexingResources, IndexingSettings, IngestSettings, ParquetIndexingConfig,
+    RetentionPolicy, SearchSettings, build_doc_mapper, load_index_config_from_user_config,
+    load_index_config_update, prepare_doc_mapping_update,
 };
 pub use quickwit_doc_mapper::DocMapping;
 use serde::Serialize;
@@ -114,6 +114,7 @@ pub fn disable_ingest_v1() -> bool {
     KafkaSourceParams,
     KinesisSourceParams,
     MergePolicyConfig,
+    ParquetIndexingConfig,
     ParquetMergePolicyConfig,
     PubSubSourceParams,
     PulsarSourceAuth,