Skip to content

Commit ea9adea

Browse files
g-talbot and claude committed
feat: add configurable ParquetMergePolicyConfig to index settings
Adds a `parquet_merge_policy` section to `IndexingSettings`, making the Parquet merge policy configurable per-index via YAML.

Parameters:
- merge_factor (default 10): min splits to trigger a merge
- max_merge_factor (default 12): max splits per merge
- max_merge_ops (default 4): bounds write amplification
- target_split_size_bytes (default 256 MiB): target output size
- maturation_period (default 48h): split maturity timeout
- max_finalize_merge_operations (default 3): cold-window shutdown limit

Mirrors the existing merge_policy config pattern for logs/traces. Updates index-config.md documentation with the new section.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4d6fd21 commit ea9adea

4 files changed

Lines changed: 105 additions & 3 deletions

File tree

docs/configuration/index-config.md

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,8 @@ This section describes indexing settings for a given index.
594594
| ------------- | ------------- | ------------- |
595595
| `commit_timeout_secs` | Maximum number of seconds before committing a split since its creation. | `60` |
596596
| `split_num_docs_target` | Target number of docs per split. | `10000000` |
597-
| `merge_policy` | Describes the strategy used to trigger split merge operations (see [Merge policies](#merge-policies) section below). |
597+
| `merge_policy` | Describes the strategy used to trigger split merge operations for logs/traces (see [Merge policies](#merge-policies) section below). |
598+
| `parquet_merge_policy` | Describes the merge policy for Parquet (metrics/sketches) splits (see [Parquet merge policy](#parquet-merge-policy) section below). |
598599
| `resources.heap_size` | Indexer heap size per source per index. | `2000000000` |
599600
| `docstore_compression_level` | Level of compression used by zstd for the docstore. Lower values may increase ingest speed, at the cost of index size | `8` |
600601
| `docstore_blocksize` | Size of blocks in the docstore, in bytes. Lower values may improve doc retrieval speed, at the cost of index size | `1000000` |
@@ -687,6 +688,35 @@ indexing_settings:
687688
type: "no_merge"
688689
```
689690

691+
#### Parquet merge policy
692+
693+
*For indexes using the Parquet indexing pipeline (metrics, sketches).*
694+
695+
The Parquet merge policy controls how Parquet splits within a compaction scope (same time window, partition, and sort schema) are merged. It uses a constant write amplification strategy: splits at the same merge level are greedily accumulated until the group contains `max_merge_factor` splits or its accumulated size reaches `target_split_size_bytes`.
696+
697+
```yaml
698+
version: 0.7
699+
index_id: "my-metrics-index"
700+
# ...
701+
indexing_settings:
702+
parquet_merge_policy:
703+
merge_factor: 10
704+
max_merge_factor: 12
705+
max_merge_ops: 4
706+
target_split_size_bytes: 268435456
707+
maturation_period: 48h
708+
max_finalize_merge_operations: 3
709+
```
710+
711+
712+
| Variable | Description | Default value |
713+
| ------------- | ------------- | ------------- |
714+
| `merge_factor` | Minimum number of splits required to trigger a merge, unless `target_split_size_bytes` is reached first. | `10` |
715+
| `max_merge_factor` | Maximum number of splits in a single merge operation. | `12` |
716+
| `max_merge_ops` | Maximum number of merges a split can undergo before becoming mature. Bounds total write amplification. | `4` |
717+
| `target_split_size_bytes` | Target size for merged output splits in bytes. Merges trigger when accumulated bytes reach this threshold, even if `merge_factor` is not reached. | `268435456` (256 MiB) |
718+
| `maturation_period` | Time elapsed since creation after which a split is considered mature. Mature splits are never merged again. | `48h` |
719+
| `max_finalize_merge_operations` | *(advanced)* Maximum number of merge operations emitted during cold-window finalization at pipeline shutdown. Set to `0` to disable. | `3` |
690720

691721

692722
### Indexer memory usage

quickwit/quickwit-config/src/index_config/mod.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ use siphasher::sip::SipHasher;
3636
use tracing::warn;
3737

3838
use crate::index_config::serialize::VersionedIndexConfig;
39-
use crate::merge_policy_config::MergePolicyConfig;
39+
use crate::merge_policy_config::{MergePolicyConfig, ParquetMergePolicyConfig};
4040

4141
#[derive(Clone, Debug, Serialize, Deserialize, utoipa::ToSchema)]
4242
#[serde(deny_unknown_fields)]
@@ -118,6 +118,11 @@ pub struct IndexingSettings {
118118
pub split_num_docs_target: usize,
119119
#[serde(default)]
120120
pub merge_policy: MergePolicyConfig,
121+
/// Merge policy for Parquet (metrics/sketches) splits. Controls how
122+
/// Parquet splits are compacted within time windows. Only used by
123+
/// indexes that use the Parquet indexing pipeline.
124+
#[serde(default)]
125+
pub parquet_merge_policy: ParquetMergePolicyConfig,
121126
#[serde(default)]
122127
pub resources: IndexingResources,
123128
}
@@ -160,6 +165,7 @@ impl Default for IndexingSettings {
160165
docstore_compression_level: Self::default_docstore_compression_level(),
161166
split_num_docs_target: Self::default_split_num_docs_target(),
162167
merge_policy: MergePolicyConfig::default(),
168+
parquet_merge_policy: ParquetMergePolicyConfig::default(),
163169
resources: IndexingResources::default(),
164170
}
165171
}

quickwit/quickwit-config/src/lib.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,8 @@ use tracing::warn;
6767
use crate::index_template::IndexTemplateV0_8;
6868
pub use crate::index_template::{IndexTemplate, IndexTemplateId, VersionedIndexTemplate};
6969
use crate::merge_policy_config::{
70-
ConstWriteAmplificationMergePolicyConfig, MergePolicyConfig, StableLogMergePolicyConfig,
70+
ConstWriteAmplificationMergePolicyConfig, MergePolicyConfig, ParquetMergePolicyConfig,
71+
StableLogMergePolicyConfig,
7172
};
7273
pub use crate::metastore_config::{
7374
MetastoreBackend, MetastoreConfig, MetastoreConfigs, PostgresMetastoreConfig,
@@ -113,6 +114,7 @@ pub fn disable_ingest_v1() -> bool {
113114
KafkaSourceParams,
114115
KinesisSourceParams,
115116
MergePolicyConfig,
117+
ParquetMergePolicyConfig,
116118
PubSubSourceParams,
117119
PulsarSourceAuth,
118120
PulsarSourceParams,

quickwit/quickwit-config/src/merge_policy_config.rs

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,70 @@ impl Default for StableLogMergePolicyConfig {
119119
}
120120
}
121121

122+
// --- Parquet merge policy config ---
123+
124+
/// Serde default for `ParquetMergePolicyConfig::target_split_size_bytes`.
///
/// Returns 256 MiB expressed in bytes.
fn default_target_split_size_bytes() -> u64 {
    256 * 1024 * 1024 // 256 MiB
}
127+
128+
/// Serde default for `ParquetMergePolicyConfig::max_finalize_merge_operations`.
fn default_max_finalize_merge_operations() -> usize {
    3
}
131+
132+
/// Configuration for the Parquet (metrics/sketches) merge policy.
133+
///
134+
/// Controls how Parquet splits within a compaction scope are merged.
135+
/// Splits at the same `num_merge_ops` level are greedily accumulated
136+
/// until reaching `max_merge_factor` or `target_split_size_bytes`.
137+
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, utoipa::ToSchema)]
138+
#[serde(deny_unknown_fields)]
139+
pub struct ParquetMergePolicyConfig {
140+
/// Minimum number of splits to trigger a merge.
141+
#[serde(default = "default_merge_factor")]
142+
pub merge_factor: usize,
143+
/// Maximum number of splits in a single merge operation.
144+
#[serde(default = "default_max_merge_factor")]
145+
pub max_merge_factor: usize,
146+
/// Maximum number of merges a split can undergo before becoming mature.
147+
/// Bounds total write amplification.
148+
#[serde(default = "default_parquet_max_merge_ops")]
149+
pub max_merge_ops: u32,
150+
/// Target size for merged output splits in bytes. Merges are triggered
151+
/// when accumulated bytes reach this threshold, even if `merge_factor`
152+
/// is not reached.
153+
#[serde(default = "default_target_split_size_bytes")]
154+
pub target_split_size_bytes: u64,
155+
/// Duration after creation when a split becomes mature regardless of
156+
/// size or merge count. Mature splits are never merged.
157+
#[schema(value_type = String)]
158+
#[serde(default = "default_maturation_period")]
159+
#[serde(deserialize_with = "parse_human_duration")]
160+
#[serde(serialize_with = "serialize_duration")]
161+
pub maturation_period: Duration,
162+
/// Maximum number of merge operations emitted during cold-window
163+
/// finalization at shutdown. Set to 0 to disable.
164+
#[serde(default = "default_max_finalize_merge_operations")]
165+
#[serde(skip_serializing_if = "is_zero")]
166+
pub max_finalize_merge_operations: usize,
167+
}
168+
169+
/// Serde default for `ParquetMergePolicyConfig::max_merge_ops`.
fn default_parquet_max_merge_ops() -> u32 {
    4
}
172+
173+
impl Default for ParquetMergePolicyConfig {
174+
fn default() -> Self {
175+
Self {
176+
merge_factor: default_merge_factor(),
177+
max_merge_factor: default_max_merge_factor(),
178+
max_merge_ops: default_parquet_max_merge_ops(),
179+
target_split_size_bytes: default_target_split_size_bytes(),
180+
maturation_period: default_maturation_period(),
181+
max_finalize_merge_operations: default_max_finalize_merge_operations(),
182+
}
183+
}
184+
}
185+
122186
fn parse_human_duration<'de, D>(deserializer: D) -> Result<Duration, D::Error>
123187
where D: Deserializer<'de> {
124188
let value: String = Deserialize::deserialize(deserializer)?;

0 commit comments

Comments
 (0)