fix(merge): adapter rejects unsorted input; consumer honors SS-3; stronger test verifiers

g-talbot · claude · g-talbot · commit a255ae3cdb80 · 2026-05-15T17:52:04.000-04:00
Three adversarial-review findings on the prefix/RG machinery, bundled
because they touch the same producer/consumer contract:

**F8: Legacy adapter rejects SS-1-violating input upfront.**
The adapter walked rows in physical order and emitted one RG per
prefix-value run. An unsorted legacy input (rows `[A,A,B,B,A,A]`)
produced a 3-RG file where two RGs shared prefix `A`, violating PA-3.
The streaming merge engine would later reject it mid-merge — but only
after a quietly-bad file had been built. Now `compute_prefix_value_slices`
tracks each slice's composite prefix-value bytes and bails with
`LegacyAdapterError::InputNotSorted` on duplicates, surfacing the
SS-1 violation before any file lands on disk.

**F12: Consumer-side SS-3 (cross-layer divergence, discovered while
wiring F2's chunk-level verifier into the SS-3 test).** The adapter
implements SS-3 correctly (missing-from-schema → synthesized NullArray
during slice computation, file stamps `prefix_len = N`). The streaming
engine's reader did not: `find_prefix_parquet_col_indices` hard-required
every named prefix column to be physically present, so a file the
adapter produced from an SS-3 input was unreadable by the merge engine.
Now `find_prefix_parquet_col_indices` returns `Vec<Option<PrefixColumn>>`
and `extract_rg_composite_prefix_key` emits a constant null marker
(`encode_byte_array_prefix(&[])`) for None slots. The column contributes
no cross-RG ordering signal (constant everywhere) so region boundaries
are driven entirely by the present columns. Both halves of SS-3 now
agree end-to-end.

Known limitation: cross-file SS-3 — where some inputs have a sort
column and others don't — uses [0x00, 0x00] for the null contribution,
which sorts BEFORE non-null per the encoded-empty-string convention.
That weakly violates SS-2 (nulls sort last). Single-file SS-3 is
correct because every RG in such a file contributes the same constant.
If cross-file SS-3 becomes a production scenario, the encoding needs
a leading-0xff sentinel instead. Not exercised today.

**F2/F9/F11: Wire `assert_unique_rg_prefix_keys` into prefix-claiming
tests.** Tests asserting `num_row_groups == N` + KV stamped to N would
have passed even with an off-by-one in slice-boundary detection or
column-content scrambling. The verifier reads chunk-level statistics
directly: PA-1 (intra-RG `min == max`) + PA-3 (inter-RG uniqueness)
on the composite key. Wired into six tests:
- streaming engine: `test_streaming_merge_with_prefix_len_two`,
  `test_multi_rg_metric_aligned_input_produces_multi_rg_output`,
  `test_streaming_merge_with_desc_prefix_col`
- legacy adapter: `test_target_prefix_len_two_splits_by_metric_and_service`,
  `test_legacy_input_with_sort_fields_produces_prefix_aligned_multi_rg`,
  `test_missing_prefix_col_treated_as_null_satisfies_alignment` (now
  passes thanks to F12).

Also: `assert_unique_rg_prefix_keys` no longer short-circuits on
single-RG files — they still go through PA-1 because an unsorted
single-RG file CAN have `min != max` on a prefix column.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/quickwit/quickwit-parquet-engine/src/merge/streaming.rs b/quickwit/quickwit-parquet-engine/src/merge/streaming.rs
@@ -1893,6 +1893,17 @@ mod tests {
             outputs[0].num_row_groups, 2,
             "MergeOutputFile.num_row_groups should match physical row group count",
         );
+
+        // F2 chunk-level verification: confirm each output RG actually
+        // carries a single distinct metric_name (PA-1 + PA-3 read
+        // straight off the column-chunk statistics).
+        assert_unique_rg_prefix_keys(
+            reader.metadata(),
+            "metric_name|-timestamp_secs/V2",
+            1,
+            "test_multi_rg_metric_aligned_input_produces_multi_rg_output output",
+        )
+        .expect("streaming engine output must satisfy PA-1 + PA-3 on metric_name");
     }
 
     /// Regression for Codex P2 on PR-6410: a streaming merge output
@@ -2068,6 +2079,20 @@ mod tests {
             "three distinct (metric_name, service) pairs must produce three output RGs",
         );
         assert_eq!(outputs[0].num_row_groups, 3);
+
+        // F2 chunk-level verification: counting RGs and stamping a KV
+        // is not enough — the OUTPUT's row groups must actually be
+        // aligned on the composite (metric_name, service) prefix.
+        // `assert_unique_rg_prefix_keys` enforces PA-1 (intra-RG
+        // constancy) + PA-3 (inter-RG uniqueness) by reading the
+        // chunk-level statistics.
+        assert_unique_rg_prefix_keys(
+            reader.metadata(),
+            "metric_name|service|-timestamp_secs/V2",
+            2,
+            "test_streaming_merge_with_prefix_len_two output",
+        )
+        .expect("streaming engine output must satisfy PA-1 + PA-3 on the prefix columns");
     }
 
     /// Regression for Codex finding #1 on PR-6410: when one input
@@ -2784,6 +2809,16 @@ mod tests {
             (third_block - 1.0).abs() < 1e-9,
             "third output RG should be 'dev' (marker 1.0), got {third_block}",
         );
+
+        // F2 chunk-level verification: each output RG must be aligned
+        // on (metric_name, -env). PA-1 + PA-3 read from chunk stats.
+        assert_unique_rg_prefix_keys(
+            reader.metadata(),
+            "metric_name|-env|-timestamp_secs/V2",
+            2,
+            "test_streaming_merge_with_desc_prefix_col output",
+        )
+        .expect("DESC prefix output must satisfy PA-1 + PA-3");
     }
 
     /// Regression for the composite-key encoding when ASC and DESC
@@ -2805,7 +2840,10 @@ mod tests {
                 .expect("resolve");
         // Sanity: the second prefix column must be flagged DESC.
         assert!(
-            prefix_cols[1].descending,
+            prefix_cols[1]
+                .as_ref()
+                .expect("env present in this fixture")
+                .descending,
             "env must be parsed as DESC from sort schema",
         );
 
diff --git a/quickwit/quickwit-parquet-engine/src/merge/streaming/region_grouping.rs b/quickwit/quickwit-parquet-engine/src/merge/streaming/region_grouping.rs
@@ -112,14 +112,25 @@ pub(crate) struct PrefixColumn {
 
 /// Resolve the first `prefix_len` sort columns to parquet leaf
 /// indices. Honours the legacy `timestamp` → `timestamp_secs` alias.
-/// Errors if the sort schema has fewer columns than `prefix_len` or
-/// if any column is missing from the parquet schema.
+///
+/// Returns one entry per requested prefix column. `Some(PrefixColumn)`
+/// when the column is present in the parquet schema; `None` when the
+/// column is named in `sort_fields_str` but absent from the parquet
+/// schema. Per SS-3 the missing column is treated as constant null at
+/// every row of the file — [`extract_rg_composite_prefix_key`]
+/// synthesizes a fixed byte sequence in that slot so ordering is
+/// driven entirely by the present columns.
+///
+/// Errors only when the sort schema declares fewer columns than
+/// requested — that means we don't have a *name* for one of the
+/// prefix columns and can't claim alignment on something we can't
+/// identify.
 pub(crate) fn find_prefix_parquet_col_indices(
     metadata: &ParquetMetaData,
     sort_fields_str: &str,
     prefix_len: usize,
-    input_idx: usize,
-) -> Result<Vec<PrefixColumn>> {
+    _input_idx: usize,
+) -> Result<Vec<Option<PrefixColumn>>> {
     let sort_field_schema = parse_sort_fields(sort_fields_str)?;
     if sort_field_schema.column.len() < prefix_len {
         bail!(
@@ -129,7 +140,7 @@ pub(crate) fn find_prefix_parquet_col_indices(
     }
     let parquet_schema = metadata.file_metadata().schema_descr();
     let mut prefix_cols = Vec::with_capacity(prefix_len);
-    for (pos, sort_col) in sort_field_schema.column.iter().take(prefix_len).enumerate() {
+    for sort_col in sort_field_schema.column.iter().take(prefix_len) {
         // Apply the same `timestamp` / `timestamp_secs` alias the rest
         // of the engine uses.
         let resolved = if is_timestamp_column_name(&sort_col.name)
@@ -139,27 +150,23 @@ pub(crate) fn find_prefix_parquet_col_indices(
         } else {
             sort_col.name.as_str()
         };
+        let descending = sort_col.sort_direction
+            == quickwit_proto::sortschema::SortColumnDirection::SortDirectionDescending as i32;
         let mut found = None;
         for (col_idx, col) in parquet_schema.columns().iter().enumerate() {
             if col.path().parts()[0] == resolved {
                 found = Some(col_idx);
                 break;
             }
         }
-        let parquet_col_idx = found.ok_or_else(|| {
-            anyhow!(
-                "input {input_idx} parquet schema is missing prefix sort column '{}' (position \
-                 {pos})",
-                sort_col.name,
-            )
-        })?;
-        let descending = sort_col.sort_direction
-            == quickwit_proto::sortschema::SortColumnDirection::SortDirectionDescending as i32;
-        prefix_cols.push(PrefixColumn {
+        // SS-3: missing column → None. Caller treats it as constant
+        // null at every row, which trivially satisfies alignment on
+        // that column.
+        prefix_cols.push(found.map(|parquet_col_idx| PrefixColumn {
             name: sort_col.name.clone(),
             parquet_col_idx,
             descending,
-        });
+        }));
     }
     Ok(prefix_cols)
 }
@@ -179,19 +186,36 @@ fn parquet_has_column(
 /// prefix column's value bytes in declared order, with each column's
 /// encoding chosen so that lexicographic order on the composite
 /// matches the sort schema's order across the prefix columns. Each
-/// column is required to be **constant within the RG** — either
-/// `min == max` on the non-null cells with zero nulls, or all rows
-/// null. A mix of nulls and non-nulls in the same RG breaks the
-/// at-most-one-prefix-value-per-RG invariant (PA-1) and is rejected.
+/// present column is required to be **constant within the RG** —
+/// either `min == max` on the non-null cells with zero nulls, or
+/// all rows null. A mix of nulls and non-nulls in the same RG
+/// breaks the at-most-one-prefix-value-per-RG invariant (PA-1) and
+/// is rejected by [`extract_aligned_prefix_value`].
+///
+/// A `None` slot in `prefix_cols` represents an SS-3 case: the
+/// column is declared in `qh.sort_fields` but absent from the
+/// parquet schema. Per SS-3 every row's value in that column is
+/// implicitly null. Since the value is constant across all RGs in
+/// the file, we contribute a fixed byte sequence (the encoded
+/// empty value) in that slot — ordering on this column does no
+/// work, and ordering on the other prefix columns picks the region
+/// boundaries.
 pub(crate) fn extract_rg_composite_prefix_key(
     metadata: &ParquetMetaData,
     rg_idx: usize,
-    prefix_cols: &[PrefixColumn],
+    prefix_cols: &[Option<PrefixColumn>],
     input_idx: usize,
 ) -> Result<Vec<u8>> {
     let rg_meta = metadata.row_group(rg_idx);
     let mut key = Vec::new();
-    for col in prefix_cols {
+    for col_opt in prefix_cols {
+        let Some(col) = col_opt else {
+            // SS-3 implicit null: constant for every RG, so any fixed
+            // marker works. Use the encoded empty byte string so the
+            // contribution is byte-recognizable in dumps.
+            key.extend_from_slice(&encode_byte_array_prefix(&[]));
+            continue;
+        };
         let chunk = rg_meta.column(col.parquet_col_idx);
         let stats = chunk.statistics().ok_or_else(|| {
             anyhow!(
@@ -569,10 +593,22 @@ pub(crate) fn extract_regions_from_metadata(
         .collect())
 }
 
-/// Post-write check: verify the parquet file at `metadata` has no two
-/// row groups sharing the same composite prefix key, for the first
-/// `prefix_len` sort columns. Returns `Ok(())` immediately if
-/// `prefix_len == 0` (no alignment claim).
+/// Post-write check: verify every row group in `metadata` satisfies
+/// the prefix-alignment claim declared by `prefix_len`.
+///
+/// Enforces both halves of the prefix-alignment contract in one pass:
+/// - **PA-1 (intra-RG constancy):** within each RG, each of the first `prefix_len` sort columns has
+///   `min == max` (the column is constant across the RG). This is checked transitively by
+///   [`extract_rg_composite_prefix_key`] — it returns an error when any prefix column's chunk stats
+///   show `min != max`.
+/// - **PA-3 (inter-RG uniqueness):** no two RGs share the same composite prefix value. The
+///   streaming engine pairs at most one input RG per region per prefix value, so a duplicate would
+///   silently drop rows or corrupt the body-col / sort-col mapping.
+///
+/// Returns `Ok(())` immediately when `prefix_len == 0` (no claim to
+/// verify) or `num_rgs == 0` (no RGs to check). Single-RG files are
+/// NOT short-circuited — they still go through PA-1 because an
+/// unsorted single-RG file CAN have `min != max` on a prefix column.
 ///
 /// This is the writer-side mirror of the read-side check in
 /// `extract_regions_from_metadata` — both indexing and the compaction
@@ -594,8 +630,8 @@ pub(crate) fn assert_unique_rg_prefix_keys(
         return Ok(());
     }
     let num_rgs = metadata.num_row_groups();
-    if num_rgs <= 1 {
-        // Single-RG (or zero-RG) files vacuously satisfy the invariant.
+    if num_rgs == 0 {
+        // Zero-RG files vacuously satisfy both halves of the claim.
         return Ok(());
     }
     let prefix_cols =
diff --git a/quickwit/quickwit-parquet-engine/src/storage/legacy_adapter.rs b/quickwit/quickwit-parquet-engine/src/storage/legacy_adapter.rs