fix(merge): adapter rejects unsorted input; consumer honors SS-3; stronger test verifiers (#6426)

g-talbot · claude · g-talbot · commit 6fdced26bc0c · 2026-05-20T21:42:32.000-04:00
Three adversarial-review findings on the prefix/RG machinery, bundled
because they touch the same producer/consumer contract:

**F8: Legacy adapter rejects SS-1-violating input upfront.**
The adapter walked rows in physical order and emitted one RG per
prefix-value run. An unsorted legacy input (rows `[A,A,B,B,A,A]`)
produced a 3-RG file where two RGs shared prefix `A`, violating PA-3.
The streaming merge engine would later reject it mid-merge — but only
after a quietly-bad file had been built. Now `compute_prefix_value_slices`
tracks each slice's composite prefix-value bytes and bails with
`LegacyAdapterError::InputNotSorted` on duplicates, surfacing the
SS-1 violation before any file lands on disk.

**F12: Consumer-side SS-3 (cross-layer divergence, discovered while
wiring F2's chunk-level verifier into the SS-3 test).** The adapter
implements SS-3 correctly (missing-from-schema → synthesized NullArray
during slice computation, file stamps `prefix_len = N`). The streaming
engine's reader did not: `find_prefix_parquet_col_indices` hard-required
every named prefix column to be physically present, so a file the
adapter produced from an SS-3 input was unreadable by the merge engine.
Now `find_prefix_parquet_col_indices` returns `Vec<Option<PrefixColumn>>`
and `extract_rg_composite_prefix_key` emits a constant null marker
(`encode_byte_array_prefix(&[])`) for None slots. The column contributes
no cross-RG ordering signal (constant everywhere) so region boundaries
are driven entirely by the present columns. Both halves of SS-3 now
agree end-to-end.

Known limitation: cross-file SS-3 — where some inputs have a sort
column and others don't — uses [0x00, 0x00] for the null contribution,
which sorts BEFORE non-null per the encoded-empty-string convention.
That weakly violates SS-2 (nulls sort last). Single-file SS-3 is
correct because every RG in such a file contributes the same constant.
If cross-file SS-3 becomes a production scenario, the encoding needs
a leading-0xff sentinel instead. Not exercised today.

**F2/F9/F11: Wire `assert_unique_rg_prefix_keys` into prefix-claiming
tests.** Tests that only asserted `num_row_groups == N` and a KV stamp of
N would have passed even with an off-by-one in slice-boundary detection
or scrambled column contents. The verifier reads chunk-level statistics
directly: PA-1 (intra-RG `min == max`) + PA-3 (inter-RG uniqueness)
on the composite key. Wired into six tests:
- streaming engine: `test_streaming_merge_with_prefix_len_two`,
  `test_multi_rg_metric_aligned_input_produces_multi_rg_output`,
  `test_streaming_merge_with_desc_prefix_col`
- legacy adapter: `test_target_prefix_len_two_splits_by_metric_and_service`,
  `test_legacy_input_with_sort_fields_produces_prefix_aligned_multi_rg`,
  `test_missing_prefix_col_treated_as_null_satisfies_alignment` (now
  passes thanks to F12).

Also: `assert_unique_rg_prefix_keys` no longer short-circuits on
single-RG files — they still go through PA-1 because an unsorted
single-RG file CAN have `min != max` on a prefix column.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/quickwit/quickwit-parquet-engine/src/merge/streaming.rs b/quickwit/quickwit-parquet-engine/src/merge/streaming.rs
@@ -1893,6 +1893,17 @@ mod tests {
             outputs[0].num_row_groups, 2,
             "MergeOutputFile.num_row_groups should match physical row group count",
         );
+
+        // F2 chunk-level verification: confirm each output RG actually
+        // carries a single distinct metric_name (PA-1 + PA-3 read
+        // straight off the column-chunk statistics).
+        assert_unique_rg_prefix_keys(
+            reader.metadata(),
+            "metric_name|-timestamp_secs/V2",
+            1,
+            "test_multi_rg_metric_aligned_input_produces_multi_rg_output output",
+        )
+        .expect("streaming engine output must satisfy PA-1 + PA-3 on metric_name");
     }
 
     /// Regression for Codex P2 on PR-6410: a streaming merge output
@@ -2081,6 +2092,20 @@ mod tests {
             "three distinct (metric_name, service) pairs must produce three output RGs",
         );
         assert_eq!(outputs[0].num_row_groups, 3);
+
+        // F2 chunk-level verification: counting RGs and stamping a KV
+        // is not enough — the OUTPUT's row groups must actually be
+        // aligned on the composite (metric_name, service) prefix.
+        // `assert_unique_rg_prefix_keys` enforces PA-1 (intra-RG
+        // constancy) + PA-3 (inter-RG uniqueness) by reading the
+        // chunk-level statistics.
+        assert_unique_rg_prefix_keys(
+            reader.metadata(),
+            "metric_name|service|-timestamp_secs/V2",
+            2,
+            "test_streaming_merge_with_prefix_len_two output",
+        )
+        .expect("streaming engine output must satisfy PA-1 + PA-3 on the prefix columns");
     }
 
     /// Regression for Codex finding #1 on PR-6410: when one input
@@ -2768,6 +2793,16 @@ mod tests {
             (third_block - 1.0).abs() < 1e-9,
             "third output RG should be 'dev' (marker 1.0), got {third_block}",
         );
+
+        // F2 chunk-level verification: each output RG must be aligned
+        // on (metric_name, -env). PA-1 + PA-3 read from chunk stats.
+        assert_unique_rg_prefix_keys(
+            reader.metadata(),
+            "metric_name|-env|-timestamp_secs/V2",
+            2,
+            "test_streaming_merge_with_desc_prefix_col output",
+        )
+        .expect("DESC prefix output must satisfy PA-1 + PA-3");
     }
 
     /// Regression for the composite-key encoding when ASC and DESC
diff --git a/quickwit/quickwit-parquet-engine/src/storage/legacy_adapter.rs b/quickwit/quickwit-parquet-engine/src/storage/legacy_adapter.rs
@@ -57,6 +57,7 @@
 // deprecated items at module scope keeps that lookup direct.
 #![allow(deprecated)]
 
+use std::collections::HashMap;
 use std::io;
 use std::ops::Range;
 use std::path::{Path, PathBuf};
@@ -141,6 +142,26 @@ pub enum LegacyAdapterError {
          enough sort information to safely synthesize prefix-aligned row groups)"
     )]
     PrefixUnresolvable { target: u32, reason: String },
+
+    /// The legacy file's rows are not sorted by its declared sort schema
+    /// (SS-1 violation): two row regions in the file carry the same
+    /// composite prefix value with other prefix values in between. The
+    /// adapter walks rows in physical order and emits one RG per
+    /// prefix-value run, so an unsorted input produces multiple RGs
+    /// sharing a prefix key — which violates PA-3 (per-input uniqueness).
+    /// Bail upfront instead of producing a file the downstream merge
+    /// engine will reject mid-merge.
+    #[error(
+        "legacy input is not sorted by its declared sort schema: rows at offset {first_offset} \
+         and offset {second_offset} share composite prefix value (target_prefix_len = {target}). \
+         The adapter relies on the file being sorted per SS-1; an unsorted file would synthesize \
+         multiple row groups with the same prefix key (PA-3 violation)."
+    )]
+    InputNotSorted {
+        target: u32,
+        first_offset: usize,
+        second_offset: usize,
+    },
 }
 
 /// 4 GiB upper bound on the input file size we will buffer into RAM.
@@ -305,7 +326,7 @@ fn reencode_prefix_aligned(
     let slices = if consolidated_batch.num_rows() == 0 {
         Vec::new()
     } else {
-        compute_prefix_value_slices(&consolidated_batch, &prefix_col_indices)?
+        compute_prefix_value_slices(&consolidated_batch, &prefix_col_indices, target_prefix_len)?
     };
     let kv_with_prefix = inject_prefix_len_kv(original_kv, target_prefix_len);
     let props = build_writer_properties(
@@ -404,9 +425,19 @@ fn resolve_prefix_sort_cols(
 /// constant and contributes no transitions to the composite key —
 /// equivalent to skipping it, but kept explicit so the resulting
 /// alignment claim matches the caller's requested `target_prefix_len`.
+///
+/// Detects SS-1 violations (unsorted input) up-front: each emitted
+/// slice's composite prefix-value bytes must be unique. If two
+/// non-adjacent slices carry the same prefix value (e.g., rows
+/// `[A,A,B,B,A,A]`), the input is not sorted by its declared sort
+/// schema, so we'd synthesize a file with two RGs sharing the prefix
+/// — a PA-3 violation the downstream merge engine would reject
+/// mid-merge. Bailing here with `InputNotSorted` keeps that bad file
+/// from ever landing on disk.
 fn compute_prefix_value_slices(
     batch: &RecordBatch,
     prefix_col_indices: &[Option<usize>],
+    target_prefix_len: u32,
 ) -> Result<Vec<(usize, usize)>, LegacyAdapterError> {
     let n = batch.num_rows();
     let cols: Vec<ArrayRef> = prefix_col_indices
@@ -428,15 +459,35 @@ fn compute_prefix_value_slices(
     if n_rows == 0 {
         return Ok(Vec::new());
     }
+    // Track each emitted slice's starting prefix-value bytes; any
+    // repeat signals SS-1 violation on the input.
+    let mut seen: HashMap<Vec<u8>, usize> = HashMap::new();
     let mut slices = Vec::new();
     let mut start = 0;
+    let record_slice = |slices: &mut Vec<(usize, usize)>,
+                        seen: &mut HashMap<Vec<u8>, usize>,
+                        slice_start: usize,
+                        slice_len: usize|
+     -> Result<(), LegacyAdapterError> {
+        let key = rows.row(slice_start).as_ref().to_vec();
+        if let Some(&first_offset) = seen.get(&key) {
+            return Err(LegacyAdapterError::InputNotSorted {
+                target: target_prefix_len,
+                first_offset,
+                second_offset: slice_start,
+            });
+        }
+        seen.insert(key, slice_start);
+        slices.push((slice_start, slice_len));
+        Ok(())
+    };
     for i in 1..n_rows {
         if rows.row(i) != rows.row(i - 1) {
-            slices.push((start, i - start));
+            record_slice(&mut slices, &mut seen, start, i - start)?;
             start = i;
         }
     }
-    slices.push((start, n_rows - start));
+    record_slice(&mut slices, &mut seen, start, n_rows - start)?;
     Ok(slices)
 }
 
@@ -1363,6 +1414,19 @@ mod tests {
             Some("1"),
             "re-encoded file must declare rg_partition_prefix_len=1",
         );
+
+        // F9 chunk-level verification: the count + KV checks above
+        // would still pass if `compute_prefix_value_slices` had an
+        // off-by-one in its boundary detection. PA-1 + PA-3 on chunk
+        // statistics nail down that each RG's metric_name column is
+        // actually constant and no two RGs share a value.
+        crate::merge::streaming::region_grouping::assert_unique_rg_prefix_keys(
+            adapter.metadata(),
+            "metric_name|-timestamp_secs/V2",
+            1,
+            "test_legacy_input_with_sort_fields_produces_prefix_aligned_multi_rg adapter output",
+        )
+        .expect("adapter output must satisfy PA-1 + PA-3 on metric_name");
     }
 
     /// Single-metric legacy file: only one prefix value, so the
@@ -1660,6 +1724,19 @@ mod tests {
             Some("2"),
             "stamped prefix_len must match caller's request",
         );
+
+        // F9 chunk-level verification: a `compute_prefix_value_slices`
+        // bug splitting on only the first prefix col (or off by one)
+        // would still yield 4 RGs of [20,20,20,20] but with the wrong
+        // CONTENTS. PA-1 + PA-3 on the (metric, service) composite
+        // key verify content alignment directly.
+        crate::merge::streaming::region_grouping::assert_unique_rg_prefix_keys(
+            adapter.metadata(),
+            "metric_name|service|-timestamp_secs/V2",
+            2,
+            "test_target_prefix_len_two_splits_by_metric_and_service adapter output",
+        )
+        .expect("composite prefix output must satisfy PA-1 + PA-3");
     }
 
     /// SS-3: a sort column named in `qh.sort_fields` but missing from
@@ -1732,6 +1809,58 @@ mod tests {
         .expect("SS-3 null col must satisfy PA-1 + PA-3 (null is constant across all RGs)");
     }
 
+    /// F8 regression: an unsorted legacy input (rows
+    /// `[A,A,B,B,A,A]` on `metric_name`) violates SS-1. Walking
+    /// row-by-row to find prefix transitions would emit three slices —
+    /// `A`, `B`, `A` — and synthesize a file with two RGs sharing the
+    /// prefix value `A`, violating PA-3. The downstream streaming
+    /// merge engine would catch this later, but only once the bad
+    /// file had been built and possibly archived. The adapter must
+    /// bail upfront with `InputNotSorted` so no PA-3-violating file
+    /// ever lands on disk.
+    #[tokio::test]
+    async fn test_unsorted_legacy_input_rejected_by_adapter() {
+        // metric_name in row order: cpu.usage, memory.used, cpu.usage.
+        // That's an SS-1 violation under sort schema `metric_name ASC`.
+        let bad_metrics = [
+            ("cpu.usage", 20usize),
+            ("memory.used", 20),
+            ("cpu.usage", 20),
+        ];
+        let bytes =
+            write_sorted_multi_rg_legacy_file(&bad_metrics, "metric_name|-timestamp_secs/V2", 20);
+
+        let source = CountingInMemorySource::new(bytes);
+        let result = LegacyInputAdapter::try_open(source, dummy_path(), 1).await;
+        let Err(err) = result else {
+            panic!(
+                "unsorted legacy input must surface as InputNotSorted, got Ok(...) — the adapter \
+                 would have written a PA-3-violating file"
+            );
+        };
+        match err {
+            LegacyAdapterError::InputNotSorted {
+                target,
+                first_offset,
+                second_offset,
+            } => {
+                assert_eq!(target, 1);
+                // First `cpu.usage` run is at offset 0; second is at
+                // offset 40 (after the 20-row `cpu.usage` then 20-row
+                // `memory.used` runs).
+                assert_eq!(
+                    first_offset, 0,
+                    "first duplicate prefix offset should point at the first cpu.usage run",
+                );
+                assert_eq!(
+                    second_offset, 40,
+                    "second duplicate prefix offset should point at the second cpu.usage run",
+                );
+            }
+            other => panic!("expected InputNotSorted, got: {other}"),
+        }
+    }
+
     /// Composite-prefix fixture: rows grouped by `(metric, service)`
     /// in the order supplied. Used by the prefix_len=2 test to verify
     /// transitions on the second prefix column trigger RG splits.