@@ -118,14 +118,25 @@ pub(crate) struct PrefixColumn {
118118
119119/// Resolve the first `prefix_len` sort columns to parquet leaf
120120/// indices. Honours the legacy `timestamp` → `timestamp_secs` alias.
121- /// Errors if the sort schema has fewer columns than `prefix_len` or
122- /// if any column is missing from the parquet schema.
121+ ///
122+ /// Returns one entry per requested prefix column. `Some(PrefixColumn)`
123+ /// when the column is present in the parquet schema; `None` when the
124+ /// column is named in `sort_fields_str` but absent from the parquet
125+ /// schema. Per SS-3 the missing column is treated as constant null at
126+ /// every row of the file — [`extract_rg_composite_prefix_key`]
127+ /// skips that slot entirely (no ordinal byte, no value bytes), so
128+ /// ordering is driven entirely by the present columns.
129+ ///
130+ /// Errors when `sort_fields_str` fails to parse, or when the sort
131+ /// schema declares fewer columns than requested — the latter means we
132+ /// don't have a *name* for one of the prefix columns and can't claim
133+ /// alignment on something we can't identify.
123134pub ( crate ) fn find_prefix_parquet_col_indices (
124135 metadata : & ParquetMetaData ,
125136 sort_fields_str : & str ,
126137 prefix_len : usize ,
127- input_idx : usize ,
128- ) -> Result < Vec < PrefixColumn > > {
138+ _input_idx : usize ,
139+ ) -> Result < Vec < Option < PrefixColumn > > > {
129140 let sort_field_schema = parse_sort_fields ( sort_fields_str) ?;
130141 if sort_field_schema. column . len ( ) < prefix_len {
131142 bail ! (
@@ -145,34 +156,34 @@ pub(crate) fn find_prefix_parquet_col_indices(
145156 } else {
146157 sort_col. name . as_str ( )
147158 } ;
159+ let descending = sort_col. sort_direction
160+ == quickwit_proto:: sortschema:: SortColumnDirection :: SortDirectionDescending as i32 ;
148161 let mut found = None ;
149162 for ( col_idx, col) in parquet_schema. columns ( ) . iter ( ) . enumerate ( ) {
150163 if col. path ( ) . parts ( ) [ 0 ] == resolved {
151164 found = Some ( col_idx) ;
152165 break ;
153166 }
154167 }
155- let parquet_col_idx = found. ok_or_else ( || {
156- anyhow ! (
157- "input {input_idx} parquet schema is missing prefix sort column '{}' (position \
158- {pos})",
159- sort_col. name,
160- )
161- } ) ?;
162- let descending = sort_col. sort_direction
163- == quickwit_proto:: sortschema:: SortColumnDirection :: SortDirectionDescending as i32 ;
168+ // SS-3: missing column → `None`. The composite-key extractor
169+ // skips this slot entirely (no ordinal byte, no value bytes);
170+ // the trailing prefix-length sentinel in
171+ // `extract_rg_composite_prefix_key` ensures the resulting key
172+ // still sorts cleanly relative to RGs with present values
173+ // (and matches sorted_series's row-level null-skip).
174+ //
164175 // Ordinal matches the column's position in `qh.sort_fields`.
165176 // For prefix cols (always the first `prefix_len` entries of
166177 // the sort schema) the ordinal equals the iteration index
167178 // `pos`, which is also the ordinal `sorted_series` would
168179 // assign — so the per-RG prefix key composes as a literal
169180 // byte prefix of every sorted_series key.
170- prefix_cols. push ( PrefixColumn {
181+ prefix_cols. push ( found . map ( |parquet_col_idx| PrefixColumn {
171182 name : sort_col. name . clone ( ) ,
172183 parquet_col_idx,
173184 descending,
174185 ordinal : pos as u8 ,
175- } ) ;
186+ } ) ) ;
176187 }
177188 Ok ( prefix_cols)
178189}
@@ -197,23 +208,34 @@ fn parquet_has_column(
197208/// in this RG.
198209///
199210/// Null handling:
200- /// - **All-null RG on a prefix column**: the column is skipped entirely (the next column's higher
201- /// ordinal byte appears in its place), so the RG sorts after any RG carrying a non-null value
202- /// for this column. This mirrors the row-level convention in `sorted_series` and gives
203- /// nulls-last ordering for free.
211+ /// - **Column absent from schema (`None` in `prefix_cols`)**: SS-3 case. Every row of the file has
212+ /// a constant null in this slot, so the contribution to the composite is empty (column skipped).
213+ /// The trailing prefix-length sentinel keeps the resulting key well-formed.
214+ /// - **All-null RG on a present prefix column**: column skipped for this RG (the next column's
215+ /// higher ordinal byte — or the trailing sentinel — appears in its place), so the RG sorts after
216+ /// any RG carrying a non-null value for this column. Mirrors the row-level convention in
217+ /// `sorted_series` and gives nulls-last ordering for free.
204218/// - **Mixed null + non-null in one RG**: rows in the RG would encode to two distinct prefix keys
205219/// (the non-null value's key and the column-skipped key), breaking the
206220/// at-most-one-prefix-value-per-RG invariant (PA-1). Reject.
207221/// - **No nulls**: standard `min == max` check on stats, then encode that single value.
208222pub ( crate ) fn extract_rg_composite_prefix_key (
209223 metadata : & ParquetMetaData ,
210224 rg_idx : usize ,
211- prefix_cols : & [ PrefixColumn ] ,
225+ prefix_cols : & [ Option < PrefixColumn > ] ,
212226 input_idx : usize ,
213227) -> Result < Vec < u8 > > {
214228 let rg_meta = metadata. row_group ( rg_idx) ;
215229 let mut key = Vec :: new ( ) ;
216- for col in prefix_cols {
230+ for col_opt in prefix_cols {
231+ let Some ( col) = col_opt else {
232+ // SS-3 implicit null: column absent from schema, so every
233+ // row's value is null. Skip the slot entirely — the
234+ // trailing prefix-length sentinel will keep this from
235+ // colliding with present-value keys, and sorted_series
236+ // applies the same "skip null cols" rule at the row level.
237+ continue ;
238+ } ;
217239 let chunk = rg_meta. column ( col. parquet_col_idx ) ;
218240 let stats = chunk. statistics ( ) . ok_or_else ( || {
219241 anyhow ! (
@@ -245,9 +267,8 @@ pub(crate) fn extract_rg_composite_prefix_key(
245267 bail ! (
246268 "input {input_idx} rg {rg_idx} col '{}' is NOT prefix-aligned: contains \
247269 {null_count} nulls plus {} non-null values. PA-1 requires each row group to \
248- carry a single prefix value; rows with null on this column encode to a \
249- different prefix key (with the column skipped) than rows with the non-null \
250- value.",
270+ carry a single prefix value; rows with null on this column encode to a different \
271+ prefix key (with the column skipped) than rows with the non-null value.",
251272 col. name,
252273 num_values - null_count,
253274 ) ;
@@ -259,22 +280,17 @@ pub(crate) fn extract_rg_composite_prefix_key(
259280 // Trailing prefix-length sentinel: an additional `u8(prefix_len)`
260281 // ordinal byte that does two things at once:
261282 //
262- // 1. **Forces nulls-last ordering across RGs.** For prefix_len=1
263- // an all-null RG produces an empty per-column body and would
264- // otherwise lex-sort *before* any non-null RG. With the
265- // sentinel, the all-null key becomes `[prefix_len]` and the
266- // non-null key becomes `[ord(0), storekey(value), ..., prefix_len]`.
267- // The non-null key starts with `ord(0) = 0x00`, smaller than
268- // `prefix_len >= 1`, so non-null RGs sort first — matching
269- // `sorted_series`'s row-level nulls-last convention via the
270- // same "the next ordinal byte appears in the skipped slot"
283+ // 1. **Forces nulls-last ordering across RGs.** For prefix_len=1 an all-null RG produces an
284+ // empty per-column body and would otherwise lex-sort *before* any non-null RG. With the
285+ // sentinel, the all-null key becomes `[prefix_len]` and the non-null key becomes `[ord(0),
286+ // storekey(value), ..., prefix_len]`. The non-null key starts with `ord(0) = 0x00`, smaller
287+ // than `prefix_len >= 1`, so non-null RGs sort first — matching `sorted_series`'s row-level
288+ // nulls-last convention via the same "the next ordinal byte appears in the skipped slot"
271289 // mechanism.
272- // 2. **Preserves the "literal prefix of sorted_series" property.**
273- // The byte we append is exactly what `sorted_series` writes
274- // right after the prefix columns: the ordinal of the next
275- // sort-schema column (`u8(prefix_len)`). So the per-RG key
276- // remains a byte-for-byte prefix of every row's
277- // `sorted_series` value in that RG.
290+ // 2. **Preserves the "literal prefix of sorted_series" property.** The byte we append is
291+ // exactly what `sorted_series` writes right after the prefix columns: the ordinal of the
292+ // next sort-schema column (`u8(prefix_len)`). So the per-RG key remains a byte-for-byte
293+ // prefix of every row's `sorted_series` value in that RG.
278294 storekey:: encode ( & mut key, & ( prefix_cols. len ( ) as u8 ) )
279295 . map_err ( |e| anyhow ! ( "storekey encode prefix-length sentinel: {}" , e) ) ?;
280296
@@ -581,10 +597,22 @@ pub(crate) fn extract_regions_from_metadata(
581597 . collect ( ) )
582598}
583599
584- /// Post-write check: verify the parquet file at `metadata` has no two
585- /// row groups sharing the same composite prefix key, for the first
586- /// `prefix_len` sort columns. Returns `Ok(())` immediately if
587- /// `prefix_len == 0` (no alignment claim).
600+ /// Post-write check: verify every row group in `metadata` satisfies
601+ /// the prefix-alignment claim declared by `prefix_len`.
602+ ///
603+ /// Enforces both halves of the prefix-alignment contract in one pass:
604+ /// - **PA-1 (intra-RG constancy):** within each RG, each of the first `prefix_len` sort columns has
605+ /// `min == max` (the column is constant across the RG). This is checked transitively by
606+ /// [`extract_rg_composite_prefix_key`] — it returns an error when any prefix column's chunk stats
607+ /// show `min != max`.
608+ /// - **PA-3 (inter-RG uniqueness):** no two RGs share the same composite prefix value. The
609+ /// streaming engine pairs at most one input RG per region per prefix value, so a duplicate would
610+ /// silently drop rows or corrupt the body-col / sort-col mapping.
611+ ///
612+ /// Returns `Ok(())` immediately when `prefix_len == 0` (no claim to
613+ /// verify) or `num_rgs == 0` (no RGs to check). Single-RG files are
614+ /// NOT short-circuited — they still go through PA-1 because an
615+ /// unsorted single-RG file CAN have `min != max` on a prefix column.
588616///
589617/// This is the writer-side mirror of the read-side check in
590618/// `extract_regions_from_metadata` — both indexing and the compaction
@@ -606,8 +634,8 @@ pub(crate) fn assert_unique_rg_prefix_keys(
606634 return Ok ( ( ) ) ;
607635 }
608636 let num_rgs = metadata. num_row_groups ( ) ;
609- if num_rgs <= 1 {
610- // Single -RG (or zero-RG) files vacuously satisfy the invariant .
637+ if num_rgs == 0 {
638+ // Zero -RG files vacuously satisfy both halves of the claim .
611639 return Ok ( ( ) ) ;
612640 }
613641 let prefix_cols =
0 commit comments