Skip to content

Commit ff1b561

Browse files
g-talbotclaude
andcommitted
feat(31): sort schema foundation — proto, parser, display, validation, window, TableConfig
Add the sort fields module for metrics compaction sort schema handling: - Vendored event_store_sortschema.proto (SortSchema, SortColumn, RowKeys) - Sort fields parser (direct port of Go StringToSchema) with V2-only enforcement - Display/serialization (SchemaToString, SchemaToStringShort) - Schema equivalence comparison for compaction decisions - Time-window arithmetic with rem_euclid for negative timestamp correctness - TableConfig with per-product-type default sort fields - SortFieldsError type in quickwit-proto - 97 tests including proptests for window invariants Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 440631b commit ff1b561

File tree

18 files changed

+2819
-0
lines changed

18 files changed

+2819
-0
lines changed

quickwit/Cargo.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

quickwit/quickwit-parquet-engine/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,11 @@ license.workspace = true
1313
[dependencies]
1414
anyhow = { workspace = true }
1515
arrow = { workspace = true }
16+
chrono = { workspace = true }
1617
parquet = { workspace = true }
18+
prost = { workspace = true }
1719
quickwit-common = { workspace = true }
20+
quickwit-proto = { workspace = true }
1821
sea-query = { workspace = true, optional = true }
1922
serde = { workspace = true }
2023
serde_json = { workspace = true }

quickwit/quickwit-parquet-engine/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,10 @@ pub mod index;
2424
pub mod ingest;
2525
pub mod metrics;
2626
pub mod schema;
27+
pub mod sort_fields;
2728
pub mod split;
2829
pub mod storage;
30+
pub mod table_config;
2931

3032
#[cfg(any(test, feature = "testsuite"))]
3133
pub mod test_helpers;
Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
// Copyright 2021-Present Datadog, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
//! Column type identification from name suffixes and string names.
16+
//!
17+
//! Type can be specified via Husky-convention suffixes (`__s`, `__i`, `__nf`)
18+
//! or inferred from well-known bare names. The discriminant values match
19+
//! the Go iota exactly for cross-system interoperability.
20+
21+
use std::str::FromStr;
22+
23+
use super::SortFieldsError;
24+
25+
/// Well-known column name for timestamps.
26+
pub const TIMESTAMP: &str = "timestamp";
27+
28+
/// Well-known column name for tiebreaker.
29+
pub const TIEBREAKER: &str = "tiebreaker";
30+
31+
/// Well-known column name for timeseries ID hash.
32+
pub const TIMESERIES_ID: &str = "timeseries_id";
33+
34+
/// Well-known column name for metric value.
35+
pub const METRIC_VALUE: &str = "metric_value";
36+
37+
/// Column type IDs matching Go `types.TypeID` iota values.
38+
///
39+
/// Only the types that appear in sort schemas are included here.
40+
/// The discriminant values MUST match Go exactly for cross-system interop.
41+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
42+
#[repr(u64)]
43+
pub enum ColumnTypeId {
44+
Int64 = 2,
45+
Float64 = 10,
46+
String = 14,
47+
Sketch = 17,
48+
CpcSketch = 20,
49+
ItemSketch = 22,
50+
}
51+
52+
impl ColumnTypeId {
53+
/// The Husky-convention suffix for this column type.
54+
///
55+
/// Used when serializing back to the string format with explicit types.
56+
pub fn suffix(self) -> &'static str {
57+
match self {
58+
Self::Int64 => "__i",
59+
Self::Float64 => "__nf",
60+
Self::String => "__s",
61+
Self::Sketch => "__sk",
62+
Self::CpcSketch => "__cpcsk",
63+
Self::ItemSketch => "__isk",
64+
}
65+
}
66+
67+
/// Human-readable type name matching Go `TypeID.String()`.
68+
pub fn as_str(self) -> &'static str {
69+
match self {
70+
Self::Int64 => "dense-int64",
71+
Self::Float64 => "dense-float64",
72+
Self::String => "dense-string",
73+
Self::Sketch => "dense-sketch",
74+
Self::CpcSketch => "dense-cpc-sketch",
75+
Self::ItemSketch => "dense-item-sketch",
76+
}
77+
}
78+
79+
/// Resolve column type from a column name, stripping any type suffix.
80+
///
81+
/// Returns `(bare_name, type)`. Type resolution order:
82+
/// 1. Explicit suffix (`__s`, `__i`, `__nf`, etc.) — stripped, type from suffix
83+
/// 2. Well-known bare name defaults:
84+
/// - `timestamp`, `tiebreaker`, `timeseries_id` → Int64
85+
/// - `metric_value` → Float64
86+
/// - everything else → String
87+
pub fn from_column_name(name: &str) -> Result<(&str, Self), SortFieldsError> {
88+
// Try explicit suffixes first (longest match first to avoid ambiguity).
89+
if let Some(bare) = name.strip_suffix("__isk") {
90+
return Ok((bare, Self::ItemSketch));
91+
}
92+
if let Some(bare) = name.strip_suffix("__cpcsk") {
93+
return Ok((bare, Self::CpcSketch));
94+
}
95+
if let Some(bare) = name.strip_suffix("__sk") {
96+
return Ok((bare, Self::Sketch));
97+
}
98+
if let Some(bare) = name.strip_suffix("__nf") {
99+
return Ok((bare, Self::Float64));
100+
}
101+
if let Some(bare) = name.strip_suffix("__i") {
102+
return Ok((bare, Self::Int64));
103+
}
104+
if let Some(bare) = name.strip_suffix("__s") {
105+
return Ok((bare, Self::String));
106+
}
107+
108+
// No suffix — use well-known name defaults.
109+
Ok((name, default_type_for_name(name)))
110+
}
111+
}
112+
113+
/// Default column type and sort direction for a bare column name.
114+
///
115+
/// This is the single source of truth for well-known column defaults.
116+
/// Used by the parser (type inference, default direction), display
117+
/// (suffix omission, direction omission), and validation.
118+
pub struct ColumnDefaults {
119+
pub column_type: ColumnTypeId,
120+
/// True if the default sort direction is descending.
121+
pub descending: bool,
122+
}
123+
124+
/// Well-known name → default type and sort direction lookup table.
125+
///
126+
/// Columns not in this table default to String, ascending.
127+
static WELL_KNOWN_COLUMNS: &[(&str, ColumnDefaults)] = &[
128+
(TIMESTAMP, ColumnDefaults { column_type: ColumnTypeId::Int64, descending: true }),
129+
("timestamp_secs", ColumnDefaults { column_type: ColumnTypeId::Int64, descending: true }),
130+
(TIEBREAKER, ColumnDefaults { column_type: ColumnTypeId::Int64, descending: false }),
131+
(TIMESERIES_ID, ColumnDefaults { column_type: ColumnTypeId::Int64, descending: false }),
132+
(METRIC_VALUE, ColumnDefaults { column_type: ColumnTypeId::Float64, descending: false }),
133+
("value", ColumnDefaults { column_type: ColumnTypeId::Float64, descending: false }),
134+
];
135+
136+
const DEFAULT_COLUMN: ColumnDefaults = ColumnDefaults {
137+
column_type: ColumnTypeId::String,
138+
descending: false,
139+
};
140+
141+
/// Look up default type and direction for a bare column name.
142+
pub fn column_defaults(name: &str) -> &'static ColumnDefaults {
143+
WELL_KNOWN_COLUMNS
144+
.iter()
145+
.find(|(n, _)| *n == name)
146+
.map(|(_, d)| d)
147+
.unwrap_or(&DEFAULT_COLUMN)
148+
}
149+
150+
/// Default column type for a bare name (convenience wrapper).
151+
pub fn default_type_for_name(name: &str) -> ColumnTypeId {
152+
column_defaults(name).column_type
153+
}
154+
155+
/// Whether this bare name defaults to descending sort.
156+
pub fn default_is_descending(name: &str) -> bool {
157+
column_defaults(name).descending
158+
}
159+
160+
impl std::fmt::Display for ColumnTypeId {
161+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
162+
f.write_str(self.as_str())
163+
}
164+
}
165+
166+
/// Parse a type name string (e.g., "dense-int64") into a `ColumnTypeId`.
167+
impl FromStr for ColumnTypeId {
168+
type Err = SortFieldsError;
169+
170+
fn from_str(s: &str) -> Result<Self, Self::Err> {
171+
match s {
172+
"dense-int64" => Ok(Self::Int64),
173+
"dense-float64" => Ok(Self::Float64),
174+
"dense-string" => Ok(Self::String),
175+
"dense-sketch" => Ok(Self::Sketch),
176+
"dense-cpc-sketch" => Ok(Self::CpcSketch),
177+
"dense-item-sketch" => Ok(Self::ItemSketch),
178+
_ => Err(SortFieldsError::UnknownColumnType(format!(
179+
"unknown column type '{}'",
180+
s
181+
))),
182+
}
183+
}
184+
}
185+
186+
/// Convert a proto `column_type` u64 back to a `ColumnTypeId`.
187+
impl TryFrom<u64> for ColumnTypeId {
188+
type Error = SortFieldsError;
189+
190+
fn try_from(value: u64) -> Result<Self, Self::Error> {
191+
match value {
192+
2 => Ok(Self::Int64),
193+
10 => Ok(Self::Float64),
194+
14 => Ok(Self::String),
195+
17 => Ok(Self::Sketch),
196+
20 => Ok(Self::CpcSketch),
197+
22 => Ok(Self::ItemSketch),
198+
_ => Err(SortFieldsError::UnknownColumnType(format!(
199+
"unknown column type id: {}",
200+
value
201+
))),
202+
}
203+
}
204+
}
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
// Copyright 2021-Present Datadog, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
//! Sort schema to string serialization -- direct port of Go `SchemaToString` and
16+
//! `SchemaToStringShort`.
17+
//!
18+
//! The proto `SortColumn.name` stores the bare Parquet column name (no type suffix).
19+
//! These functions reconstruct the Husky-format suffixed name for serialization
20+
//! using `SortColumn.column_type` to determine the suffix.
21+
22+
use quickwit_proto::sortschema::{SortColumn, SortColumnDirection, SortSchema};
23+
24+
use super::column_type::{ColumnTypeId, default_is_descending, default_type_for_name};
25+
26+
fn direction_str(sort_direction: i32) -> &'static str {
27+
match SortColumnDirection::try_from(sort_direction) {
28+
Ok(SortColumnDirection::SortDirectionAscending) => ":+",
29+
Ok(SortColumnDirection::SortDirectionDescending) => ":-",
30+
_ => ":???",
31+
}
32+
}
33+
34+
fn type_str(column_type: u64) -> &'static str {
35+
match ColumnTypeId::try_from(column_type) {
36+
Ok(ct) => ct.as_str(),
37+
Err(_) => "unknown",
38+
}
39+
}
40+
41+
/// Reconstruct the column name for the Husky string format.
42+
///
43+
/// Only appends the type suffix when the column's type differs from the
44+
/// default for its bare name. This keeps the string short and readable:
45+
/// `metric_name` (default String) → no suffix needed
46+
/// `timestamp` (default Int64) → no suffix needed
47+
/// `my_counter__i` → suffix needed (Int64 differs from default String)
48+
fn display_name(col: &SortColumn) -> String {
49+
let col_type = match ColumnTypeId::try_from(col.column_type) {
50+
Ok(ct) => ct,
51+
Err(_) => return col.name.clone(),
52+
};
53+
let default_type = default_type_for_name(&col.name);
54+
if col_type == default_type {
55+
col.name.clone()
56+
} else {
57+
format!("{}{}", col.name, col_type.suffix())
58+
}
59+
}
60+
61+
/// Convert a `SortSchema` to its full string representation.
62+
///
63+
/// Format: `[name=]column__suffix:type:+/-[|...][/V#]`
64+
///
65+
/// Direct port of Go `SchemaToString`.
66+
pub fn schema_to_string(schema: &SortSchema) -> String {
67+
schema_to_string_inner(schema, true)
68+
}
69+
70+
/// Convert a `SortSchema` to its short string representation.
71+
///
72+
/// Format: `[name=]column__suffix[|...][/V#]`
73+
///
74+
/// Omits the explicit type and skips the sort direction when it matches the
75+
/// default (ascending for non-timestamp, descending for timestamp).
76+
///
77+
/// Direct port of Go `SchemaToStringShort`.
78+
pub fn schema_to_string_short(schema: &SortSchema) -> String {
79+
schema_to_string_inner(schema, false)
80+
}
81+
82+
/// Shared implementation for both full and short schema string formats.
83+
///
84+
/// When `verbose` is true, includes the explicit type and always emits direction.
85+
/// When `verbose` is false, omits type and skips direction when it matches the default.
86+
fn schema_to_string_inner(schema: &SortSchema, verbose: bool) -> String {
87+
let mut rv = String::new();
88+
89+
if !schema.name.is_empty() {
90+
rv.push_str(&schema.name);
91+
rv.push('=');
92+
}
93+
94+
for (i, col) in schema.column.iter().enumerate() {
95+
if i > 0 {
96+
rv.push('|');
97+
}
98+
if schema.lsm_comparison_cutoff > 0 && i == schema.lsm_comparison_cutoff as usize {
99+
rv.push('&');
100+
}
101+
rv.push_str(&display_name(col));
102+
103+
if verbose {
104+
rv.push(':');
105+
rv.push_str(type_str(col.column_type));
106+
rv.push_str(direction_str(col.sort_direction));
107+
} else {
108+
let is_default_direction = if default_is_descending(&col.name) {
109+
col.sort_direction == SortColumnDirection::SortDirectionDescending as i32
110+
} else {
111+
col.sort_direction == SortColumnDirection::SortDirectionAscending as i32
112+
};
113+
if !is_default_direction {
114+
rv.push_str(direction_str(col.sort_direction));
115+
}
116+
}
117+
}
118+
119+
if schema.sort_version > 0 {
120+
rv.push_str(&format!("/V{}", schema.sort_version));
121+
}
122+
123+
rv
124+
}

0 commit comments

Comments
 (0)