Skip to content

Commit a64ddaf

Browse files
g-talbot and claude committed
feat(31): sort schema foundation — proto, parser, display, validation, window, TableConfig
Add the sort fields module for metrics compaction sort schema handling: - Vendored event_store_sortschema.proto (SortSchema, SortColumn, RowKeys) - Sort fields parser (direct port of Go StringToSchema) with V2-only enforcement - Display/serialization (SchemaToString, SchemaToStringShort) - Schema equivalence comparison for compaction decisions - Time-window arithmetic with rem_euclid for negative timestamp correctness - TableConfig with per-product-type default sort fields - SortFieldsError type in quickwit-proto - 97 tests including proptests for window invariants Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 440631b commit a64ddaf

File tree

19 files changed

+3467
-85
lines changed

19 files changed

+3467
-85
lines changed

quickwit/Cargo.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

quickwit/quickwit-parquet-engine/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,11 @@ license.workspace = true
1313
[dependencies]
1414
anyhow = { workspace = true }
1515
arrow = { workspace = true }
16+
chrono = { workspace = true }
1617
parquet = { workspace = true }
18+
prost = { workspace = true }
1719
quickwit-common = { workspace = true }
20+
quickwit-proto = { workspace = true }
1821
sea-query = { workspace = true, optional = true }
1922
serde = { workspace = true }
2023
serde_json = { workspace = true }

quickwit/quickwit-parquet-engine/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,10 @@ pub mod index;
2424
pub mod ingest;
2525
pub mod metrics;
2626
pub mod schema;
27+
pub mod sort_fields;
2728
pub mod split;
2829
pub mod storage;
30+
pub mod table_config;
2931

3032
#[cfg(any(test, feature = "testsuite"))]
3133
pub mod test_helpers;
Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
// Copyright 2021-Present Datadog, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
//! Column type identification from name suffixes and string names.
16+
//!
17+
//! Type can be specified via Husky-convention suffixes (`__s`, `__i`, `__nf`)
18+
//! or inferred from well-known bare names. The discriminant values match
19+
//! the Go iota exactly for cross-system interoperability.
20+
21+
use std::str::FromStr;
22+
23+
use super::SortFieldsError;
24+
25+
/// Bare column name of the timestamp column.
pub const TIMESTAMP: &str = "timestamp";

/// Bare column name of the tiebreaker column.
pub const TIEBREAKER: &str = "tiebreaker";

/// Bare column name of the timeseries ID hash column.
pub const TIMESERIES_ID: &str = "timeseries_id";

/// Bare column name of the metric value column.
pub const METRIC_VALUE: &str = "metric_value";
36+
37+
/// Column type IDs matching Go `types.TypeID` iota values.
///
/// Only the types that appear in sort schemas are included here.
/// The discriminant values MUST match Go exactly for cross-system interop,
/// so a variant must never be renumbered.
///
/// `Hash` is derived alongside `Eq` so the type can be used as a key in
/// hash-based collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u64)]
pub enum ColumnTypeId {
    /// Type name `dense-int64`, column-name suffix `__i`.
    Int64 = 2,
    /// Type name `dense-float64`, column-name suffix `__nf`.
    Float64 = 10,
    /// Type name `dense-string`, column-name suffix `__s`.
    String = 14,
    /// Type name `dense-sketch`, column-name suffix `__sk`.
    Sketch = 17,
    /// Type name `dense-cpc-sketch`, column-name suffix `__cpcsk`.
    CpcSketch = 20,
    /// Type name `dense-item-sketch`, column-name suffix `__isk`.
    ItemSketch = 22,
}
51+
52+
impl ColumnTypeId {
53+
/// The Husky-convention suffix for this column type.
54+
///
55+
/// Used when serializing back to the string format with explicit types.
56+
pub fn suffix(self) -> &'static str {
57+
match self {
58+
Self::Int64 => "__i",
59+
Self::Float64 => "__nf",
60+
Self::String => "__s",
61+
Self::Sketch => "__sk",
62+
Self::CpcSketch => "__cpcsk",
63+
Self::ItemSketch => "__isk",
64+
}
65+
}
66+
67+
/// Human-readable type name matching Go `TypeID.String()`.
68+
pub fn as_str(self) -> &'static str {
69+
match self {
70+
Self::Int64 => "dense-int64",
71+
Self::Float64 => "dense-float64",
72+
Self::String => "dense-string",
73+
Self::Sketch => "dense-sketch",
74+
Self::CpcSketch => "dense-cpc-sketch",
75+
Self::ItemSketch => "dense-item-sketch",
76+
}
77+
}
78+
79+
/// Resolve column type from a column name, stripping any type suffix.
80+
///
81+
/// Returns `(bare_name, type)`. Type resolution order:
82+
/// 1. Explicit suffix (`__s`, `__i`, `__nf`, etc.) — stripped, type from suffix
83+
/// 2. Well-known bare name defaults:
84+
/// - `timestamp`, `tiebreaker`, `timeseries_id` → Int64
85+
/// - `metric_value` → Float64
86+
/// - everything else → String
87+
pub fn from_column_name(name: &str) -> Result<(&str, Self), SortFieldsError> {
88+
// Try explicit suffixes first (longest match first to avoid ambiguity).
89+
if let Some(bare) = name.strip_suffix("__isk") {
90+
return Ok((bare, Self::ItemSketch));
91+
}
92+
if let Some(bare) = name.strip_suffix("__cpcsk") {
93+
return Ok((bare, Self::CpcSketch));
94+
}
95+
if let Some(bare) = name.strip_suffix("__sk") {
96+
return Ok((bare, Self::Sketch));
97+
}
98+
if let Some(bare) = name.strip_suffix("__nf") {
99+
return Ok((bare, Self::Float64));
100+
}
101+
if let Some(bare) = name.strip_suffix("__i") {
102+
return Ok((bare, Self::Int64));
103+
}
104+
if let Some(bare) = name.strip_suffix("__s") {
105+
return Ok((bare, Self::String));
106+
}
107+
108+
// No suffix — use well-known name defaults.
109+
Ok((name, default_type_for_name(name)))
110+
}
111+
}
112+
113+
/// Default column type and sort direction for a bare column name.
114+
///
115+
/// This is the single source of truth for well-known column defaults.
116+
/// Used by the parser (type inference, default direction), display
117+
/// (suffix omission, direction omission), and validation.
118+
pub struct ColumnDefaults {
119+
pub column_type: ColumnTypeId,
120+
/// True if the default sort direction is descending.
121+
pub descending: bool,
122+
}
123+
124+
/// Well-known name → default type and sort direction lookup table.
125+
///
126+
/// Columns not in this table default to String, ascending.
127+
static WELL_KNOWN_COLUMNS: &[(&str, ColumnDefaults)] = &[
128+
(
129+
TIMESTAMP,
130+
ColumnDefaults {
131+
column_type: ColumnTypeId::Int64,
132+
descending: true,
133+
},
134+
),
135+
(
136+
"timestamp_secs",
137+
ColumnDefaults {
138+
column_type: ColumnTypeId::Int64,
139+
descending: true,
140+
},
141+
),
142+
(
143+
TIEBREAKER,
144+
ColumnDefaults {
145+
column_type: ColumnTypeId::Int64,
146+
descending: false,
147+
},
148+
),
149+
(
150+
TIMESERIES_ID,
151+
ColumnDefaults {
152+
column_type: ColumnTypeId::Int64,
153+
descending: false,
154+
},
155+
),
156+
(
157+
METRIC_VALUE,
158+
ColumnDefaults {
159+
column_type: ColumnTypeId::Float64,
160+
descending: false,
161+
},
162+
),
163+
(
164+
"value",
165+
ColumnDefaults {
166+
column_type: ColumnTypeId::Float64,
167+
descending: false,
168+
},
169+
),
170+
];
171+
172+
const DEFAULT_COLUMN: ColumnDefaults = ColumnDefaults {
173+
column_type: ColumnTypeId::String,
174+
descending: false,
175+
};
176+
177+
/// Look up default type and direction for a bare column name.
178+
pub fn column_defaults(name: &str) -> &'static ColumnDefaults {
179+
WELL_KNOWN_COLUMNS
180+
.iter()
181+
.find(|(n, _)| *n == name)
182+
.map(|(_, d)| d)
183+
.unwrap_or(&DEFAULT_COLUMN)
184+
}
185+
186+
/// Default column type for a bare name (convenience wrapper).
187+
pub fn default_type_for_name(name: &str) -> ColumnTypeId {
188+
column_defaults(name).column_type
189+
}
190+
191+
/// Whether this bare name defaults to descending sort.
192+
pub fn default_is_descending(name: &str) -> bool {
193+
column_defaults(name).descending
194+
}
195+
196+
impl std::fmt::Display for ColumnTypeId {
197+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
198+
f.write_str(self.as_str())
199+
}
200+
}
201+
202+
/// Parse a type name string (e.g., "dense-int64") into a `ColumnTypeId`.
203+
impl FromStr for ColumnTypeId {
204+
type Err = SortFieldsError;
205+
206+
fn from_str(s: &str) -> Result<Self, Self::Err> {
207+
match s {
208+
"dense-int64" => Ok(Self::Int64),
209+
"dense-float64" => Ok(Self::Float64),
210+
"dense-string" => Ok(Self::String),
211+
"dense-sketch" => Ok(Self::Sketch),
212+
"dense-cpc-sketch" => Ok(Self::CpcSketch),
213+
"dense-item-sketch" => Ok(Self::ItemSketch),
214+
_ => Err(SortFieldsError::UnknownColumnType(format!(
215+
"unknown column type '{}'",
216+
s
217+
))),
218+
}
219+
}
220+
}
221+
222+
/// Convert a proto `column_type` u64 back to a `ColumnTypeId`.
223+
impl TryFrom<u64> for ColumnTypeId {
224+
type Error = SortFieldsError;
225+
226+
fn try_from(value: u64) -> Result<Self, Self::Error> {
227+
match value {
228+
2 => Ok(Self::Int64),
229+
10 => Ok(Self::Float64),
230+
14 => Ok(Self::String),
231+
17 => Ok(Self::Sketch),
232+
20 => Ok(Self::CpcSketch),
233+
22 => Ok(Self::ItemSketch),
234+
_ => Err(SortFieldsError::UnknownColumnType(format!(
235+
"unknown column type id: {}",
236+
value
237+
))),
238+
}
239+
}
240+
}

0 commit comments

Comments
 (0)