Skip to content

Commit 69e130d

Browse files
committed
initial
Signed-off-by: Mikhail Kot <to@myrrc.dev>
1 parent 3e6834e commit 69e130d

15 files changed

Lines changed: 358 additions & 57 deletions

File tree

vortex-array/src/scalar/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,12 @@ use crate::dtype::DType;
3838
#[derive(Clone, Debug, Eq)]
3939
pub struct Scalar {
4040
/// The type of the scalar.
41-
dtype: DType,
41+
pub dtype: DType,
4242

4343
/// The value of the scalar. This is [`None`] if the value is null, otherwise it is [`Some`].
4444
///
4545
/// Invariant: If the [`DType`] is non-nullable, then this value _cannot_ be [`None`].
46-
value: Option<ScalarValue>,
46+
pub value: Option<ScalarValue>,
4747
}
4848

4949
#[cfg(test)]

vortex-duckdb/cpp/include/duckdb_vx/table_function.h

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,18 @@ typedef struct {
9797
bool has_max_cardinality;
9898
} duckdb_vx_node_statistics;
9999

100+
typedef struct {
101+
duckdb_value min;
102+
duckdb_value max;
103+
// DuckDB's max string length (a uint32_t) is stored in the lower 32 bits.
104+
// The highest bit (bit 63) is set when a max string length is present.
105+
uint64_t max_string_length;
106+
} duckdb_column_statistics;
107+
108+
#define U64_HIGHEST_BIT_SET(x) (((x) & (UINT64_C(1) << 63)) != 0)
109+
110+
typedef idx_t column_t;
111+
100112
// A transparent DuckDB table function vtable, which can be used to configure a table function.
101113
// See duckdb/include/function/tfunc.hpp for details on each field.
102114
typedef struct {
@@ -137,7 +149,11 @@ typedef struct {
137149

138150
// void *in_out_function;
139151
// void *in_out_function_final;
140-
void *statistics;
152+
153+
void (*statistics)(duckdb_client_context context,
154+
const void *bind_data,
155+
size_t column_index,
156+
duckdb_column_statistics *stats_out);
141157

142158
// void *dependency;
143159
void (*cardinality)(void *bind_data, duckdb_vx_node_statistics *node_stats_out);

vortex-duckdb/cpp/table_function.cpp

Lines changed: 80 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4+
#include "duckdb_vx/table_function.h"
45
#include "duckdb_vx/duckdb_diagnostics.h"
56

67
DUCKDB_INCLUDES_BEGIN
@@ -30,8 +31,10 @@ struct CTableFunctionInfo final : TableFunctionInfo {
3031
};
3132

3233
struct CTableBindData final : TableFunctionData {
33-
CTableBindData(unique_ptr<CTableFunctionInfo> info_p, unique_ptr<vortex::CData> ffi_data_p)
34-
: info(std::move(info_p)), ffi_data(std::move(ffi_data_p)) {
34+
CTableBindData(unique_ptr<CTableFunctionInfo> info_p,
35+
unique_ptr<vortex::CData> ffi_data_p,
36+
const vector<LogicalType> &types)
37+
: info(std::move(info_p)), ffi_data(std::move(ffi_data_p)), types(types) {
3538
}
3639

3740
unique_ptr<FunctionData> Copy() const override {
@@ -43,11 +46,13 @@ struct CTableBindData final : TableFunctionData {
4346
throw BinderException(IntoErrString(error_out));
4447
}
4548
return make_uniq<CTableBindData>(make_uniq<CTableFunctionInfo>(info->vtab),
46-
unique_ptr<CData>(reinterpret_cast<CData *>(copied_ffi_data)));
49+
unique_ptr<CData>(reinterpret_cast<CData *>(copied_ffi_data)),
50+
types);
4751
}
4852

4953
unique_ptr<CTableFunctionInfo> info;
5054
unique_ptr<CData> ffi_data;
55+
vector<LogicalType> types;
5156
};
5257

5358
struct CTableGlobalData final : GlobalTableFunctionState {
@@ -88,6 +93,75 @@ double c_table_scan_progress(ClientContext &context,
8893
return bind.info->vtab.table_scan_progress(c_ctx, c_bind_data, c_global_state);
8994
}
9095

96+
// Reinterprets an opaque C API `duckdb_value` handle as the C++ `Value` it points to.
// The handle is dereferenced unconditionally, so callers must pass a non-null handle.
static Value &UnwrapValue(duckdb_value value) {
    auto *inner = reinterpret_cast<Value *>(value);
    return *inner;
}
99+
100+
// Converts the min/max values reported by the FFI statistics callback into
// DuckDB numeric statistics for a column of the given type.
//
// Each non-null `stats.min` / `stats.max` is applied to the result and then
// released with `duckdb_destroy_value`, so the caller must not free them again.
unique_ptr<BaseStatistics> numeric_stats(duckdb_column_statistics &stats, LogicalType type) {
    // Numeric columns must start from NumericStats: StringStats::CreateUnknown
    // sets up the string-specific payload rather than the numeric one.
    BaseStatistics out = NumericStats::CreateUnknown(type);
    if (stats.min) {
        NumericStats::SetMin(out, UnwrapValue(stats.min));
        duckdb_destroy_value(&stats.min);
    }
    if (stats.max) {
        NumericStats::SetMax(out, UnwrapValue(stats.max));
        duckdb_destroy_value(&stats.max);
    }
    return out.ToUnique();
}
112+
113+
// Converts the values reported by the FFI statistics callback into DuckDB
// string statistics for a column of the given type.
//
// Each non-null `stats.min` / `stats.max` is applied to the result and then
// released with `duckdb_destroy_value`.
unique_ptr<BaseStatistics> string_stats(duckdb_column_statistics &stats, LogicalType type) {
    BaseStatistics result = StringStats::CreateUnknown(type);

    if (stats.min != nullptr) {
        StringStats::SetMin(result, StringValue::Get(UnwrapValue(stats.min)));
        duckdb_destroy_value(&stats.min);
    }

    if (stats.max != nullptr) {
        StringStats::SetMax(result, StringValue::Get(UnwrapValue(stats.max)));
        duckdb_destroy_value(&stats.max);
    }

    // The top bit flags that a length was provided; the narrowing conversion
    // keeps only the lower 32 bits, which hold the length itself.
    const bool has_max_length = U64_HIGHEST_BIT_SET(stats.max_string_length);
    if (has_max_length) {
        StringStats::SetMaxStringLength(result, uint32_t(stats.max_string_length));
    }

    return result.ToUnique();
}
128+
129+
// DuckDB `statistics` callback: asks the vtable's FFI `statistics` function for
// the column's min/max and converts the answer into DuckDB statistics.
//
// Only numeric-like and string-like column types are forwarded to the FFI
// callback; every other type yields unknown statistics without calling it.
unique_ptr<BaseStatistics>
c_statistics(ClientContext &context, const FunctionData *bind_data, column_t column_index) {
    const auto &table_bind = bind_data->Cast<CTableBindData>();
    void *const ffi_bind_ptr = table_bind.ffi_data->DataPtr();

    duckdb_client_context ffi_context = reinterpret_cast<duckdb_client_context>(&context);
    duckdb_column_statistics raw_stats = {};
    const LogicalType column_type = table_bind.types[column_index];

    // First decide which statistics layout applies to this column type.
    bool is_string_like = false;
    switch (column_type.id()) {
    case LogicalTypeId::BOOLEAN:
    case LogicalTypeId::TINYINT:
    case LogicalTypeId::SMALLINT:
    case LogicalTypeId::INTEGER:
    case LogicalTypeId::BIGINT:
    case LogicalTypeId::FLOAT:
    case LogicalTypeId::DOUBLE:
    case LogicalTypeId::UTINYINT:
    case LogicalTypeId::USMALLINT:
    case LogicalTypeId::UINTEGER:
    case LogicalTypeId::UBIGINT:
    case LogicalTypeId::UHUGEINT:
    case LogicalTypeId::HUGEINT:
        break;
    case LogicalTypeId::VARCHAR:
    case LogicalTypeId::BLOB:
        is_string_like = true;
        break;
    default:
        return BaseStatistics::CreateUnknown(column_type).ToUnique();
    }

    // Supported type: fetch the raw statistics once, then convert them.
    table_bind.info->vtab.statistics(ffi_context, ffi_bind_ptr, column_index, &raw_stats);
    if (is_string_like) {
        return string_stats(raw_stats, column_type);
    }
    return numeric_stats(raw_stats, column_type);
}
164+
91165
unique_ptr<FunctionData> c_bind(ClientContext &context,
92166
TableFunctionBindInput &input,
93167
vector<LogicalType> &return_types,
@@ -111,7 +185,8 @@ unique_ptr<FunctionData> c_bind(ClientContext &context,
111185
}
112186

113187
return make_uniq<CTableBindData>(make_uniq<CTableFunctionInfo>(info.vtab),
114-
unique_ptr<CData>(reinterpret_cast<CData *>(ffi_bind_data)));
188+
unique_ptr<CData>(reinterpret_cast<CData *>(ffi_bind_data)),
189+
return_types);
115190
}
116191

117192
unique_ptr<GlobalTableFunctionState> c_init_global(ClientContext &context, TableFunctionInitInput &input) {
@@ -363,6 +438,7 @@ extern "C" duckdb_state duckdb_vx_tfunc_register(duckdb_database ffi_db, const d
363438
tf.get_virtual_columns = c_get_virtual_columns;
364439
tf.to_string = c_to_string;
365440
tf.table_scan_progress = c_table_scan_progress;
441+
tf.statistics = c_statistics;
366442

367443
// Set up the parameters
368444
tf.arguments.reserve(vtab->parameter_count);

vortex-duckdb/src/datasource.rs

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ use vortex::expr::col;
3737
use vortex::expr::root;
3838
use vortex::expr::select;
3939
use vortex::expr::stats::Precision;
40+
use vortex::file::FileStatistics;
4041
use vortex::io::kanal_ext::KanalExt;
4142
use vortex::io::runtime::BlockingRuntime;
4243
use vortex::io::runtime::current::ThreadSafeIterator;
@@ -54,6 +55,7 @@ use crate::duckdb::BindInputRef;
5455
use crate::duckdb::BindResultRef;
5556
use crate::duckdb::Cardinality;
5657
use crate::duckdb::ClientContextRef;
58+
use crate::duckdb::ColumnStatistics;
5759
use crate::duckdb::DataChunkRef;
5860
use crate::duckdb::ExpressionRef;
5961
use crate::duckdb::LogicalType;
@@ -97,7 +99,10 @@ pub(crate) trait DataSourceTableFunction: Sized + Debug {
9799
}
98100

99101
/// Bind the table function and return a [`DataSourceRef`].
100-
fn bind(ctx: &ClientContextRef, input: &BindInputRef) -> VortexResult<DataSourceRef>;
102+
fn bind(
103+
ctx: &ClientContextRef,
104+
input: &BindInputRef,
105+
) -> VortexResult<(DataSourceRef, FileStatistics)>;
101106
}
102107

103108
/// Bind data produced by a [`DataSourceTableFunction`].
@@ -106,6 +111,7 @@ pub struct DataSourceBindData {
106111
filter_exprs: Vec<Expression>,
107112
column_names: Vec<String>,
108113
column_types: Vec<LogicalType>,
114+
stats: Vec<ColumnStatistics>,
109115
}
110116

111117
impl Clone for DataSourceBindData {
@@ -116,6 +122,7 @@ impl Clone for DataSourceBindData {
116122
filter_exprs: vec![],
117123
column_names: self.column_names.clone(),
118124
column_types: self.column_types.clone(),
125+
stats: self.stats.clone(),
119126
}
120127
}
121128
}
@@ -189,19 +196,24 @@ impl<T: DataSourceTableFunction> TableFunction for T {
189196
input: &BindInputRef,
190197
result: &mut BindResultRef,
191198
) -> VortexResult<Self::BindData> {
192-
let data_source = T::bind(ctx, input)?;
199+
let (data_source, file_stats) = T::bind(ctx, input)?;
193200

194201
let (column_names, column_types) = extract_schema_from_dtype(data_source.dtype())?;
195202

196-
for (column_name, column_type) in column_names.iter().zip(&column_types) {
203+
let mut stats = Vec::new();
204+
for (i, (column_name, column_type)) in column_names.iter().zip(&column_types).enumerate() {
197205
result.add_result_column(column_name, column_type);
206+
let stats_set = &file_stats.stats_sets()[i];
207+
let dtype = &file_stats.dtypes()[i];
208+
stats.push(ColumnStatistics::new(&stats_set, dtype.clone()));
198209
}
199210

200211
Ok(DataSourceBindData {
201212
data_source,
202213
filter_exprs: vec![],
203214
column_names,
204215
column_types,
216+
stats,
205217
})
206218
}
207219

@@ -412,6 +424,14 @@ impl<T: DataSourceTableFunction> TableFunction for T {
412424
Ok(false)
413425
}
414426

427+
/// Returns a copy of the statistics stored in the bind data for `column_index`.
///
/// # Panics
///
/// Panics if `column_index` is out of range for `bind_data.stats`.
fn statistics(
    _client_context: &ClientContextRef,
    bind_data: &Self::BindData,
    column_index: usize,
) -> ColumnStatistics {
    let column_stats = &bind_data.stats[column_index];
    column_stats.clone()
}
434+
415435
fn cardinality(bind_data: &Self::BindData) -> Cardinality {
416436
match bind_data.data_source.row_count() {
417437
Some(Precision::Exact(v)) => Cardinality::Maximum(v),

vortex-duckdb/src/duckdb/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ mod reusable_dict;
1919
mod scalar_function;
2020
mod selection_vector;
2121
mod table_filter;
22-
mod table_function;
22+
// TODO(myrrc) should it be private?
23+
pub mod table_function;
2324
mod value;
2425
mod vector;
2526
mod vector_buffer;

vortex-duckdb/src/duckdb/table_function/mod.rs

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,26 @@ use std::ffi::c_void;
77
use std::fmt::Debug;
88
use std::ptr;
99

10+
use vortex::dtype::DType;
1011
use vortex::error::VortexExpect;
1112
use vortex::error::VortexResult;
1213
mod bind;
1314
mod cardinality;
1415
mod init;
1516
mod partition;
1617
mod pushdown_complex_filter;
18+
mod statistics;
1719
mod table_scan_progress;
1820
mod virtual_columns;
1921

2022
pub use bind::*;
2123
pub use init::*;
2224
pub use virtual_columns::VirtualColumnsResult;
2325
pub use virtual_columns::VirtualColumnsResultRef;
26+
use vortex::array::stats::StatsSet;
27+
use vortex::expr::stats::Precision;
28+
use vortex::expr::stats::Stat;
29+
use vortex::scalar::ScalarValue;
2430

2531
use crate::cpp;
2632
use crate::cpp::duckdb_client_context;
@@ -34,10 +40,50 @@ use crate::duckdb::expr::ExpressionRef;
3440
use crate::duckdb::table_function::cardinality::cardinality_callback;
3541
use crate::duckdb::table_function::partition::get_partition_data_callback;
3642
use crate::duckdb::table_function::pushdown_complex_filter::pushdown_complex_filter_callback;
43+
use crate::duckdb::table_function::statistics::statistics;
3744
use crate::duckdb::table_function::table_scan_progress::table_scan_progress_callback;
3845
use crate::duckdb::table_function::virtual_columns::get_virtual_columns_callback;
3946
use crate::duckdb_try;
4047

48+
#[derive(Clone)]
49+
pub struct ColumnStatistics {
50+
pub minmax_dtype: DType,
51+
pub min: Option<ScalarValue>,
52+
pub max: Option<ScalarValue>,
53+
pub max_string_length: Option<u32>,
54+
}
55+
56+
impl ColumnStatistics {
57+
pub fn new(stats: &StatsSet, dtype: DType) -> Self {
58+
let min = if let Some(Precision::Exact(value)) = stats.get(Stat::Min) {
59+
Some(value)
60+
} else {
61+
None
62+
};
63+
64+
let max = if let Some(Precision::Exact(value)) = stats.get(Stat::Max) {
65+
Some(value)
66+
} else {
67+
None
68+
};
69+
70+
// TODO(myrrc): does it calculate string length?
71+
let max_string_length =
72+
if let Some(Precision::Exact(value)) = stats.get(Stat::UncompressedSizeInBytes) {
73+
Some(value.as_primitive().as_u64().expect("not a u64") as u32)
74+
} else {
75+
None
76+
};
77+
78+
Self {
79+
minmax_dtype: dtype,
80+
min,
81+
max,
82+
max_string_length,
83+
}
84+
}
85+
}
86+
4187
/// A trait that defines the supported operations for a table function in DuckDB.
4288
///
4389
/// This trait does not yet cover the full C++ API, see table_function.hpp.
@@ -84,6 +130,12 @@ pub trait TableFunction: Sized + Debug {
84130
result: &mut BindResultRef,
85131
) -> VortexResult<Self::BindData>;
86132

133+
fn statistics(
134+
client_context: &ClientContextRef,
135+
bind_data: &Self::BindData,
136+
column_index: usize,
137+
) -> ColumnStatistics;
138+
87139
/// The function is called during query execution and is responsible for producing the output
88140
fn scan(
89141
client_context: &ClientContextRef,
@@ -188,7 +240,7 @@ impl DatabaseRef {
188240
init_global: Some(init_global_callback::<T>),
189241
init_local: Some(init_local_callback::<T>),
190242
function: Some(function::<T>),
191-
statistics: ptr::null_mut::<c_void>(),
243+
statistics: Some(statistics::<T>),
192244
cardinality: Some(cardinality_callback::<T>),
193245
pushdown_complex_filter: Some(pushdown_complex_filter_callback::<T>),
194246
pushdown_expression: ptr::null_mut::<c_void>(),

0 commit comments

Comments
 (0)