Skip to content

Commit 102de51

Browse files
authored
Propagate min/max/string length statistics to duckdb (#7416)
1. Open all files eagerly in DuckDB: this is needed to get file statistics. 2. Save these statistics, if present, in the DuckDB table function bind data. 3. Use this data to propagate min/max numeric and min/max/max_length string statistics. Signed-off-by: Mikhail Kot <to@myrrc.dev>
1 parent 1ece694 commit 102de51

23 files changed

Lines changed: 438 additions & 58 deletions

File tree

vortex-array/src/stats/stats_set.rs

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -459,11 +459,28 @@ impl MutTypedStatsSetRef<'_, '_> {
459459
) {
460460
(Some(m1), Some(m2)) => {
461461
// If the combine sum is exact, then we can sum them.
462-
if let Some(scalar_value) = m1.zip(m2).as_exact().and_then(|(s1, s2)| {
463-
s1.as_primitive()
464-
.checked_add(&s2.as_primitive())
465-
.and_then(|pscalar| pscalar.pvalue().map(ScalarValue::Primitive))
466-
}) {
462+
if let Some(scalar_value) =
463+
m1.zip(m2).as_exact().and_then(|(s1, s2)| match s1.dtype() {
464+
DType::Primitive(..) => s1
465+
.as_primitive()
466+
.checked_add(&s2.as_primitive())
467+
.and_then(|pscalar| pscalar.pvalue().map(ScalarValue::Primitive)),
468+
DType::Decimal(..) => s1
469+
.as_decimal()
470+
.checked_binary_numeric(
471+
&s2.as_decimal(),
472+
crate::scalar::NumericOperator::Add,
473+
)
474+
.map(|scalar| {
475+
ScalarValue::Decimal(
476+
scalar
477+
.decimal_value()
478+
.vortex_expect("no decimal value in scalar"),
479+
)
480+
}),
481+
_ => None,
482+
})
483+
{
467484
self.set(Stat::Sum, Precision::Exact(scalar_value));
468485
}
469486
}

vortex-cuda/src/layout.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
//! A CUDA-optimized flat layout that inlines small constant array buffers into layout metadata.
55
6+
use std::any::Any;
67
use std::collections::BTreeSet;
78
use std::ops::BitAnd;
89
use std::ops::Range;
@@ -381,6 +382,10 @@ impl LayoutReader for CudaFlatReader {
381382
}
382383
.boxed())
383384
}
385+
386+
/// Returns this reader as a `&dyn Any`.
// NOTE(review): presumably exposed so callers can downcast a `LayoutReader` to its
// concrete type — confirm against the trait definition.
fn as_any(&self) -> &dyn Any {
387+
self
388+
}
384389
}
385390

386391
/// A [`LayoutStrategy`] that writes a [`CudaFlatLayout`] with constant array buffers inlined

vortex-duckdb/cpp/include/duckdb_vx/table_function.h

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,18 @@ typedef struct {
9797
bool has_max_cardinality;
9898
} duckdb_vx_node_statistics;
9999

100+
// Per-column statistics written through the `stats_out` parameter of the table
// function vtable's `statistics` callback.
typedef struct {
101+
// Minimum and maximum column values; a null handle means "not available".
// Set only for strings and primitive types.
// The numeric and string consumers in table_function.cpp release each non-null
// handle with duckdb_destroy_value after copying it into DuckDB statistics.
102+
duckdb_value min;
103+
duckdb_value max;
104+
// Packed field. Upper bit (bit 63): "length is set".
// Lower 32 bits: DuckDB's max string length.
105+
// Set only for strings.
106+
uint64_t max_string_length;
107+
// When false, the column is reported to DuckDB as unable to contain NULL values.
bool has_null;
108+
} duckdb_column_statistics;
109+
110+
typedef idx_t column_t;
111+
100112
// A transparent DuckDB table function vtable, which can be used to configure a table function.
101113
// See duckdb/include/function/tfunc.hpp for details on each field.
102114
typedef struct {
@@ -137,7 +149,12 @@ typedef struct {
137149

138150
// void *in_out_function;
139151
// void *in_out_function_final;
140-
void *statistics;
152+
153+
// false if statistics are not available
154+
bool (*statistics)(duckdb_client_context context,
155+
const void *bind_data,
156+
size_t column_index,
157+
duckdb_column_statistics *stats_out);
141158

142159
// void *dependency;
143160
void (*cardinality)(void *bind_data, duckdb_vx_node_statistics *node_stats_out);

vortex-duckdb/cpp/table_function.cpp

Lines changed: 108 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4+
#include "duckdb_vx/table_function.h"
45
#include "duckdb_vx/duckdb_diagnostics.h"
56

67
DUCKDB_INCLUDES_BEGIN
@@ -30,8 +31,10 @@ struct CTableFunctionInfo final : TableFunctionInfo {
3031
};
3132

3233
struct CTableBindData final : TableFunctionData {
33-
CTableBindData(unique_ptr<CTableFunctionInfo> info_p, unique_ptr<vortex::CData> ffi_data_p)
34-
: info(std::move(info_p)), ffi_data(std::move(ffi_data_p)) {
34+
CTableBindData(unique_ptr<CTableFunctionInfo> info_p,
35+
unique_ptr<vortex::CData> ffi_data_p,
36+
const vector<LogicalType> &types)
37+
: info(std::move(info_p)), ffi_data(std::move(ffi_data_p)), types(types) {
3538
}
3639

3740
unique_ptr<FunctionData> Copy() const override {
@@ -43,11 +46,13 @@ struct CTableBindData final : TableFunctionData {
4346
throw BinderException(IntoErrString(error_out));
4447
}
4548
return make_uniq<CTableBindData>(make_uniq<CTableFunctionInfo>(info->vtab),
46-
unique_ptr<CData>(reinterpret_cast<CData *>(copied_ffi_data)));
49+
unique_ptr<CData>(reinterpret_cast<CData *>(copied_ffi_data)),
50+
types);
4751
}
4852

4953
unique_ptr<CTableFunctionInfo> info;
5054
unique_ptr<CData> ffi_data;
55+
vector<LogicalType> types;
5156
};
5257

5358
struct CTableGlobalData final : GlobalTableFunctionState {
@@ -88,6 +93,103 @@ double c_table_scan_progress(ClientContext &context,
8893
return bind.info->vtab.table_scan_progress(c_ctx, c_bind_data, c_global_state);
8994
}
9095

96+
/// Reinterprets an opaque C-API `duckdb_value` handle as a reference to the
/// `Value` it points to.
///
/// The handle is dereferenced unconditionally, so callers must pass a non-null handle.
static Value &UnwrapValue(duckdb_value value) {
    auto *const unwrapped = reinterpret_cast<Value *>(value);
    return *unwrapped;
}
99+
100+
/// Builds DuckDB numeric statistics for a column of type `type` from FFI statistics.
///
/// Ownership: every non-null `stats.min` / `stats.max` handle is copied into the
/// result and then released with `duckdb_destroy_value`, so the caller must not
/// free those handles again.
///
/// @param stats FFI statistics; `min`/`max` are consumed, `has_null` is read.
/// @param type  Logical type of the column (must be a numeric-stats type).
/// @return Statistics with whichever of min/max were provided, plus the
///         "cannot have NULL" flag when `stats.has_null` is false.
unique_ptr<BaseStatistics> numeric_stats(duckdb_column_statistics &stats, LogicalType type) {
    // Numeric columns use DuckDB's numeric statistics layout, so the "unknown"
    // starting point must come from NumericStats rather than StringStats.
    BaseStatistics out = NumericStats::CreateUnknown(std::move(type));

    if (stats.min) {
        NumericStats::SetMin(out, UnwrapValue(stats.min));
        duckdb_destroy_value(&stats.min);
    }
    if (stats.max) {
        NumericStats::SetMax(out, UnwrapValue(stats.max));
        duckdb_destroy_value(&stats.max);
    }
    if (!stats.has_null) {
        out.Set(StatsInfo::CANNOT_HAVE_NULL_VALUES);
    }
    return out.ToUnique();
}
115+
116+
/// Builds DuckDB string statistics for a column of type `type` from FFI statistics.
///
/// Each non-null `stats.min` / `stats.max` handle is read as a string value, copied
/// into the result and then released with `duckdb_destroy_value`. The maximum string
/// length is applied only when the top bit of `stats.max_string_length` is set; the
/// length itself is taken from the low 32 bits.
unique_ptr<BaseStatistics> string_stats(duckdb_column_statistics &stats, LogicalType type) {
    // Bit 63 of `max_string_length` flags whether a length was provided at all.
    constexpr unsigned kLengthIsSetShift = 63;

    BaseStatistics result = StringStats::CreateUnknown(type);

    if (stats.min != nullptr) {
        const Value &lower_bound = UnwrapValue(stats.min);
        StringStats::SetMin(result, StringValue::Get(lower_bound));
        duckdb_destroy_value(&stats.min);
    }

    if (stats.max != nullptr) {
        const Value &upper_bound = UnwrapValue(stats.max);
        StringStats::SetMax(result, StringValue::Get(upper_bound));
        duckdb_destroy_value(&stats.max);
    }

    const bool length_is_set = (stats.max_string_length >> kLengthIsSetShift) != 0;
    if (length_is_set) {
        // Truncation to the low 32 bits is intentional: that is where the length lives.
        StringStats::SetMaxStringLength(result, static_cast<uint32_t>(stats.max_string_length));
    }

    const bool may_contain_null = stats.has_null;
    if (!may_contain_null) {
        result.Set(StatsInfo::CANNOT_HAVE_NULL_VALUES);
    }

    return result.ToUnique();
}
135+
136+
/// Builds type-appropriate "unknown" statistics for a column whose min/max are not
/// propagated, carrying over only the nullability information.
///
/// @param stats FFI statistics; only `has_null` is read, `min`/`max` are not touched.
/// @param type  Logical type of the column (any type).
/// @return Unknown statistics for `type`, flagged as "cannot have NULL" when
///         `stats.has_null` is false.
unique_ptr<BaseStatistics> base_stats(duckdb_column_statistics &stats, LogicalType type) {
    // This is the fallback for every type that is neither numeric nor string, so let
    // DuckDB choose the statistics layout matching `type` instead of forcing the
    // string layout onto it.
    BaseStatistics out = BaseStatistics::CreateUnknown(std::move(type));
    if (!stats.has_null) {
        out.Set(StatsInfo::CANNOT_HAVE_NULL_VALUES);
    }
    return out.ToUnique();
}
143+
144+
unique_ptr<BaseStatistics>
145+
c_statistics(ClientContext &context, const FunctionData *bind_data, column_t column_index) {
146+
if (IsVirtualColumn(column_index)) {
147+
return {};
148+
}
149+
150+
const auto &bind = bind_data->Cast<CTableBindData>();
151+
void *const ffi_bind = bind.ffi_data->DataPtr();
152+
153+
duckdb_client_context c_ctx = reinterpret_cast<duckdb_client_context>(&context);
154+
duckdb_column_statistics statistics = {};
155+
if (!bind.info->vtab.statistics(c_ctx, ffi_bind, column_index, &statistics)) {
156+
return {};
157+
}
158+
159+
const LogicalType type = bind.types[column_index];
160+
161+
switch (type.id()) {
162+
case LogicalTypeId::BOOLEAN:
163+
case LogicalTypeId::TINYINT:
164+
case LogicalTypeId::SMALLINT:
165+
case LogicalTypeId::INTEGER:
166+
case LogicalTypeId::BIGINT:
167+
case LogicalTypeId::FLOAT:
168+
case LogicalTypeId::DOUBLE:
169+
case LogicalTypeId::UTINYINT:
170+
case LogicalTypeId::USMALLINT:
171+
case LogicalTypeId::UINTEGER:
172+
case LogicalTypeId::UBIGINT:
173+
case LogicalTypeId::UHUGEINT:
174+
case LogicalTypeId::HUGEINT: {
175+
return numeric_stats(statistics, type);
176+
}
177+
case LogicalTypeId::VARCHAR:
178+
case LogicalTypeId::BLOB: {
179+
return string_stats(statistics, type);
180+
}
181+
case LogicalTypeId::STRUCT: {
182+
// TODO(myrrc)
183+
// Duckdb's has_null has a different semantics for structs.
184+
// If we propagate our has_null, this breaks Duckdb optimizer.
185+
// You can reproduce it in struct.slt test in vortex-sqllogictests:
186+
return {};
187+
}
188+
default:
189+
return base_stats(statistics, type);
190+
}
191+
}
192+
91193
unique_ptr<FunctionData> c_bind(ClientContext &context,
92194
TableFunctionBindInput &input,
93195
vector<LogicalType> &return_types,
@@ -111,7 +213,8 @@ unique_ptr<FunctionData> c_bind(ClientContext &context,
111213
}
112214

113215
return make_uniq<CTableBindData>(make_uniq<CTableFunctionInfo>(info.vtab),
114-
unique_ptr<CData>(reinterpret_cast<CData *>(ffi_bind_data)));
216+
unique_ptr<CData>(reinterpret_cast<CData *>(ffi_bind_data)),
217+
return_types);
115218
}
116219

117220
unique_ptr<GlobalTableFunctionState> c_init_global(ClientContext &context, TableFunctionInitInput &input) {
@@ -363,6 +466,7 @@ extern "C" duckdb_state duckdb_vx_tfunc_register(duckdb_database ffi_db, const d
363466
tf.get_virtual_columns = c_get_virtual_columns;
364467
tf.to_string = c_to_string;
365468
tf.table_scan_progress = c_table_scan_progress;
469+
tf.statistics = c_statistics;
366470

367471
// Set up the parameters
368472
tf.arguments.reserve(vtab->parameter_count);

0 commit comments

Comments
 (0)