Skip to content

Commit 5b9dbbf

Browse files
committed
Integrated with Identifier and MaxLogicalType, and using SetChildCardinality
1 parent 55155d7 commit 5b9dbbf

29 files changed

Lines changed: 331 additions & 296 deletions

src/duckdb_py/arrow/filter_pushdown_visitor.cpp

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,11 @@ bool ValueIsNan(const Value &value) {
2828
// `struct_extract` chains. Anything else throws NotImplementedException —
2929
// that gives the OPTIONAL_FILTER catch point a chance to swallow it.
3030
struct ResolvedColumn {
31-
vector<string> path;
31+
vector<Identifier> path;
3232
const ArrowType *leaf_type;
3333
};
3434

35-
ResolvedColumn ResolveColumn(const Expression &expr, const vector<string> &root_path, const ArrowType *root_type) {
35+
ResolvedColumn ResolveColumn(const Expression &expr, const vector<Identifier> &root_path, const ArrowType *root_type) {
3636
if (expr.GetExpressionClass() == ExpressionClass::BOUND_REF) {
3737
return {root_path, root_type};
3838
}
@@ -47,8 +47,8 @@ ResolvedColumn ResolveColumn(const Expression &expr, const vector<string> &root_
4747
ExpressionTypeToString(expr.GetExpressionType()));
4848
}
4949
// Recurse innermost-first so names accumulate root → leaf.
50-
auto inner = ResolveColumn(*func.children[0], root_path, root_type);
51-
inner.path.push_back(StructType::GetChildName(func.children[0]->GetReturnType(), child_idx));
50+
auto inner = ResolveColumn(*func.GetChildren()[0], root_path, root_type);
51+
inner.path.push_back(StructType::GetChildName(func.GetChildren()[0]->GetReturnType(), child_idx));
5252
if (inner.leaf_type) {
5353
inner.leaf_type = &inner.leaf_type->GetTypeInfo<ArrowStructInfo>().GetChild(child_idx);
5454
}
@@ -66,8 +66,8 @@ py::object EmitCompare(FilterBackend &backend, ExpressionType op, py::object col
6666

6767
} // anonymous namespace
6868

69-
py::object TransformExpression(const Expression &expression, const vector<string> &column_path, FilterBackend &backend,
70-
const ArrowType *arrow_type, const string &timezone_config) {
69+
py::object TransformExpression(const Expression &expression, const vector<Identifier> &column_path,
70+
FilterBackend &backend, const ArrowType *arrow_type, const string &timezone_config) {
7171
auto expression_class = expression.GetExpressionClass();
7272
auto expression_type = expression.GetExpressionType();
7373

@@ -93,7 +93,7 @@ py::object TransformExpression(const Expression &expression, const vector<string
9393

9494
auto resolved = ResolveColumn(*column_side, column_path, arrow_type);
9595
auto col = backend.MakeColumnRef(resolved.path);
96-
return EmitCompare(backend, expression_type, std::move(col), constant_side->value, resolved.leaf_type,
96+
return EmitCompare(backend, expression_type, std::move(col), constant_side->GetValue(), resolved.leaf_type,
9797
timezone_config);
9898
}
9999

@@ -102,20 +102,22 @@ py::object TransformExpression(const Expression &expression, const vector<string
102102
// filters no longer have dedicated TableFilter subtypes. They arrive as scalar
103103
// function wrappers inside the ExpressionFilter expression tree (see
104104
// table_filter_functions.hpp).
105-
const auto &func_name = bound_function_expression.function.GetName();
105+
const auto &func_name = bound_function_expression.Function().GetName();
106106

107107
// OPTIONAL / SELECTIVITY_OPTIONAL wrap a child predicate that lives in `bind_info`
108108
// (their `children` hold only a placeholder column ref). An optional filter is never
109109
// required for correctness, so if its child can't be translated we push nothing for
110110
// it rather than failing the whole scan.
111111
if (func_name == OptionalFilterScalarFun::NAME || func_name == SelectivityOptionalFilterScalarFun::NAME) {
112112
optional_ptr<const Expression> child;
113-
if (bound_function_expression.bind_info) {
113+
if (bound_function_expression.BindInfo()) {
114114
if (func_name == OptionalFilterScalarFun::NAME) {
115-
child =
116-
bound_function_expression.bind_info->Cast<OptionalFilterFunctionData>().child_filter_expr.get();
115+
child = bound_function_expression.BindInfo()
116+
->Cast<OptionalFilterFunctionData>()
117+
.child_filter_expr.get();
117118
} else {
118-
child = bound_function_expression.bind_info->Cast<SelectivityOptionalFilterFunctionData>()
119+
child = bound_function_expression.BindInfo()
120+
->Cast<SelectivityOptionalFilterFunctionData>()
119121
.child_filter_expr.get();
120122
}
121123
}
@@ -140,24 +142,24 @@ py::object TransformExpression(const Expression &expression, const vector<string
140142
if (expression_class == ExpressionClass::BOUND_OPERATOR) {
141143
auto &op_expr = expression.Cast<BoundOperatorExpression>();
142144
if (expression_type == ExpressionType::OPERATOR_IS_NULL) {
143-
auto resolved = ResolveColumn(*op_expr.children[0], column_path, arrow_type);
145+
auto resolved = ResolveColumn(*op_expr.GetChildren()[0], column_path, arrow_type);
144146
auto col = backend.MakeColumnRef(resolved.path);
145147
return backend.IsNull(std::move(col));
146148
}
147149
if (expression_type == ExpressionType::OPERATOR_IS_NOT_NULL) {
148-
auto resolved = ResolveColumn(*op_expr.children[0], column_path, arrow_type);
150+
auto resolved = ResolveColumn(*op_expr.GetChildren()[0], column_path, arrow_type);
149151
auto col = backend.MakeColumnRef(resolved.path);
150152
return backend.IsNotNull(std::move(col));
151153
}
152154
if (expression_type == ExpressionType::COMPARE_IN) {
153-
auto resolved = ResolveColumn(*op_expr.children[0], column_path, arrow_type);
155+
auto resolved = ResolveColumn(*op_expr.GetChildren()[0], column_path, arrow_type);
154156
auto col = backend.MakeColumnRef(resolved.path);
155157
vector<Value> values;
156-
for (idx_t i = 1; i < op_expr.children.size(); i++) {
157-
auto &const_expr = op_expr.children[i]->Cast<BoundConstantExpression>();
158-
values.push_back(const_expr.value);
158+
for (idx_t i = 1; i < op_expr.GetChildren().size(); i++) {
159+
auto &const_expr = op_expr.GetChildren()[i]->Cast<BoundConstantExpression>();
160+
values.push_back(const_expr.GetValue());
159161
}
160-
auto col_type = op_expr.children[0]->GetReturnType();
162+
auto col_type = op_expr.GetChildren()[0]->GetReturnType();
161163
return backend.IsIn(std::move(col), values, col_type, timezone_config);
162164
}
163165
}
@@ -167,9 +169,9 @@ py::object TransformExpression(const Expression &expression, const vector<string
167169
const bool is_and = expression_type == ExpressionType::CONJUNCTION_AND;
168170
auto &conj_expr = expression.Cast<BoundConjunctionExpression>();
169171
py::object result = py::none();
170-
for (idx_t i = 0; i < conj_expr.children.size(); i++) {
172+
for (idx_t i = 0; i < conj_expr.GetChildren().size(); i++) {
171173
py::object child_expression =
172-
TransformExpression(*conj_expr.children[i], column_path, backend, arrow_type, timezone_config);
174+
TransformExpression(*conj_expr.GetChildren()[i], column_path, backend, arrow_type, timezone_config);
173175
if (child_expression.is(py::none())) {
174176
if (is_and) {
175177
// A conjunct we can't push can simply be dropped: the remaining AND
@@ -198,7 +200,7 @@ py::object TransformExpression(const Expression &expression, const vector<string
198200
ExpressionClassToString(expression_class));
199201
}
200202

201-
py::object TransformFilter(const TableFilter &filter, vector<string> column_path, FilterBackend &backend,
203+
py::object TransformFilter(const TableFilter &filter, const vector<Identifier> &column_path, FilterBackend &backend,
202204
const ArrowType *arrow_type, const string &timezone_config) {
203205
switch (filter.filter_type) {
204206
case TableFilterType::EXPRESSION_FILTER: {

src/duckdb_py/arrow/polars_filter_pushdown.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,12 @@ struct PolarsBackend : public FilterBackend {
1414
: client_properties(client_properties_p), import_cache(*DuckDBPyConnection::ImportCache()) {
1515
}
1616

17-
py::object MakeColumnRef(const vector<string> &path) override {
17+
py::object MakeColumnRef(const vector<Identifier> &path) override {
1818
// pl.col(path[0]).struct.field(path[1]).struct.field(...) — polars supports arbitrary
1919
// chaining for nested struct access, verified empirically up to 3 levels.
2020
py::object col = import_cache.polars.col()(path[0].GetIdentifierName());
2121
for (idx_t i = 1; i < path.size(); i++) {
22-
col = col.attr("struct").attr("field")(path[i]);
22+
col = col.attr("struct").attr("field")(path[i].GetIdentifierName());
2323
}
2424
return col;
2525
}
@@ -131,7 +131,7 @@ py::object PolarsFilterPushdown::TransformFilter(const TableFilterSet &filter_co
131131
auto &column_name = columns[column_idx];
132132
D_ASSERT(columns.find(column_idx) != columns.end());
133133

134-
vector<string> column_path = {column_name};
134+
vector<Identifier> column_path = {Identifier(column_name)};
135135
// Polars does not need ArrowType information — `nullptr` here propagates through the
136136
// shared walker; the PolarsBackend ignores the parameter in MakeScalar.
137137
py::object child_expression = duckdb::TransformFilter(entry.Filter(), std::move(column_path), backend, nullptr,

src/duckdb_py/arrow/pyarrow_filter_pushdown.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -187,8 +187,11 @@ struct PyArrowBackend : public FilterBackend {
187187
dataset_scalar = import_cache.pyarrow.dataset().attr("scalar");
188188
}
189189

190-
py::object MakeColumnRef(const vector<string> &path) override {
191-
return field_factory(py::tuple(py::cast(path)));
190+
py::object MakeColumnRef(const vector<Identifier> &path) override {
191+
vector<string> str_path;
192+
std::transform(path.begin(), path.end(), std::back_inserter(str_path),
193+
[](const Identifier &segment) { return segment.GetIdentifierName(); });
194+
return field_factory(py::tuple(py::cast(str_path)));
192195
}
193196

194197
py::object MakeScalar(const Value &v, const ArrowType *arrow_type, const string &timezone_config) override {
@@ -283,7 +286,7 @@ py::object PyArrowFilterPushdown::TransformFilter(TableFilterSet &filter_collect
283286
auto &column_name = columns[column_idx];
284287
D_ASSERT(columns.find(column_idx) != columns.end());
285288

286-
vector<string> column_path = {column_name};
289+
vector<Identifier> column_path = {Identifier(column_name)};
287290
auto &arrow_type = arrow_table.GetColumns().at(filter_to_col.at(column_idx));
288291
py::object child_expression = duckdb::TransformFilter(entry.Filter(), std::move(column_path), backend,
289292
arrow_type.get(), config.time_zone);

src/duckdb_py/duckdb_python.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
#include "duckdb_python/pystatement.hpp"
1010
#include "duckdb_python/pyrelation.hpp"
1111
#include "duckdb_python/expression/pyexpression.hpp"
12-
#include "duckdb_python/pyresult.hpp"
1312
#include "duckdb_python/pybind11/exceptions.hpp"
1413
#include "duckdb_python/typing.hpp"
1514
#include "duckdb_python/functional.hpp"
@@ -22,8 +21,6 @@
2221
#include "duckdb/common/enums/statement_type.hpp"
2322
#include "duckdb/common/adbc/adbc-init.hpp"
2423

25-
#include "duckdb.hpp"
26-
2724
#ifndef DUCKDB_PYTHON_LIB_NAME
2825
#define DUCKDB_PYTHON_LIB_NAME _duckdb
2926
#endif
@@ -126,7 +123,7 @@ static void InitializeConnectionMethods(py::module_ &m) {
126123
py::arg("connection") = py::none());
127124
m.def(
128125
"get_profiling_information",
129-
[](const py::str &format, shared_ptr<DuckDBPyConnection> conn = nullptr) {
126+
[](const std::string &format, shared_ptr<DuckDBPyConnection> conn = nullptr) {
130127
if (!conn) {
131128
conn = DuckDBPyConnection::DefaultConnection();
132129
}

src/duckdb_py/include/duckdb_python/arrow/filter_pushdown_visitor.hpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ struct FilterBackend {
3838
// Build a column expression from an accumulated path. `path` always has
3939
// at least one element (the top-level column). For nested struct
4040
// references the path accumulates one entry per `struct_extract`.
41-
virtual py::object MakeColumnRef(const vector<string> &path) = 0;
41+
virtual py::object MakeColumnRef(const vector<Identifier> &path) = 0;
4242

4343
// Convert a DuckDB Value to a backend-native Python scalar. `arrow_type`
4444
// may be nullptr for backends that don't need Arrow type information
@@ -77,7 +77,7 @@ struct FilterBackend {
7777
// - `arrow_type` is the ArrowType for the current path leaf (nullable for
7878
// backends that don't track Arrow types).
7979
// - Returns `py::none()` if no part of the filter could be pushed.
80-
py::object TransformFilter(const TableFilter &filter, vector<string> column_path, FilterBackend &backend,
80+
py::object TransformFilter(const TableFilter &filter, const vector<Identifier> &column_path, FilterBackend &backend,
8181
const ArrowType *arrow_type, const string &timezone_config);
8282

8383
// Walk a bound Expression tree (the contents of an `ExpressionFilter`) and emit
@@ -88,7 +88,7 @@ py::object TransformFilter(const TableFilter &filter, vector<string> column_path
8888
// and the internal runtime filter functions (dynamic / bloom / perfect-hash-join
8989
// / prefix-range, which are skipped). Returns `py::none()` for an optional or
9090
// runtime filter that can't be pushed.
91-
py::object TransformExpression(const Expression &expression, const vector<string> &column_path, FilterBackend &backend,
92-
const ArrowType *arrow_type, const string &timezone_config);
91+
py::object TransformExpression(const Expression &expression, const vector<Identifier> &column_path,
92+
FilterBackend &backend, const ArrowType *arrow_type, const string &timezone_config);
9393

9494
} // namespace duckdb

src/duckdb_py/include/duckdb_python/numpy/numpy_bind.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ struct PandasColumnBindData;
99
class ClientContext;
1010

1111
struct NumpyBind {
12-
static void Bind(const ClientContext &config, py::handle df, vector<PandasColumnBindData> &out,
12+
static void Bind(ClientContext &config, py::handle df, vector<PandasColumnBindData> &out,
1313
vector<LogicalType> &return_types, vector<string> &names);
1414
};
1515

src/duckdb_py/include/duckdb_python/numpy/numpy_scan.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@ namespace duckdb {
88
struct PandasColumnBindData;
99

1010
struct NumpyScan {
11-
static void Scan(PandasColumnBindData &bind_data, idx_t count, idx_t offset, Vector &out);
12-
static void ScanObjectColumn(PyObject **col, idx_t stride, idx_t count, idx_t offset, Vector &out);
11+
static void Scan(ClientContext &context, PandasColumnBindData &bind_data, idx_t count, idx_t offset, Vector &out);
12+
static void ScanObjectColumn(ClientContext &context, PyObject **col, idx_t stride, idx_t count, idx_t offset,
13+
Vector &out);
1314
};
1415

1516
} // namespace duckdb

src/duckdb_py/include/duckdb_python/pandas/pandas_analyzer.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,13 @@
1212
#include "duckdb/main/config.hpp"
1313
#include "duckdb_python/pybind11/pybind_wrapper.hpp"
1414
#include "duckdb_python/pybind11/gil_wrapper.hpp"
15-
#include "duckdb_python/numpy/numpy_type.hpp"
1615
#include "duckdb_python/python_conversion.hpp"
1716

1817
namespace duckdb {
1918

2019
class PandasAnalyzer {
2120
public:
22-
explicit PandasAnalyzer(const ClientContext &context) {
21+
explicit PandasAnalyzer(ClientContext &context) : context(context) {
2322
analyzed_type = LogicalType::SQLNULL;
2423

2524
Value result;
@@ -48,6 +47,7 @@ class PandasAnalyzer {
4847
PythonGILWrapper gil;
4948
//! The resulting analyzed type
5049
LogicalType analyzed_type;
50+
ClientContext &context;
5151
};
5252

5353
} // namespace duckdb

src/duckdb_py/include/duckdb_python/pandas/pandas_bind.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ struct PandasColumnBindData {
2727
};
2828

2929
struct Pandas {
30-
static void Bind(const ClientContext &config, py::handle df, vector<PandasColumnBindData> &out,
30+
static void Bind(ClientContext &config, py::handle df, vector<PandasColumnBindData> &out,
3131
vector<LogicalType> &return_types, vector<string> &names);
3232
};
3333

src/duckdb_py/include/duckdb_python/pandas/pandas_scan.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ struct PandasScanFunction : public TableFunction {
5151
// Helper function that transforms pandas df names to make them work with our binder
5252
static py::object PandasReplaceCopiedNames(const py::object &original_df);
5353

54-
static void PandasBackendScanSwitch(PandasColumnBindData &bind_data, idx_t count, idx_t offset, Vector &out);
54+
static void PandasBackendScanSwitch(ClientContext &context, PandasColumnBindData &bind_data, idx_t count,
55+
idx_t offset, Vector &out);
5556

5657
static void PandasSerialize(Serializer &serializer, const optional_ptr<FunctionData> bind_data,
5758
const TableFunction &function);

0 commit comments

Comments
 (0)