Skip to content

Commit 6b23153

Browse files
committed
finish expression filter integration
1 parent 620d84c commit 6b23153

4 files changed

Lines changed: 92 additions & 36 deletions

File tree

src/duckdb_py/arrow/arrow_array_stream.cpp

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -73,18 +73,20 @@ unique_ptr<ArrowArrayStreamWrapper> PythonTableArrowArrayStreamFactory::Produce(
7373
auto filters = parameters.filters;
7474
bool filters_pushed = false;
7575

76-
// Translate DuckDB filters to Polars expressions and push into the lazy plan
76+
// Translate DuckDB filters to Polars expressions and push into the lazy plan.
77+
// The walker only fails (throws / returns py::none()) for filters that are not
78+
// required for correctness — optional/runtime wrappers it skips, or shapes the
79+
// optimizer keeps above the scan. A throw here would mean the optimizer fully
80+
// pushed something we can't translate (a correctness bug), so we let it surface
81+
// rather than silently returning unfiltered rows — the arrow scan does not
82+
// re-apply pushed filters. Mirrors the pyarrow ProduceScanner path.
7783
if (filters && filters->HasFilters()) {
78-
try {
79-
auto filter_expr = PolarsFilterPushdown::TransformFilter(
80-
*filters, parameters.projected_columns.projection_map, parameters.projected_columns.filter_to_col,
81-
factory->client_properties);
82-
if (!filter_expr.is(py::none())) {
83-
lf = lf.attr("filter")(filter_expr);
84-
filters_pushed = true;
85-
}
86-
} catch (...) {
87-
// Fallback: DuckDB handles filtering post-scan
84+
auto filter_expr = PolarsFilterPushdown::TransformFilter(
85+
*filters, parameters.projected_columns.projection_map, parameters.projected_columns.filter_to_col,
86+
factory->client_properties);
87+
if (!filter_expr.is(py::none())) {
88+
lf = lf.attr("filter")(filter_expr);
89+
filters_pushed = true;
8890
}
8991
}
9092

src/duckdb_py/arrow/filter_pushdown_visitor.cpp

Lines changed: 51 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,8 @@
66
#include "duckdb/planner/expression/bound_constant_expression.hpp"
77
#include "duckdb/planner/expression/bound_function_expression.hpp"
88
#include "duckdb/planner/expression/bound_operator_expression.hpp"
9-
#include "duckdb/planner/expression/bound_reference_expression.hpp"
10-
#include "duckdb/planner/filter/conjunction_filter.hpp"
11-
#include "duckdb/planner/filter/constant_filter.hpp"
129
#include "duckdb/planner/filter/expression_filter.hpp"
13-
#include "duckdb/planner/filter/in_filter.hpp"
14-
#include "duckdb/planner/filter/optional_filter.hpp"
15-
#include "duckdb/planner/filter/struct_filter.hpp"
10+
#include "duckdb/planner/filter/table_filter_functions.hpp"
1611

1712
namespace duckdb {
1813

@@ -101,6 +96,45 @@ py::object TransformExpression(const Expression &expression, const vector<string
10196
return EmitCompare(backend, expression_type, std::move(col), constant_side->value, resolved.leaf_type,
10297
timezone_config);
10398
}
99+
100+
// Internal table-filter functions. Since the table-filter -> expression-filter
101+
// migration in core, optional / dynamic / bloom / perfect-hash-join / prefix-range
102+
// filters no longer have dedicated TableFilter subtypes. They arrive as scalar
103+
// function wrappers inside the ExpressionFilter expression tree (see
104+
// table_filter_functions.hpp).
105+
const auto &func_name = bound_function_expression.function.GetName();
106+
107+
// OPTIONAL / SELECTIVITY_OPTIONAL wrap a child predicate that lives in `bind_info`
108+
// (their `children` hold only a placeholder column ref). An optional filter is never
109+
// required for correctness, so if its child can't be translated we push nothing for
110+
// it rather than failing the whole scan.
111+
if (func_name == OptionalFilterScalarFun::NAME || func_name == SelectivityOptionalFilterScalarFun::NAME) {
112+
optional_ptr<const Expression> child;
113+
if (bound_function_expression.bind_info) {
114+
if (func_name == OptionalFilterScalarFun::NAME) {
115+
child =
116+
bound_function_expression.bind_info->Cast<OptionalFilterFunctionData>().child_filter_expr.get();
117+
} else {
118+
child = bound_function_expression.bind_info->Cast<SelectivityOptionalFilterFunctionData>()
119+
.child_filter_expr.get();
120+
}
121+
}
122+
if (!child) {
123+
return py::none();
124+
}
125+
try {
126+
return TransformExpression(*child, column_path, backend, arrow_type, timezone_config);
127+
} catch (const NotImplementedException &) {
128+
return py::none();
129+
}
130+
}
131+
132+
// DYNAMIC / BLOOM / PERFECT_HASH_JOIN / PREFIX_RANGE are runtime filters with no
133+
// static pyarrow/polars equivalent. They are not required for correctness (the
134+
// engine applies them above the scan), so skip them.
135+
if (TableFilterFunctions::IsTableFilterFunction(func_name)) {
136+
return py::none();
137+
}
104138
}
105139

106140
if (expression_class == ExpressionClass::BOUND_OPERATOR) {
@@ -130,19 +164,27 @@ py::object TransformExpression(const Expression &expression, const vector<string
130164

131165
if (expression_class == ExpressionClass::BOUND_CONJUNCTION) {
132166
if (expression_type == ExpressionType::CONJUNCTION_OR || expression_type == ExpressionType::CONJUNCTION_AND) {
167+
const bool is_and = expression_type == ExpressionType::CONJUNCTION_AND;
133168
auto &conj_expr = expression.Cast<BoundConjunctionExpression>();
134169
py::object result = py::none();
135170
for (idx_t i = 0; i < conj_expr.children.size(); i++) {
136171
py::object child_expression =
137172
TransformExpression(*conj_expr.children[i], column_path, backend, arrow_type, timezone_config);
138173
if (child_expression.is(py::none())) {
139-
// An OR branch that can't be translated (e.g. DYNAMIC_FILTER) means the pushed-down
140-
// predicate would be stricter than the engine intends — fall back to no pushdown.
174+
if (is_and) {
175+
// A conjunct we can't push can simply be dropped: the remaining AND
176+
// terms still form a correct (if weaker) filter, and the engine
177+
// re-applies the rest above the scan.
178+
continue;
179+
}
180+
// An OR branch that can't be translated (e.g. a dynamic filter) would
181+
// make the pushed-down predicate stricter than the engine intends —
182+
// fall back to no pushdown for the whole disjunction.
141183
return py::none();
142184
}
143185
if (result.is(py::none())) {
144186
result = std::move(child_expression);
145-
} else if (expression_type == ExpressionType::CONJUNCTION_AND) {
187+
} else if (is_and) {
146188
result = backend.And(std::move(result), std::move(child_expression));
147189
} else {
148190
result = backend.Or(std::move(result), std::move(child_expression));

src/duckdb_py/include/duckdb_python/arrow/filter_pushdown_visitor.hpp

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,22 @@
1616

1717
namespace duckdb {
1818

19-
// A FilterBackend abstracts the Python side of a `TableFilter` → expression
20-
// translation. The shared walker in this file handles the structural
21-
// recursion (CONJUNCTION_AND/OR, STRUCT_EXTRACT, OPTIONAL_FILTER, the
22-
// ExpressionFilter sub-walker) and dispatches leaf operations to the backend.
19+
// A FilterBackend abstracts the Python side of an `ExpressionFilter` →
20+
// expression translation. The shared walker in this file handles the
21+
// structural recursion (CONJUNCTION_AND/OR, struct_extract column paths, the
22+
// optional / selectivity-optional filter wrappers, and the internal runtime
23+
// filter functions) and dispatches leaf operations to the backend.
2324
//
2425
// Two backends exist today: PyArrowBackend (emits pyarrow.dataset.Expression)
2526
// and PolarsBackend (emits polars.Expr). Adding a new backend is purely a
2627
// matter of implementing this interface; the walker itself is reused.
2728
//
2829
// Convention: a backend method that cannot push the given filter must throw
29-
// `NotImplementedException`. The walker catches it at `OPTIONAL_FILTER`
30-
// boundaries (silently swallow) and at the top-level entry points (returning
31-
// `py::none()` for the affected column). Throwing keeps the "I can't push
32-
// this" path uniform across backends, replacing the old polars walker's ad
33-
// hoc `return py::none()` style.
30+
// `NotImplementedException`. The walker swallows it at optional-filter
31+
// boundaries (an optional filter is not required for correctness) and the
32+
// top-level entry points catch it too, returning `py::none()` for the affected
33+
// column. Throwing keeps the "I can't push this" path uniform across backends,
34+
// replacing the old polars walker's ad hoc `return py::none()` style.
3435
struct FilterBackend {
3536
virtual ~FilterBackend() = default;
3637

@@ -68,19 +69,25 @@ struct FilterBackend {
6869
virtual py::object Or(py::object a, py::object b) = 0;
6970
};
7071

71-
// Walk a TableFilter and emit a backend-specific expression.
72-
// - `column_path` is the path accumulated through STRUCT_EXTRACT.
72+
// Walk a TableFilter and emit a backend-specific expression. Since the
73+
// table-filter -> expression-filter migration in core, the only runtime filter
74+
// type is `EXPRESSION_FILTER`; this unwraps it and walks the expression tree.
75+
// - `column_path` is the top-level column name; struct paths are accumulated
76+
// inside the expression walk via struct_extract.
7377
// - `arrow_type` is the ArrowType for the current path leaf (nullable for
7478
// backends that don't track Arrow types).
7579
// - Returns `py::none()` if no part of the filter could be pushed.
7680
py::object TransformFilter(const TableFilter &filter, vector<string> column_path, FilterBackend &backend,
7781
const ArrowType *arrow_type, const string &timezone_config);
7882

79-
// Walk a bound Expression tree (the contents of an `ExpressionFilter`) and
80-
// emit a backend-specific expression. Handles BOUND_FUNCTION (comparisons),
83+
// Walk a bound Expression tree (the contents of an `ExpressionFilter`) and emit
84+
// a backend-specific expression. Handles BOUND_FUNCTION comparisons,
8185
// BOUND_OPERATOR (IS_NULL / IS_NOT_NULL / COMPARE_IN), BOUND_CONJUNCTION
82-
// (AND/OR), and recurses through struct_extract chains to build the column
83-
// path before invoking the backend.
86+
// (AND/OR), struct_extract column chains, the optional / selectivity-optional
87+
// wrappers (unwrapped from `bind_info`; an untranslatable child is swallowed),
88+
// and the internal runtime filter functions (dynamic / bloom / perfect-hash-join
89+
// / prefix-range, which are skipped). Returns `py::none()` for an optional or
90+
// runtime filter that can't be pushed.
8491
py::object TransformExpression(const Expression &expression, const vector<string> &column_path, FilterBackend &backend,
8592
const ArrowType *arrow_type, const string &timezone_config);
8693

tests/fast/arrow/test_filter_pushdown.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,12 @@ def test_or_with_like_does_not_push(self, duckdb_cursor):
190190

191191

192192
class TestInPushdown:
193-
"""IN (...) reaches the walker as ``BOUND_OPERATOR`` with ``COMPARE_IN``."""
193+
"""IN (...) pushdown test.
194+
195+
IN (...) reaches the walker as a ``COMPARE_IN`` ``BOUND_OPERATOR``, wrapped in an
196+
``__internal_tablefilter_optional`` function that the walker must unwrap before it
197+
sees the operator.
198+
"""
194199

195200
def test_basic(self, duckdb_cursor):
196201
duckdb_cursor.execute("CREATE TABLE _t AS SELECT range a FROM range(1000)")

0 commit comments

Comments
 (0)