Skip to content

Commit 9a86633

Browse files
Fix concjunction OR (#465)
Fixes #460 A bug in the conjunction OR pushdown for pyarrow resulted in a too restrictive filter if one of the branches couldn't be translated.
2 parents 1e93bf4 + 3d778de commit 9a86633

3 files changed

Lines changed: 37 additions & 2 deletions

File tree

src/duckdb_py/arrow/pyarrow_filter_pushdown.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -236,15 +236,16 @@ py::object TransformFilterRecursive(TableFilter &filter, vector<string> column_r
236236
auto constant_field = field(py::tuple(py::cast(column_ref)));
237237
return constant_field.attr("is_valid")();
238238
}
239-
//! We do not pushdown or conjunctions yet
240239
case TableFilterType::CONJUNCTION_OR: {
241240
auto &or_filter = filter.Cast<ConjunctionOrFilter>();
242241
py::object expression = py::none();
243242
for (idx_t i = 0; i < or_filter.child_filters.size(); i++) {
244243
auto &child_filter = *or_filter.child_filters[i];
245244
py::object child_expression = TransformFilterRecursive(child_filter, column_ref, timezone_config, type);
246245
if (child_expression.is(py::none())) {
247-
continue;
246+
// An OR branch that can't be translated (e.g. DYNAMIC_FILTER) means the pushed-down
247+
// predicate would be stricter than the engine intends — fall back to no pushdown.
248+
return py::none();
248249
}
249250
if (expression.is(py::none())) {
250251
expression = std::move(child_expression);

tests/fast/arrow/test_filter_pushdown.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,26 @@ def test_dynamic_filter(self, duckdb_cursor):
10211021
res = duckdb_cursor.sql("SELECT a FROM t ORDER BY a LIMIT 11").fetchall()
10221022
assert len(res) == 11
10231023

1024+
def test_dynamic_filter_nulls_first_pyarrow(self, duckdb_cursor):
1025+
# Regression for #460(a): TOP_N with ASC NULLS FIRST pushes an
1026+
# OPTIONAL(x IS NULL OR DYNAMIC_FILTER(x)) into the arrow scan. The
1027+
# pyarrow translation must NOT collapse the OR by dropping the
1028+
# untranslatable DYNAMIC_FILTER branch — doing so produces a
1029+
# stricter (`field("x").is_null()`) predicate that drops every row.
1030+
t = pa.Table.from_pydict({"x": pa.array([3, 1, 2], type=pa.int32())})
1031+
duckdb_cursor.register("src", t)
1032+
res = duckdb_cursor.sql("SELECT * FROM src ORDER BY x ASC NULLS FIRST LIMIT 1").fetchall()
1033+
assert res == [(1,)]
1034+
1035+
def test_dynamic_filter_nulls_first_polars_dataframe(self, duckdb_cursor):
1036+
# pl.DataFrame is materialized to a pyarrow.Table before scanning,
1037+
# so it exercises PyarrowFilterPushdown the same way pa.Table does.
1038+
pl = pytest.importorskip("polars")
1039+
df = pl.DataFrame({"x": [3, 1, 2]})
1040+
duckdb_cursor.register("src", df)
1041+
res = duckdb_cursor.sql("SELECT * FROM src ORDER BY x ASC NULLS FIRST LIMIT 1").fetchall()
1042+
assert res == [(1,)]
1043+
10241044
def test_binary_view_filter(self, duckdb_cursor):
10251045
"""Filters on a view column work (without pushdown because pyarrow does not support view filters yet)."""
10261046
table = pa.table({"col": pa.array([b"abc", b"efg"], type=pa.binary_view())})

tests/fast/arrow/test_polars_filter_pushdown.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,20 @@ def test_optional_filter(self):
147147
result = duckdb.sql("SELECT * FROM lf WHERE a = 1 OR a = 3").fetchall()
148148
assert sorted(result) == [(1,), (3,)]
149149

150+
##### DYNAMIC_FILTER via TOP_N (issue #460(a))
151+
152+
def test_top_n_nulls_first_includes_min(self):
153+
"""ORDER BY x ASC NULLS FIRST LIMIT 1 pushes OPTIONAL(IS_NULL OR DYNAMIC_FILTER) into the scan.
154+
155+
The OR branch must not be partially translated: dropping the
156+
untranslatable DYNAMIC_FILTER child would leave just IS_NULL and
157+
silently discard every non-null row. See pyarrow_filter_pushdown
158+
sibling regression test in test_filter_pushdown.py.
159+
"""
160+
lf = pl.LazyFrame({"x": [3, 1, 2]})
161+
result = duckdb.sql("SELECT * FROM lf ORDER BY x ASC NULLS FIRST LIMIT 1").fetchall()
162+
assert result == [(1,)]
163+
150164
##### Produce path, no filters
151165

152166
def test_unfiltered_scan(self):

0 commit comments

Comments
 (0)