Skip to content

Commit ffd1f91

Browse files
committed
Fix strict metrics evaluator over-pruning files for NotEqualTo/NotIn with partial nulls
_StrictMetricsEvaluator.visit_not_equal and visit_not_in short-circuited on _can_contain_nulls / _can_contain_nans (null/NaN count > 0) and returned ROWS_MUST_MATCH without checking the value bounds. A file holding any null or NaN was therefore reported as fully matching the predicate, even when a non-null value inside the bounds did not match. This drives _DeleteFiles (table/update/snapshot.py): ROWS_MUST_MATCH drops the whole data file without rewriting it. So delete(NotEqualTo("x", 5)) against a file with stats [null, 5] and bounds lower=upper=5 would delete the entire file, silently losing the row with value 5 that should have survived. Every other strict ROWS_MUST_MATCH path already guards on the "only" variants (_contains_nulls_only / _contains_nans_only), matching the reference StrictMetricsEvaluator. Switch both methods to the same guard so that an all-null/all-NaN column still short-circuits to ROWS_MUST_MATCH (those rows satisfy not-equal/not-in), while a partially-null column falls through to the bounds check. Update the existing NotIn-on-some-nulls test that encoded the buggy result and add a regression test covering the [null, value] / bounds-include-literal case for both NotEqualTo and NotIn. Fixes #3498 (partially)
1 parent d101879 commit ffd1f91

2 files changed

Lines changed: 45 additions & 3 deletions

File tree

pyiceberg/expressions/visitors.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1668,7 +1668,10 @@ def visit_not_equal(self, term: BoundTerm, literal: LiteralValue) -> bool:
16681668
# Rows must match when X < Min or Max < X because it is not in the range
16691669
field_id = term.ref().field.field_id
16701670

1671-
if self._can_contain_nulls(field_id) or self._can_contain_nans(field_id):
1671+
# When the column is entirely null or entirely NaN there is no value equal to the
1672+
# literal, so every row satisfies the predicate. A merely partial presence of nulls
1673+
# or NaNs does not guarantee this, so we must fall through to the bounds check below.
1674+
if self._contains_nulls_only(field_id) or self._contains_nans_only(field_id):
16721675
return ROWS_MUST_MATCH
16731676

16741677
field = self._get_field(field_id)
@@ -1728,7 +1731,10 @@ def visit_in(self, term: BoundTerm, literals: set[L]) -> bool:
17281731
def visit_not_in(self, term: BoundTerm, literals: set[L]) -> bool:
17291732
field_id = term.ref().field.field_id
17301733

1731-
if self._can_contain_nulls(field_id) or self._can_contain_nans(field_id):
1734+
# When the column is entirely null or entirely NaN none of the values are in the set,
1735+
# so every row satisfies the predicate. A merely partial presence of nulls or NaNs does
1736+
# not guarantee this, so we must fall through to the bounds check below.
1737+
if self._contains_nulls_only(field_id) or self._contains_nans_only(field_id):
17321738
return ROWS_MUST_MATCH
17331739

17341740
field = self._get_field(field_id)

tests/expressions/test_evaluator.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1523,12 +1523,48 @@ def test_strict_integer_not_in(strict_data_file_schema: Schema, strict_data_file
15231523
assert should_read, "Should match: notIn on all nulls column"
15241524

15251525
should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotIn("some_nulls", {"abc", "def"})).eval(strict_data_file_1)
1526-
assert should_read, "Should match: notIn on some nulls column, 'bbb' > 'abc' and 'bbb' < 'def'"
1526+
assert not should_read, "Should not match: some non-null value may be == 'def' which is within bounds ['bbb', 'eee']"
15271527

15281528
should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotIn("no_nulls", {"abc", "def"})).eval(strict_data_file_1)
15291529
assert not should_read, "Should not match: no_nulls field does not have bounds"
15301530

15311531

1532+
def test_strict_not_eq_partial_nulls_within_bounds() -> None:
1533+
# Regression test for https://github.com/apache/iceberg-python/issues/3498
1534+
# A column that contains *some* nulls (but not only nulls) whose bounds still cover the
1535+
# literal must not be reported as ROWS_MUST_MATCH: the non-null value equal to the literal
1536+
# does not satisfy the predicate. Reporting a match here lets _DeleteFiles drop the whole
1537+
# data file and silently lose the row that should have survived the delete.
1538+
schema = Schema(NestedField(1, "x", IntegerType(), required=False))
1539+
data_file = DataFile.from_args(
1540+
file_path="file.parquet",
1541+
file_format=FileFormat.PARQUET,
1542+
partition=Record(),
1543+
record_count=2,
1544+
value_counts={1: 2},
1545+
null_value_counts={1: 1}, # one null, one non-null -> not "nulls only"
1546+
nan_value_counts={},
1547+
lower_bounds={1: to_bytes(IntegerType(), 5)},
1548+
upper_bounds={1: to_bytes(IntegerType(), 5)}, # the only non-null value is 5
1549+
)
1550+
1551+
assert not _StrictMetricsEvaluator(schema, NotEqualTo("x", 5)).eval(data_file), (
1552+
"Should not match: the non-null value 5 does not satisfy x != 5"
1553+
)
1554+
assert not _StrictMetricsEvaluator(schema, NotIn("x", {5})).eval(data_file), (
1555+
"Should not match: the non-null value 5 is in {5}"
1556+
)
1557+
1558+
# The literal sits outside the bounds, so every non-null value satisfies the predicate and
1559+
# the remaining nulls/NaNs also satisfy it -> the whole file matches.
1560+
assert _StrictMetricsEvaluator(schema, NotEqualTo("x", 6)).eval(data_file), (
1561+
"Should match: no value equals 6 and nulls satisfy x != 6"
1562+
)
1563+
assert _StrictMetricsEvaluator(schema, NotIn("x", {6})).eval(data_file), (
1564+
"Should match: no value is in {6} and nulls satisfy not-in"
1565+
)
1566+
1567+
15321568
@pytest.mark.parametrize(
15331569
"file_type, evolved_type, lower_bound, upper_bound, op, lit, expected",
15341570
[

0 commit comments

Comments
 (0)