Skip to content

Commit 8c7912f

Browse files
authored
Add strict NotEqualTo/NotIn null and NaN tests (#3547)
1 parent d0a9b91 commit 8c7912f

1 file changed

Lines changed: 86 additions & 36 deletions

File tree

tests/expressions/test_evaluator.py

Lines changed: 86 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -1152,6 +1152,48 @@ def test_strict_some_nulls(strict_data_file_schema: Schema, strict_data_file_2:
1152 1152
assert not should_read, "Should not match: equal on some nulls column"
1153 1153

1154 1154

1155+
def test_strict_not_equal_and_not_in_with_mixed_nulls_and_matching_bounds() -> None:
1156+
schema = Schema(NestedField(1, "x", IntegerType(), required=False))
1157+
data_file = DataFile.from_args(
1158+
file_path="file.parquet",
1159+
file_format=FileFormat.PARQUET,
1160+
partition={},
1161+
record_count=2,
1162+
file_size_in_bytes=1,
1163+
value_counts={1: 2},
1164+
null_value_counts={1: 1},
1165+
nan_value_counts=None,
1166+
lower_bounds={1: to_bytes(IntegerType(), 5)},
1167+
upper_bounds={1: to_bytes(IntegerType(), 5)},
1168+
)
1169+
1170+
should_read = _StrictMetricsEvaluator(schema, NotEqualTo("x", 5)).eval(data_file)
1171+
assert should_read == ROWS_MIGHT_NOT_MATCH, "Should not match: bounds prove the non-null value is 5"
1172+
1173+
should_read = _StrictMetricsEvaluator(schema, NotIn("x", {5, 6})).eval(data_file)
1174+
assert should_read == ROWS_MIGHT_NOT_MATCH, "Should not match: bounds prove the non-null value is 5"
1175+
1176+
1177+
def test_strict_not_equal_and_not_in_with_all_nulls() -> None:
1178+
schema = Schema(NestedField(1, "x", IntegerType(), required=False))
1179+
data_file = DataFile.from_args(
1180+
file_path="file.parquet",
1181+
file_format=FileFormat.PARQUET,
1182+
partition={},
1183+
record_count=2,
1184+
file_size_in_bytes=1,
1185+
value_counts={1: 2},
1186+
null_value_counts={1: 2},
1187+
nan_value_counts=None,
1188+
)
1189+
1190+
should_read = _StrictMetricsEvaluator(schema, NotEqualTo("x", 5)).eval(data_file)
1191+
assert should_read == ROWS_MUST_MATCH, "Should match: notEqual on all-null column"
1192+
1193+
should_read = _StrictMetricsEvaluator(schema, NotIn("x", {5, 6})).eval(data_file)
1194+
assert should_read == ROWS_MUST_MATCH, "Should match: notIn on all-null column"
1195+
1196+
1155 1197
def test_strict_is_nan(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
1156 1198
should_read = _StrictMetricsEvaluator(strict_data_file_schema, IsNaN("all_nans")).eval(strict_data_file_1)
1157 1199
assert should_read, "Should match: all values are nan"
@@ -1198,6 +1240,50 @@ def test_strict_not_nan(strict_data_file_schema: Schema, strict_data_file_1: Dat
1198 1240
assert not should_read, "Should not match: null values are not nan"
1199 1241

1200 1242

1243+
@pytest.mark.parametrize("field_type", [FloatType(), DoubleType()])
1244+
def test_strict_not_equal_and_not_in_with_mixed_nans_and_matching_bounds(field_type: PrimitiveType) -> None:
1245+
schema = Schema(NestedField(1, "x", field_type, required=False))
1246+
data_file = DataFile.from_args(
1247+
file_path="file.parquet",
1248+
file_format=FileFormat.PARQUET,
1249+
partition={},
1250+
record_count=2,
1251+
file_size_in_bytes=1,
1252+
value_counts={1: 2},
1253+
null_value_counts={1: 0},
1254+
nan_value_counts={1: 1},
1255+
lower_bounds={1: to_bytes(field_type, 5.0)},
1256+
upper_bounds={1: to_bytes(field_type, 5.0)},
1257+
)
1258+
1259+
should_read = _StrictMetricsEvaluator(schema, NotEqualTo("x", 5.0)).eval(data_file)
1260+
assert should_read == ROWS_MIGHT_NOT_MATCH, "Should not match: bounds prove the non-NaN value is 5.0"
1261+
1262+
should_read = _StrictMetricsEvaluator(schema, NotIn("x", {5.0, 6.0})).eval(data_file)
1263+
assert should_read == ROWS_MIGHT_NOT_MATCH, "Should not match: bounds prove the non-NaN value is 5.0"
1264+
1265+
1266+
@pytest.mark.parametrize("field_type", [FloatType(), DoubleType()])
1267+
def test_strict_not_equal_and_not_in_with_all_nans(field_type: PrimitiveType) -> None:
1268+
schema = Schema(NestedField(1, "x", field_type, required=False))
1269+
data_file = DataFile.from_args(
1270+
file_path="file.parquet",
1271+
file_format=FileFormat.PARQUET,
1272+
partition={},
1273+
record_count=2,
1274+
file_size_in_bytes=1,
1275+
value_counts={1: 2},
1276+
null_value_counts={1: 0},
1277+
nan_value_counts={1: 2},
1278+
)
1279+
1280+
should_read = _StrictMetricsEvaluator(schema, NotEqualTo("x", 5.0)).eval(data_file)
1281+
assert should_read == ROWS_MUST_MATCH, "Should match: notEqual on all-NaN column"
1282+
1283+
should_read = _StrictMetricsEvaluator(schema, NotIn("x", {5.0, 6.0})).eval(data_file)
1284+
assert should_read == ROWS_MUST_MATCH, "Should match: notIn on all-NaN column"
1285+
1286+
1201 1287
def test_strict_required_column(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
1202 1288
should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotNull("required")).eval(strict_data_file_1)
1203 1289
assert should_read, "Should match: required columns are always non-null"
@@ -1529,42 +1615,6 @@ def test_strict_integer_not_in(strict_data_file_schema: Schema, strict_data_file
1529 1615
assert not should_read, "Should not match: no_nulls field does not have bounds"
1530 1616

1531 1617

1532-
def test_strict_not_eq_partial_nulls_within_bounds() -> None:
1533-
# Regression test for https://github.com/apache/iceberg-python/issues/3498
1534-
# A column that contains *some* nulls (but not only nulls) whose bounds still cover the
1535-
# literal must not be reported as ROWS_MUST_MATCH: the non-null value equal to the literal
1536-
# does not satisfy the predicate. Reporting a match here lets _DeleteFiles drop the whole
1537-
# data file and silently lose the row that should have survived the delete.
1538-
schema = Schema(NestedField(1, "x", IntegerType(), required=False))
1539-
data_file = DataFile.from_args(
1540-
file_path="file.parquet",
1541-
file_format=FileFormat.PARQUET,
1542-
partition=Record(),
1543-
record_count=2,
1544-
value_counts={1: 2},
1545-
null_value_counts={1: 1}, # one null, one non-null -> not "nulls only"
1546-
nan_value_counts={},
1547-
lower_bounds={1: to_bytes(IntegerType(), 5)},
1548-
upper_bounds={1: to_bytes(IntegerType(), 5)}, # the only non-null value is 5
1549-
)
1550-
1551-
assert not _StrictMetricsEvaluator(schema, NotEqualTo("x", 5)).eval(data_file), (
1552-
"Should not match: the non-null value 5 does not satisfy x != 5"
1553-
)
1554-
assert not _StrictMetricsEvaluator(schema, NotIn("x", {5})).eval(data_file), (
1555-
"Should not match: the non-null value 5 is in {5}"
1556-
)
1557-
1558-
# The literal sits outside the bounds, so every non-null value satisfies the predicate and
1559-
# the remaining nulls/NaNs also satisfy it -> the whole file matches.
1560-
assert _StrictMetricsEvaluator(schema, NotEqualTo("x", 6)).eval(data_file), (
1561-
"Should match: no value equals 6 and nulls satisfy x != 6"
1562-
)
1563-
assert _StrictMetricsEvaluator(schema, NotIn("x", {6})).eval(data_file), (
1564-
"Should match: no value is in {6} and nulls satisfy not-in"
1565-
)
1566-
1567-
1568 1618
@pytest.mark.parametrize(
1569 1619
"file_type, evolved_type, lower_bound, upper_bound, op, lit, expected",
1570 1620
[

0 commit comments

Comments (0)