@@ -1152,6 +1152,48 @@ def test_strict_some_nulls(strict_data_file_schema: Schema, strict_data_file_2:
11521152 assert not should_read , "Should not match: equal on some nulls column"
11531153
11541154
def test_strict_not_equal_and_not_in_with_mixed_nulls_and_matching_bounds() -> None:
    """Negated predicates must not strictly match when the bounds pin the value.

    The data file describes two records for the optional integer column ``x``:
    one null and one non-null value whose lower and upper bounds are both 5.
    Both ``x != 5`` and ``x NOT IN {5, 6}`` are expected to evaluate to
    ``ROWS_MIGHT_NOT_MATCH``.
    """
    int_type = IntegerType()
    table_schema = Schema(NestedField(1, "x", int_type, required=False))

    # Lower and upper bound are the same encoded value, so the only non-null
    # value in the column is exactly 5.
    encoded_five = to_bytes(int_type, 5)
    half_null_file = DataFile.from_args(
        file_path="file.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=2,
        file_size_in_bytes=1,
        value_counts={1: 2},
        null_value_counts={1: 1},
        nan_value_counts=None,
        lower_bounds={1: encoded_five},
        upper_bounds={1: encoded_five},
    )

    reason = "Should not match: bounds prove the non-null value is 5"
    for predicate in (NotEqualTo("x", 5), NotIn("x", {5, 6})):
        outcome = _StrictMetricsEvaluator(table_schema, predicate).eval(half_null_file)
        assert outcome == ROWS_MIGHT_NOT_MATCH, reason
1175+
1176+
def test_strict_not_equal_and_not_in_with_all_nulls() -> None:
    """Negated predicates must strictly match a column that holds only nulls.

    The data file describes two records for the optional integer column ``x``
    and reports two nulls, with no bounds supplied. Both ``x != 5`` and
    ``x NOT IN {5, 6}`` are expected to evaluate to ``ROWS_MUST_MATCH``.
    """
    table_schema = Schema(NestedField(1, "x", IntegerType(), required=False))

    # null count equals value count: every value in the column is null.
    all_null_file = DataFile.from_args(
        file_path="file.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=2,
        file_size_in_bytes=1,
        value_counts={1: 2},
        null_value_counts={1: 2},
        nan_value_counts=None,
    )

    cases = (
        (NotEqualTo("x", 5), "Should match: notEqual on all-null column"),
        (NotIn("x", {5, 6}), "Should match: notIn on all-null column"),
    )
    for predicate, reason in cases:
        outcome = _StrictMetricsEvaluator(table_schema, predicate).eval(all_null_file)
        assert outcome == ROWS_MUST_MATCH, reason
1195+
1196+
11551197def test_strict_is_nan (strict_data_file_schema : Schema , strict_data_file_1 : DataFile ) -> None :
11561198 should_read = _StrictMetricsEvaluator (strict_data_file_schema , IsNaN ("all_nans" )).eval (strict_data_file_1 )
11571199 assert should_read , "Should match: all values are nan"
@@ -1198,6 +1240,50 @@ def test_strict_not_nan(strict_data_file_schema: Schema, strict_data_file_1: Dat
11981240 assert not should_read , "Should not match: null values are not nan"
11991241
12001242
@pytest.mark.parametrize("field_type", [FloatType(), DoubleType()])
def test_strict_not_equal_and_not_in_with_mixed_nans_and_matching_bounds(field_type: PrimitiveType) -> None:
    """Negated predicates must not strictly match when the bounds pin the non-NaN value.

    Runs once per floating-point type. The data file describes two records for
    the optional column ``x``: no nulls, one NaN, and lower and upper bounds
    that are both 5.0. Both ``x != 5.0`` and ``x NOT IN {5.0, 6.0}`` are
    expected to evaluate to ``ROWS_MIGHT_NOT_MATCH``.
    """
    table_schema = Schema(NestedField(1, "x", field_type, required=False))

    # Lower and upper bound are the same encoded value, so the only non-NaN
    # value in the column is exactly 5.0.
    encoded_five = to_bytes(field_type, 5.0)
    half_nan_file = DataFile.from_args(
        file_path="file.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=2,
        file_size_in_bytes=1,
        value_counts={1: 2},
        null_value_counts={1: 0},
        nan_value_counts={1: 1},
        lower_bounds={1: encoded_five},
        upper_bounds={1: encoded_five},
    )

    reason = "Should not match: bounds prove the non-NaN value is 5.0"
    for predicate in (NotEqualTo("x", 5.0), NotIn("x", {5.0, 6.0})):
        outcome = _StrictMetricsEvaluator(table_schema, predicate).eval(half_nan_file)
        assert outcome == ROWS_MIGHT_NOT_MATCH, reason
1264+
1265+
@pytest.mark.parametrize("field_type", [FloatType(), DoubleType()])
def test_strict_not_equal_and_not_in_with_all_nans(field_type: PrimitiveType) -> None:
    """Negated predicates must strictly match a column that holds only NaNs.

    Runs once per floating-point type. The data file describes two records for
    the optional column ``x`` and reports zero nulls and two NaNs, with no
    bounds supplied. Both ``x != 5.0`` and ``x NOT IN {5.0, 6.0}`` are expected
    to evaluate to ``ROWS_MUST_MATCH``.
    """
    table_schema = Schema(NestedField(1, "x", field_type, required=False))

    # NaN count equals value count: every value in the column is NaN.
    all_nan_file = DataFile.from_args(
        file_path="file.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=2,
        file_size_in_bytes=1,
        value_counts={1: 2},
        null_value_counts={1: 0},
        nan_value_counts={1: 2},
    )

    cases = (
        (NotEqualTo("x", 5.0), "Should match: notEqual on all-NaN column"),
        (NotIn("x", {5.0, 6.0}), "Should match: notIn on all-NaN column"),
    )
    for predicate, reason in cases:
        outcome = _StrictMetricsEvaluator(table_schema, predicate).eval(all_nan_file)
        assert outcome == ROWS_MUST_MATCH, reason
1285+
1286+
12011287def test_strict_required_column (strict_data_file_schema : Schema , strict_data_file_1 : DataFile ) -> None :
12021288 should_read = _StrictMetricsEvaluator (strict_data_file_schema , NotNull ("required" )).eval (strict_data_file_1 )
12031289 assert should_read , "Should match: required columns are always non-null"
@@ -1529,42 +1615,6 @@ def test_strict_integer_not_in(strict_data_file_schema: Schema, strict_data_file
15291615 assert not should_read , "Should not match: no_nulls field does not have bounds"
15301616
15311617
def test_strict_not_eq_partial_nulls_within_bounds() -> None:
    """Regression test for https://github.com/apache/iceberg-python/issues/3498.

    A column with some (but not only) nulls whose bounds still cover the literal
    must not be reported as a strict match: the non-null value equal to the
    literal fails the predicate. Reporting a match would let _DeleteFiles drop
    the whole data file and silently lose the row that should survive the delete.
    """
    int_type = IntegerType()
    table_schema = Schema(NestedField(1, "x", int_type, required=False))

    # One null plus one non-null value (so not "nulls only"); identical lower
    # and upper bounds mean the only non-null value is 5.
    encoded_five = to_bytes(int_type, 5)
    partial_null_file = DataFile.from_args(
        file_path="file.parquet",
        file_format=FileFormat.PARQUET,
        partition=Record(),
        record_count=2,
        value_counts={1: 2},
        null_value_counts={1: 1},
        nan_value_counts={},
        lower_bounds={1: encoded_five},
        upper_bounds={1: encoded_five},
    )

    def strict_eval(predicate):  # type: ignore[no-untyped-def]
        # Evaluate one predicate against the shared schema and data file.
        return _StrictMetricsEvaluator(table_schema, predicate).eval(partial_null_file)

    # Literal inside the bounds: the non-null value 5 violates the predicate.
    not_eq_inside = strict_eval(NotEqualTo("x", 5))
    assert not not_eq_inside, "Should not match: the non-null value 5 does not satisfy x != 5"

    not_in_inside = strict_eval(NotIn("x", {5}))
    assert not not_in_inside, "Should not match: the non-null value 5 is in {5}"

    # Literal outside the bounds: every non-null value satisfies the predicate
    # and the remaining nulls satisfy it too, so the whole file matches.
    not_eq_outside = strict_eval(NotEqualTo("x", 6))
    assert not_eq_outside, "Should match: no value equals 6 and nulls satisfy x != 6"

    not_in_outside = strict_eval(NotIn("x", {6}))
    assert not_in_outside, "Should match: no value is in {6} and nulls satisfy not-in"
1566-
1567-
15681618@pytest .mark .parametrize (
15691619 "file_type, evolved_type, lower_bound, upper_bound, op, lit, expected" ,
15701620 [
0 commit comments