Skip to content

Commit 69dee3a

Browse files
committed
Add DeleteFileIndex support for EqualityDeletes
1 parent a1f2e5a commit 69dee3a

1 file changed

Lines changed: 51 additions & 34 deletions

File tree

tests/table/test_delete_file_index.py

Lines changed: 51 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -24,14 +24,21 @@
2424
from pyiceberg.types import IntegerType, NestedField
2525

2626

27-
def _create_data_file(file_path: str = "s3://bucket/data.parquet", spec_id: int = 0) -> DataFile:
27+
def _create_data_file(
28+
file_path: str = "s3://bucket/data.parquet",
29+
spec_id: int = 0,
30+
lower_bounds: dict[int, bytes] | None = None,
31+
upper_bounds: dict[int, bytes] | None = None,
32+
) -> DataFile:
2833
data_file = DataFile.from_args(
2934
content=DataFileContent.DATA,
3035
file_path=file_path,
3136
file_format=FileFormat.PARQUET,
3237
partition=Record(),
3338
record_count=100,
3439
file_size_in_bytes=1000,
40+
lower_bounds=lower_bounds,
41+
upper_bounds=upper_bounds,
3542
)
3643
data_file._spec_id = spec_id
3744
return data_file
@@ -84,6 +91,27 @@ def _create_deletion_vector(
8491
return ManifestEntry.from_args(status=ManifestEntryStatus.ADDED, sequence_number=sequence_number, data_file=delete_file)
8592

8693

94+
def _create_equality_delete(
95+
sequence_number: int = 1,
96+
spec_id: int = 0,
97+
lower_bounds: dict[int, bytes] | None = None,
98+
upper_bounds: dict[int, bytes] | None = None,
99+
) -> ManifestEntry:
100+
delete_file = DataFile.from_args(
101+
content=DataFileContent.EQUALITY_DELETES,
102+
file_path=f"s3://bucket/eq-delete-{sequence_number}.parquet",
103+
file_format=FileFormat.PARQUET,
104+
partition=Record(),
105+
record_count=10,
106+
file_size_in_bytes=100,
107+
equality_ids=[1],
108+
lower_bounds=lower_bounds,
109+
upper_bounds=upper_bounds,
110+
)
111+
delete_file._spec_id = spec_id
112+
return ManifestEntry.from_args(status=ManifestEntryStatus.ADDED, sequence_number=sequence_number, data_file=delete_file)
113+
114+
87115
def test_empty_index() -> None:
88116
index = DeleteFileIndex()
89117
data_file = _create_data_file()
@@ -236,46 +264,35 @@ def test_equality_delete_metrics_filtering() -> None:
236264
schema = Schema(NestedField(1, "id", IntegerType(), required=True))
237265
index = DeleteFileIndex(schema=schema)
238266

239-
def _create_data_file_with_metrics(file_path: str, lower: int, upper: int) -> DataFile:
240-
data_file = DataFile.from_args(
241-
content=DataFileContent.DATA,
242-
file_path=file_path,
243-
file_format=FileFormat.PARQUET,
244-
partition=Record(),
245-
record_count=100,
246-
file_size_in_bytes=1000,
247-
lower_bounds={1: to_bytes(IntegerType(), lower)},
248-
upper_bounds={1: to_bytes(IntegerType(), upper)},
249-
)
250-
data_file._spec_id = 0
251-
return data_file
252-
253-
def _create_equality_delete_with_metrics(sequence_number: int, lower: int, upper: int) -> ManifestEntry:
254-
delete_file = DataFile.from_args(
255-
content=DataFileContent.EQUALITY_DELETES,
256-
file_path=f"s3://bucket/eq-delete-{sequence_number}.parquet",
257-
file_format=FileFormat.PARQUET,
258-
partition=Record(),
259-
record_count=10,
260-
file_size_in_bytes=100,
261-
equality_ids=[1],
262-
lower_bounds={1: to_bytes(IntegerType(), lower)},
263-
upper_bounds={1: to_bytes(IntegerType(), upper)},
264-
)
265-
delete_file._spec_id = 0
266-
return ManifestEntry.from_args(status=ManifestEntryStatus.ADDED, sequence_number=sequence_number, data_file=delete_file)
267-
268267
# Equality delete for rows where id is between 10 and 20
269-
index.add_delete_file(_create_equality_delete_with_metrics(sequence_number=100, lower=10, upper=20))
268+
index.add_delete_file(
269+
_create_equality_delete(
270+
sequence_number=100,
271+
lower_bounds={1: to_bytes(IntegerType(), 10)},
272+
upper_bounds={1: to_bytes(IntegerType(), 20)},
273+
)
274+
)
270275

271276
# Data file with id between 0 and 5 (no overlap)
272-
file_no_overlap = _create_data_file_with_metrics("s3://bucket/no_overlap.parquet", 0, 5)
277+
file_no_overlap = _create_data_file(
278+
"s3://bucket/no_overlap.parquet",
279+
lower_bounds={1: to_bytes(IntegerType(), 0)},
280+
upper_bounds={1: to_bytes(IntegerType(), 5)},
281+
)
273282
assert len(index.for_data_file(1, file_no_overlap)) == 0
274283

275284
# Data file with id between 15 and 25 (overlap)
276-
file_overlap = _create_data_file_with_metrics("s3://bucket/overlap.parquet", 15, 25)
285+
file_overlap = _create_data_file(
286+
"s3://bucket/overlap.parquet",
287+
lower_bounds={1: to_bytes(IntegerType(), 15)},
288+
upper_bounds={1: to_bytes(IntegerType(), 25)},
289+
)
277290
assert len(index.for_data_file(1, file_overlap)) == 1
278291

279292
# Data file with id between 25 and 30 (no overlap)
280-
file_no_overlap_2 = _create_data_file_with_metrics("s3://bucket/no_overlap_2.parquet", 25, 30)
293+
file_no_overlap_2 = _create_data_file(
294+
"s3://bucket/no_overlap_2.parquet",
295+
lower_bounds={1: to_bytes(IntegerType(), 25)},
296+
upper_bounds={1: to_bytes(IntegerType(), 30)},
297+
)
281298
assert len(index.for_data_file(1, file_no_overlap_2)) == 0

0 commit comments

Comments (0)