|
24 | 24 | from pyiceberg.types import IntegerType, NestedField |
25 | 25 |
|
26 | 26 |
|
def _create_data_file(
    file_path: str = "s3://bucket/data.parquet",
    spec_id: int = 0,
    lower_bounds: dict[int, bytes] | None = None,
    upper_bounds: dict[int, bytes] | None = None,
) -> DataFile:
    """Build a Parquet ``DATA`` file fixture for the delete-index tests.

    Args:
        file_path: Location recorded on the data file.
        spec_id: Value assigned to the file's ``_spec_id`` after construction.
        lower_bounds: Optional lower-bound metrics keyed by int, forwarded
            unchanged to ``DataFile.from_args``.
        upper_bounds: Optional upper-bound metrics keyed by int, forwarded
            unchanged to ``DataFile.from_args``.

    Returns:
        A ``DataFile`` with 100 records, 1000 bytes and an empty partition.
    """
    file_args = {
        "content": DataFileContent.DATA,
        "file_path": file_path,
        "file_format": FileFormat.PARQUET,
        "partition": Record(),
        "record_count": 100,
        "file_size_in_bytes": 1000,
        "lower_bounds": lower_bounds,
        "upper_bounds": upper_bounds,
    }
    fixture = DataFile.from_args(**file_args)
    # The spec id is not passed to from_args here; it is set on the private attribute instead.
    fixture._spec_id = spec_id
    return fixture
@@ -84,6 +91,27 @@ def _create_deletion_vector( |
84 | 91 | return ManifestEntry.from_args(status=ManifestEntryStatus.ADDED, sequence_number=sequence_number, data_file=delete_file) |
85 | 92 |
|
86 | 93 |
|
def _create_equality_delete(
    sequence_number: int = 1,
    spec_id: int = 0,
    lower_bounds: dict[int, bytes] | None = None,
    upper_bounds: dict[int, bytes] | None = None,
) -> ManifestEntry:
    """Build an ``ADDED`` manifest entry wrapping an equality-delete file fixture.

    The delete file always uses ``equality_ids=[1]`` and a path derived from
    ``sequence_number``.

    Args:
        sequence_number: Sequence number of the manifest entry; also embedded
            in the delete file's path.
        spec_id: Value assigned to the delete file's ``_spec_id``.
        lower_bounds: Optional lower-bound metrics keyed by int, forwarded
            unchanged to ``DataFile.from_args``.
        upper_bounds: Optional upper-bound metrics keyed by int, forwarded
            unchanged to ``DataFile.from_args``.

    Returns:
        A ``ManifestEntry`` whose ``data_file`` is the equality-delete file.
    """
    delete_args = {
        "content": DataFileContent.EQUALITY_DELETES,
        "file_path": f"s3://bucket/eq-delete-{sequence_number}.parquet",
        "file_format": FileFormat.PARQUET,
        "partition": Record(),
        "record_count": 10,
        "file_size_in_bytes": 100,
        "equality_ids": [1],
        "lower_bounds": lower_bounds,
        "upper_bounds": upper_bounds,
    }
    eq_delete = DataFile.from_args(**delete_args)
    # The spec id is not passed to from_args here; it is set on the private attribute instead.
    eq_delete._spec_id = spec_id
    return ManifestEntry.from_args(
        status=ManifestEntryStatus.ADDED,
        sequence_number=sequence_number,
        data_file=eq_delete,
    )
| 113 | + |
| 114 | + |
87 | 115 | def test_empty_index() -> None: |
88 | 116 | index = DeleteFileIndex() |
89 | 117 | data_file = _create_data_file() |
@@ -236,46 +264,35 @@ def test_equality_delete_metrics_filtering() -> None: |
236 | 264 | schema = Schema(NestedField(1, "id", IntegerType(), required=True)) |
237 | 265 | index = DeleteFileIndex(schema=schema) |
238 | 266 |
|
239 | | - def _create_data_file_with_metrics(file_path: str, lower: int, upper: int) -> DataFile: |
240 | | - data_file = DataFile.from_args( |
241 | | - content=DataFileContent.DATA, |
242 | | - file_path=file_path, |
243 | | - file_format=FileFormat.PARQUET, |
244 | | - partition=Record(), |
245 | | - record_count=100, |
246 | | - file_size_in_bytes=1000, |
247 | | - lower_bounds={1: to_bytes(IntegerType(), lower)}, |
248 | | - upper_bounds={1: to_bytes(IntegerType(), upper)}, |
249 | | - ) |
250 | | - data_file._spec_id = 0 |
251 | | - return data_file |
252 | | - |
253 | | - def _create_equality_delete_with_metrics(sequence_number: int, lower: int, upper: int) -> ManifestEntry: |
254 | | - delete_file = DataFile.from_args( |
255 | | - content=DataFileContent.EQUALITY_DELETES, |
256 | | - file_path=f"s3://bucket/eq-delete-{sequence_number}.parquet", |
257 | | - file_format=FileFormat.PARQUET, |
258 | | - partition=Record(), |
259 | | - record_count=10, |
260 | | - file_size_in_bytes=100, |
261 | | - equality_ids=[1], |
262 | | - lower_bounds={1: to_bytes(IntegerType(), lower)}, |
263 | | - upper_bounds={1: to_bytes(IntegerType(), upper)}, |
264 | | - ) |
265 | | - delete_file._spec_id = 0 |
266 | | - return ManifestEntry.from_args(status=ManifestEntryStatus.ADDED, sequence_number=sequence_number, data_file=delete_file) |
267 | | - |
268 | 267 | # Equality delete for rows where id is between 10 and 20 |
269 | | - index.add_delete_file(_create_equality_delete_with_metrics(sequence_number=100, lower=10, upper=20)) |
| 268 | + index.add_delete_file( |
| 269 | + _create_equality_delete( |
| 270 | + sequence_number=100, |
| 271 | + lower_bounds={1: to_bytes(IntegerType(), 10)}, |
| 272 | + upper_bounds={1: to_bytes(IntegerType(), 20)}, |
| 273 | + ) |
| 274 | + ) |
270 | 275 |
|
271 | 276 | # Data file with id between 0 and 5 (no overlap) |
272 | | - file_no_overlap = _create_data_file_with_metrics("s3://bucket/no_overlap.parquet", 0, 5) |
| 277 | + file_no_overlap = _create_data_file( |
| 278 | + "s3://bucket/no_overlap.parquet", |
| 279 | + lower_bounds={1: to_bytes(IntegerType(), 0)}, |
| 280 | + upper_bounds={1: to_bytes(IntegerType(), 5)}, |
| 281 | + ) |
273 | 282 | assert len(index.for_data_file(1, file_no_overlap)) == 0 |
274 | 283 |
|
275 | 284 | # Data file with id between 15 and 25 (overlap) |
276 | | - file_overlap = _create_data_file_with_metrics("s3://bucket/overlap.parquet", 15, 25) |
| 285 | + file_overlap = _create_data_file( |
| 286 | + "s3://bucket/overlap.parquet", |
| 287 | + lower_bounds={1: to_bytes(IntegerType(), 15)}, |
| 288 | + upper_bounds={1: to_bytes(IntegerType(), 25)}, |
| 289 | + ) |
277 | 290 | assert len(index.for_data_file(1, file_overlap)) == 1 |
278 | 291 |
|
279 | 292 | # Data file with id between 25 and 30 (no overlap) |
280 | | - file_no_overlap_2 = _create_data_file_with_metrics("s3://bucket/no_overlap_2.parquet", 25, 30) |
| 293 | + file_no_overlap_2 = _create_data_file( |
| 294 | + "s3://bucket/no_overlap_2.parquet", |
| 295 | + lower_bounds={1: to_bytes(IntegerType(), 25)}, |
| 296 | + upper_bounds={1: to_bytes(IntegerType(), 30)}, |
| 297 | + ) |
281 | 298 | assert len(index.for_data_file(1, file_no_overlap_2)) == 0 |
0 commit comments