|
17 | 17 | # pylint: disable=protected-access,unused-argument,redefined-outer-name |
18 | 18 | import logging |
19 | 19 | import os |
| 20 | +import struct |
20 | 21 | import tempfile |
21 | 22 | import uuid |
22 | 23 | import warnings |
| 24 | +import zlib |
23 | 25 | from collections.abc import Iterator |
24 | 26 | from datetime import date, datetime, timezone |
25 | 27 | from pathlib import Path |
|
34 | 36 | import pytest |
35 | 37 | from packaging import version |
36 | 38 | from pyarrow.fs import AwsDefaultS3RetryStrategy, FileType, LocalFileSystem, S3FileSystem |
| 39 | +from pyroaring import BitMap |
37 | 40 |
|
38 | 41 | from pyiceberg.exceptions import ResolveError |
39 | 42 | from pyiceberg.expressions import ( |
|
91 | 94 | from pyiceberg.table import FileScanTask, TableProperties |
92 | 95 | from pyiceberg.table.metadata import TableMetadataV2 |
93 | 96 | from pyiceberg.table.name_mapping import create_mapping_from_schema |
| 97 | +from pyiceberg.table.puffin import _DV_BLOB_MAGIC_NUMBER |
94 | 98 | from pyiceberg.transforms import HourTransform, IdentityTransform |
95 | 99 | from pyiceberg.typedef import UTF8, Properties, Record, TableVersion |
96 | 100 | from pyiceberg.types import ( |
@@ -1820,6 +1824,42 @@ def test_read_deletes(deletes_file: str, request: pytest.FixtureRequest) -> None |
1820 | 1824 | assert list(deletes.values())[0] == pa.chunked_array([[1, 3, 5]]) |
1821 | 1825 |
|
1822 | 1826 |
|
def test_read_deletion_vector_blob_from_content_range(tmp_path: Path) -> None:
    """Check that a deletion vector is read from the byte range named by the DataFile.

    The blob is written between unrelated leading and trailing bytes, so the
    file as a whole is not a Puffin file; the DataFile's ``content_offset`` and
    ``content_size_in_bytes`` are the only things that locate the blob.
    """
    expected_positions = [1, 3, 5]
    referenced_data_file = f"{tmp_path}/data.parquet"
    delete_file_path = f"{tmp_path}/deletes.bin"

    # Magic number (little-endian), then an 8-byte little-endian 1 and a 4-byte
    # little-endian 0, then the serialized 32-bit bitmap.
    # NOTE(review): the 1 and 0 are presumably the bitmap count and the bitmap
    # key of the 64-bit roaring layout -- confirm against the Puffin spec.
    magic_and_vector = b"".join(
        (
            struct.pack("<I", _DV_BLOB_MAGIC_NUMBER),
            struct.pack("<QI", 1, 0),
            BitMap(expected_positions).serialize(),
        )
    )

    # Frame: big-endian length, the magic + vector bytes, big-endian CRC-32 of
    # those same bytes.
    length_field = struct.pack(">I", len(magic_and_vector))
    checksum_field = struct.pack(">I", zlib.crc32(magic_and_vector) & 0xFFFFFFFF)
    dv_blob = length_field + magic_and_vector + checksum_field

    # Surround the blob with bytes that do not belong to it.
    leading_bytes = b"\x01not-a-puffin-file"
    Path(delete_file_path).write_bytes(leading_bytes + dv_blob + b"trailing-bytes")

    delete_file = DataFile.from_args(
        _table_format_version=3,
        content=DataFileContent.POSITION_DELETES,
        file_path=delete_file_path,
        file_format=FileFormat.PUFFIN,
        record_count=3,
        referenced_data_file=referenced_data_file,
        content_offset=len(leading_bytes),
        content_size_in_bytes=len(dv_blob),
    )

    deletes = _read_deletes(PyArrowFileIO(), delete_file)

    assert deletes == {referenced_data_file: pa.chunked_array([expected_positions])}
| 1861 | + |
| 1862 | + |
1823 | 1863 | def test_delete(deletes_file: str, request: pytest.FixtureRequest, table_schema_simple: Schema) -> None: |
1824 | 1864 | # Determine file format from the file extension |
1825 | 1865 | file_format = FileFormat.PARQUET if deletes_file.endswith(".parquet") else FileFormat.ORC |
|
0 commit comments