|
15 | 15 | # specific language governing permissions and limitations |
16 | 16 | # under the License. |
17 | 17 | # pylint: disable=protected-access,unused-argument,redefined-outer-name |
| 18 | +import json |
18 | 19 | import logging |
19 | 20 | import os |
| 21 | +import struct |
20 | 22 | import tempfile |
21 | 23 | import uuid |
22 | 24 | import warnings |
| 25 | +import zlib |
23 | 26 | from collections.abc import Iterator |
24 | 27 | from datetime import date, datetime, timezone |
25 | 28 | from pathlib import Path |
|
34 | 37 | import pytest |
35 | 38 | from packaging import version |
36 | 39 | from pyarrow.fs import AwsDefaultS3RetryStrategy, FileType, LocalFileSystem, S3FileSystem |
| 40 | +from pyroaring import BitMap |
37 | 41 |
|
38 | 42 | from pyiceberg.exceptions import ResolveError |
39 | 43 | from pyiceberg.expressions import ( |
|
91 | 95 | from pyiceberg.table import FileScanTask, TableProperties |
92 | 96 | from pyiceberg.table.metadata import TableMetadataV2 |
93 | 97 | from pyiceberg.table.name_mapping import create_mapping_from_schema |
| 98 | +from pyiceberg.table.puffin import ( |
| 99 | + _DV_BLOB_MAGIC_NUMBER, |
| 100 | + MAGIC_BYTES, |
| 101 | + PROPERTY_REFERENCED_DATA_FILE, |
| 102 | +) |
94 | 103 | from pyiceberg.transforms import HourTransform, IdentityTransform |
95 | 104 | from pyiceberg.typedef import UTF8, Properties, Record, TableVersion |
96 | 105 | from pyiceberg.types import ( |
@@ -1820,6 +1829,86 @@ def test_read_deletes(deletes_file: str, request: pytest.FixtureRequest) -> None |
1820 | 1829 | assert list(deletes.values())[0] == pa.chunked_array([[1, 3, 5]]) |
1821 | 1830 |
|
1822 | 1831 |
|
def _deletion_vector_bitmap_payload() -> bytes:
    """Build the serialized bitmap bytes used as a deletion-vector fixture.

    The result is an 8-byte little-endian ``1``, a 4-byte little-endian ``0``,
    and then the serialized ``BitMap`` holding the values 1, 3 and 5.
    """
    # "<QI" packs an unsigned 64-bit value followed by an unsigned 32-bit value,
    # little-endian and without padding: 12 bytes in total.
    header = struct.pack("<QI", 1, 0)
    serialized_positions = BitMap([1, 3, 5]).serialize()
    return header + serialized_positions
| 1834 | + |
| 1835 | + |
def _deletion_vector_blob(bitmap_payload: bytes) -> bytes:
    """Wrap ``bitmap_payload`` in a length-prefixed, CRC-suffixed blob.

    Args:
        bitmap_payload: Bytes placed directly after the magic number.

    Returns:
        A 4-byte big-endian length, then the little-endian magic number followed
        by ``bitmap_payload``, then a 4-byte big-endian CRC-32. Both the length
        and the checksum cover the magic number plus the payload.
    """
    magic = struct.pack("<I", _DV_BLOB_MAGIC_NUMBER)
    framed_bitmap = magic + bitmap_payload
    length_prefix = struct.pack(">I", len(framed_bitmap))
    checksum_suffix = struct.pack(">I", zlib.crc32(framed_bitmap) & 0xFFFFFFFF)
    return b"".join((length_prefix, framed_bitmap, checksum_suffix))
| 1839 | + |
| 1840 | + |
def test_read_deletion_vector_from_puffin_file(tmp_path: Path) -> None:
    """Check that ``_read_deletes`` returns positions 1, 3, 5 from a hand-assembled Puffin file."""
    data_file_location = f"{tmp_path}/data.parquet"
    puffin_location = f"{tmp_path}/deletes.puffin"
    serialized_bitmap = _deletion_vector_bitmap_payload()

    # Footer JSON: a single blob entry whose properties name the referenced data file.
    blob_metadata = {
        "type": "deletion-vector-v1",
        "fields": [2147483546],
        "snapshot-id": 1,
        "sequence-number": 1,
        "offset": 0,
        "length": len(serialized_bitmap),
        "properties": {PROPERTY_REFERENCED_DATA_FILE: data_file_location},
    }
    footer_json = json.dumps({"blobs": [blob_metadata], "properties": {}}).encode()

    # File layout, in order: magic, four zero bytes, the bitmap bytes, the footer JSON,
    # the footer length as 4 little-endian bytes, four zero bytes, magic.
    file_sections = (
        MAGIC_BYTES,
        b"\x00\x00\x00\x00",
        serialized_bitmap,
        footer_json,
        len(footer_json).to_bytes(4, byteorder="little"),
        b"\x00\x00\x00\x00",
        MAGIC_BYTES,
    )
    with open(puffin_location, "wb") as out:
        out.write(b"".join(file_sections))

    delete_file = DataFile.from_args(
        content=DataFileContent.POSITION_DELETES,
        file_path=puffin_location,
        file_format=FileFormat.PUFFIN,
    )
    result = _read_deletes(PyArrowFileIO(), delete_file)

    expected = {data_file_location: pa.chunked_array([[1, 3, 5]])}
    assert result == expected
| 1884 | + |
| 1885 | + |
def test_read_deletion_vector_blob_from_content_range(tmp_path: Path) -> None:
    """Check that ``_read_deletes`` returns positions 1, 3, 5 for a blob addressed by offset and size.

    The blob is written between unrelated leading and trailing bytes, and the
    ``DataFile`` carries ``content_offset`` / ``content_size_in_bytes`` that
    delimit exactly the blob.
    """
    data_file_location = f"{tmp_path}/data.parquet"
    container_location = f"{tmp_path}/deletes.bin"
    leading_bytes = b"\x01not-a-puffin-file"
    blob = _deletion_vector_blob(_deletion_vector_bitmap_payload())

    with open(container_location, "wb") as out:
        out.write(b"".join((leading_bytes, blob, b"trailing-bytes")))

    delete_file = DataFile.from_args(
        _table_format_version=3,
        content=DataFileContent.POSITION_DELETES,
        file_path=container_location,
        file_format=FileFormat.PUFFIN,
        record_count=3,
        referenced_data_file=data_file_location,
        # The blob begins right after the leading bytes and spans its own length.
        content_offset=len(leading_bytes),
        content_size_in_bytes=len(blob),
    )
    result = _read_deletes(PyArrowFileIO(), delete_file)

    expected = {data_file_location: pa.chunked_array([[1, 3, 5]])}
    assert result == expected
| 1910 | + |
| 1911 | + |
1823 | 1912 | def test_delete(deletes_file: str, request: pytest.FixtureRequest, table_schema_simple: Schema) -> None: |
1824 | 1913 | # Determine file format from the file extension |
1825 | 1914 | file_format = FileFormat.PARQUET if deletes_file.endswith(".parquet") else FileFormat.ORC |
|
0 commit comments