1414# KIND, either express or implied. See the License for the
1515# specific language governing permissions and limitations
1616# under the License.
17- import math
1817from typing import TYPE_CHECKING , Literal
1918
2019from pydantic import Field
21- from pyroaring import BitMap , FrozenBitMap
2220
2321from pyiceberg .typedef import IcebergBaseModel
2422
2523if TYPE_CHECKING :
26- import pyarrow as pa
24+ pass
2725
2826# Short for: Puffin Fratercula arctica, version 1
2927MAGIC_BYTES = b"PFA1"
30- EMPTY_BITMAP = FrozenBitMap ()
31- MAX_JAVA_SIGNED = int (math .pow (2 , 31 )) - 1
32- PROPERTY_REFERENCED_DATA_FILE = "referenced-data-file"
33-
34-
35- def _deserialize_bitmap (pl : bytes ) -> list [BitMap ]:
36- number_of_bitmaps = int .from_bytes (pl [0 :8 ], byteorder = "little" )
37- pl = pl [8 :]
38-
39- bitmaps = []
40- last_key = - 1
41- for _ in range (number_of_bitmaps ):
42- key = int .from_bytes (pl [0 :4 ], byteorder = "little" )
43- if key < 0 :
44- raise ValueError (f"Invalid unsigned key: { key } " )
45- if key <= last_key :
46- raise ValueError ("Keys must be sorted in ascending order" )
47- if key > MAX_JAVA_SIGNED :
48- raise ValueError (f"Key { key } is too large, max { MAX_JAVA_SIGNED } to maintain compatibility with Java impl" )
49- pl = pl [4 :]
50-
51- while last_key < key - 1 :
52- bitmaps .append (EMPTY_BITMAP )
53- last_key += 1
54-
55- bm = BitMap ().deserialize (pl )
56- # TODO: Optimize this
57- pl = pl [len (bm .serialize ()) :]
58- bitmaps .append (bm )
59-
60- last_key = key
61-
62- return bitmaps
6328
6429
6530class PuffinBlobMetadata (IcebergBaseModel ):
@@ -78,15 +43,9 @@ class Footer(IcebergBaseModel):
7843 properties : dict [str , str ] = Field (default_factory = dict )
7944
8045
81- def _bitmaps_to_chunked_array (bitmaps : list [BitMap ]) -> "pa.ChunkedArray" :
82- import pyarrow as pa
83-
84- return pa .chunked_array ([(key_pos << 32 ) + pos for pos in bitmap ] for key_pos , bitmap in enumerate (bitmaps ))
85-
86-
8746class PuffinFile :
8847 footer : Footer
89- _deletion_vectors : dict [ str , list [ BitMap ]]
48+ _payload : bytes
9049
9150 def __init__ (self , puffin : bytes ) -> None :
9251 for magic_bytes in [puffin [:4 ], puffin [- 4 :]]:
@@ -105,12 +64,7 @@ def __init__(self, puffin: bytes) -> None:
10564 footer_payload_size_int = int .from_bytes (puffin [- 12 :- 8 ], byteorder = "little" )
10665
10766 self .footer = Footer .model_validate_json (puffin [- (footer_payload_size_int + 12 ) : - 12 ])
108- puffin = puffin [8 :]
109-
110- self ._deletion_vectors = {
111- blob .properties [PROPERTY_REFERENCED_DATA_FILE ]: _deserialize_bitmap (puffin [blob .offset : blob .offset + blob .length ])
112- for blob in self .footer .blobs
113- }
67+ self ._payload = puffin [8 :]
11468
115- def to_vector (self ) -> dict [ str , "pa.ChunkedArray" ] :
116- return { path : _bitmaps_to_chunked_array ( bitmaps ) for path , bitmaps in self . _deletion_vectors . items ()}
69+ def get_blob_payload (self , blob : PuffinBlobMetadata ) -> bytes :
70+ return self . _payload [ blob . offset : blob . offset + blob . length ]
0 commit comments