Commit a214e71

jerry-024 authored and JingsongLi committed

[python] support blob type and blob write and read (#6390)

1 parent 05ad2a0, commit a214e71

12 files changed: 2,124 additions & 3 deletions

paimon-python/pypaimon/common/core_options.py

Lines changed: 1 addition & 0 deletions
@@ -38,6 +38,7 @@ def __str__(self):
     FILE_FORMAT_ORC = "orc"
     FILE_FORMAT_AVRO = "avro"
     FILE_FORMAT_PARQUET = "parquet"
+    FILE_FORMAT_BLOB = "blob"
     FILE_COMPRESSION = "file.compression"
     FILE_COMPRESSION_PER_LEVEL = "file.compression.per.level"
     FILE_FORMAT_PER_LEVEL = "file.format.per.level"

paimon-python/pypaimon/common/delta_varint_compressor.py

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

import io
from typing import List


class DeltaVarintCompressor:

    @staticmethod
    def compress(data: List[int]) -> bytes:
        if not data:
            return b''

        # Estimate output size (conservative: 5 bytes per varint max)
        out = io.BytesIO()
        out.seek(0)

        # Encode first value directly
        DeltaVarintCompressor._encode_varint(data[0], out)

        # Encode deltas without intermediate list creation
        prev = data[0]
        for i in range(1, len(data)):
            current = data[i]
            delta = current - prev
            DeltaVarintCompressor._encode_varint(delta, out)
            prev = current

        # Return only the used portion of the buffer
        position = out.tell()
        result = out.getvalue()
        out.close()
        return result[:position]

    @staticmethod
    def decompress(compressed: bytes) -> List[int]:
        if not compressed:
            return []

        # Fast path: decode directly into result without intermediate deltas list
        in_stream = io.BytesIO(compressed)
        result = []

        try:
            # Decode first value
            first_value = DeltaVarintCompressor._decode_varint(in_stream)
            result.append(first_value)

            # Decode and reconstruct remaining values in one pass
            current_value = first_value
            while True:
                try:
                    delta = DeltaVarintCompressor._decode_varint(in_stream)
                    current_value += delta
                    result.append(current_value)
                except RuntimeError:
                    # End of stream reached
                    break

        except RuntimeError:
            # Handle empty stream case
            pass
        finally:
            in_stream.close()

        return result

    @staticmethod
    def _encode_varint(value: int, out: io.BytesIO) -> None:
        # ZigZag encoding: maps signed integers to unsigned integers
        if value >= 0:
            zigzag = value << 1
        else:
            zigzag = ((-value) << 1) - 1

        # Varint encoding
        while zigzag >= 0x80:
            out.write(bytes([(zigzag & 0x7F) | 0x80]))
            zigzag >>= 7
        out.write(bytes([zigzag & 0x7F]))

    @staticmethod
    def _decode_varint(in_stream: io.BytesIO) -> int:
        result = 0
        shift = 0
        while True:
            byte_data = in_stream.read(1)
            if not byte_data:
                if shift == 0:
                    # Natural end of stream
                    raise RuntimeError("End of stream")
                else:
                    # Unexpected end in middle of varint
                    raise RuntimeError("Unexpected end of input")

            b = byte_data[0]
            result |= (b & 0x7F) << shift
            if (b & 0x80) == 0:
                break

            shift += 7
            if shift > 63:
                raise RuntimeError("Varint overflow")

        # ZigZag decoding: maps unsigned integers back to signed integers
        if result & 1:
            return -((result + 1) >> 1)
        else:
            return result >> 1
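
A quick round trip with the codec above (a standalone sketch, not part of the commit; the module path follows the import used by the blob reader later in this diff):

from pypaimon.common.delta_varint_compressor import DeltaVarintCompressor

# Blob lengths within one file tend to be close together, so the deltas
# stay small and each one fits in a byte or two after zigzag encoding.
lengths = [1_048_592, 1_048_600, 1_048_588, 2_097_168]
packed = DeltaVarintCompressor.compress(lengths)
assert DeltaVarintCompressor.decompress(packed) == lengths

# Negative deltas are handled by the zigzag step: 5 encodes to 0x0A,
# and the delta -1 maps to 1, i.e. the single byte 0x01.
assert DeltaVarintCompressor.compress([5, 4]) == bytes([0x0A, 0x01])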

paimon-python/pypaimon/common/file_io.py

Lines changed: 55 additions & 0 deletions
@@ -28,6 +28,11 @@
 from pyarrow._fs import FileSystem
 
 from pypaimon.common.config import OssOptions, S3Options
+from pypaimon.schema.data_types import DataField, AtomicType, PyarrowFieldParser
+from pypaimon.table.row.blob import BlobData
+from pypaimon.table.row.generic_row import GenericRow
+from pypaimon.table.row.row_kind import RowKind
+from pypaimon.write.blob_format_writer import BlobFormatWriter
 
 
 class FileIO:
@@ -364,3 +369,53 @@ def record_generator():
 
         with self.new_output_stream(path) as output_stream:
             fastavro.writer(output_stream, avro_schema, records, **kwargs)
+
+    def write_blob(self, path: Path, data: pyarrow.Table, **kwargs):
+        try:
+            # Validate input constraints
+            if data.num_columns != 1:
+                raise RuntimeError(f"Blob format only supports a single column, got {data.num_columns} columns")
+            # Check for null values
+            column = data.column(0)
+            if column.null_count > 0:
+                raise RuntimeError("Blob format does not support null values")
+            # Convert PyArrow schema to Paimon DataFields
+            # For blob files, we expect exactly one blob column
+            field = data.schema[0]
+            if pyarrow.types.is_large_binary(field.type):
+                fields = [DataField(0, field.name, AtomicType("BLOB"))]
+            else:
+                # Convert other types as needed
+                paimon_type = PyarrowFieldParser.to_paimon_type(field.type, field.nullable)
+                fields = [DataField(0, field.name, paimon_type)]
+            # Convert PyArrow Table to records
+            records_dict = data.to_pydict()
+            num_rows = data.num_rows
+            field_name = fields[0].name
+            with self.new_output_stream(path) as output_stream:
+                writer = BlobFormatWriter(output_stream)
+                # Write each row
+                for i in range(num_rows):
+                    col_data = records_dict[field_name][i]
+                    # Convert to appropriate type based on field type
+                    if hasattr(fields[0].type, 'type') and fields[0].type.type == "BLOB":
+                        if isinstance(col_data, bytes):
+                            blob_data = BlobData(col_data)
+                        else:
+                            # Convert to bytes if needed
+                            if hasattr(col_data, 'as_py'):
+                                col_data = col_data.as_py()
+                            if isinstance(col_data, str):
+                                col_data = col_data.encode('utf-8')
+                            blob_data = BlobData(col_data)
+                        row_values = [blob_data]
+                    else:
+                        row_values = [col_data]
+                    # Create GenericRow and write
+                    row = GenericRow(row_values, fields, RowKind.INSERT)
+                    writer.add_element(row)
+                writer.close()
+
+        except Exception as e:
+            self.delete_quietly(path)
+            raise RuntimeError(f"Failed to write blob file {path}: {e}") from e
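
For orientation, a call to the new write_blob helper could look like the sketch below (not part of the commit; file_io stands for an already constructed pypaimon FileIO and the output path is a placeholder). The table must be a single non-null column, and a large_binary column is the case that write_blob maps to the BLOB type:

import pyarrow as pa
from pathlib import Path


def write_one_blob_file(file_io, path: str) -> None:
    # One column, no nulls: write_blob rejects anything else.
    payloads = pa.array([b"first blob payload", b"second blob payload"],
                        type=pa.large_binary())
    table = pa.table({"content": payloads})
    # A large_binary column is written as a Paimon BLOB via BlobFormatWriter.
    file_io.write_blob(Path(path), table)
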
Lines changed: 199 additions & 0 deletions
@@ -0,0 +1,199 @@
################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
import struct
from pathlib import Path
from typing import List, Optional, Any, Iterator

import pyarrow as pa
import pyarrow.dataset as ds
from pyarrow import RecordBatch

from pypaimon.common.delta_varint_compressor import DeltaVarintCompressor
from pypaimon.common.file_io import FileIO
from pypaimon.read.reader.iface.record_batch_reader import RecordBatchReader
from pypaimon.schema.data_types import DataField, PyarrowFieldParser
from pypaimon.table.row.blob import Blob, BlobDescriptor, BlobRef
from pypaimon.table.row.generic_row import GenericRow


class FormatBlobReader(RecordBatchReader):

    def __init__(self, file_io: FileIO, file_path: str, read_fields: List[str],
                 full_fields: List[DataField], push_down_predicate: Any):
        self._file_io = file_io
        self._file_path = file_path
        self._push_down_predicate = push_down_predicate

        # Get file size
        self._file_size = file_io.get_file_size(file_path)

        # Initialize the low-level blob format reader
        self.file_path = file_path
        self.blob_lengths: List[int] = []
        self.blob_offsets: List[int] = []
        self.returned = False
        self._read_index()

        # Set up fields and schema
        if len(read_fields) > 1:
            raise RuntimeError("Blob reader only supports one field.")
        self._fields = read_fields
        full_fields_map = {field.name: field for field in full_fields}
        projected_data_fields = [full_fields_map[name] for name in read_fields]
        self._schema = PyarrowFieldParser.from_paimon_schema(projected_data_fields)

        # Initialize iterator
        self._blob_iterator = None
        self._current_batch = None

    def read_arrow_batch(self) -> Optional[RecordBatch]:
        if self._blob_iterator is None:
            if self.returned:
                return None
            self.returned = True
            batch_iterator = BlobRecordIterator(self.file_path, self.blob_lengths, self.blob_offsets, self._fields[0])
            self._blob_iterator = iter(batch_iterator)

        # Collect records for this batch
        pydict_data = {name: [] for name in self._fields}
        records_in_batch = 0

        try:
            while True:
                # Get next blob record
                blob_row = next(self._blob_iterator)
                # Check if first read returns None, stop immediately
                if blob_row is None:
                    break

                # Extract blob data from the row
                blob = blob_row.values[0]  # Blob files have single blob field

                # Convert blob to appropriate format for each requested field
                for field_name in self._fields:
                    # For blob files, all fields should contain blob data
                    if isinstance(blob, Blob):
                        blob_data = blob.to_data()
                    else:
                        blob_data = bytes(blob) if blob is not None else None
                    pydict_data[field_name].append(blob_data)

                records_in_batch += 1

        except StopIteration:
            # Stop immediately when StopIteration occurs
            pass

        if records_in_batch == 0:
            return None

        # Create RecordBatch
        if self._push_down_predicate is None:
            # Convert to Table first, then to RecordBatch
            table = pa.Table.from_pydict(pydict_data, self._schema)
            if table.num_rows > 0:
                return table.to_batches()[0]
            else:
                return None
        else:
            # Apply predicate filtering
            pa_batch = pa.Table.from_pydict(pydict_data, self._schema)
            dataset = ds.InMemoryDataset(pa_batch)
            scanner = dataset.scanner(filter=self._push_down_predicate)
            combine_chunks = scanner.to_table().combine_chunks()
            if combine_chunks.num_rows > 0:
                return combine_chunks.to_batches()[0]
            else:
                return None

    def close(self):
        self._blob_iterator = None

    def _read_index(self) -> None:
        with self._file_io.new_input_stream(Path(self.file_path)) as f:
            # Seek to header: last 5 bytes
            f.seek(self._file_size - 5)
            header = f.read(5)

            if len(header) != 5:
                raise IOError("Invalid blob file: cannot read header")

            # Parse header
            index_length = struct.unpack('<I', header[:4])[0]  # Little endian
            version = header[4]

            if version != 1:
                raise IOError(f"Unsupported blob file version: {version}")

            # Read index data
            f.seek(self._file_size - 5 - index_length)
            index_bytes = f.read(index_length)

            if len(index_bytes) != index_length:
                raise IOError("Invalid blob file: cannot read index")

            # Decompress blob lengths and compute offsets
            blob_lengths = DeltaVarintCompressor.decompress(index_bytes)
            blob_offsets = []
            offset = 0
            for length in blob_lengths:
                blob_offsets.append(offset)
                offset += length
            self.blob_lengths = blob_lengths
            self.blob_offsets = blob_offsets


class BlobRecordIterator:
    MAGIC_NUMBER_SIZE = 4
    METADATA_OVERHEAD = 16

    def __init__(self, file_path: str, blob_lengths: List[int], blob_offsets: List[int], field_name: str):
        self.file_path = file_path
        self.field_name = field_name
        self.blob_lengths = blob_lengths
        self.blob_offsets = blob_offsets
        self.current_position = 0

    def __iter__(self) -> Iterator[GenericRow]:
        return self

    def __next__(self) -> GenericRow:
        if self.current_position >= len(self.blob_lengths):
            raise StopIteration

        # Create blob reference for the current blob
        # Skip magic number (4 bytes) and exclude length (8 bytes) + CRC (4 bytes) = 12 bytes
        blob_offset = self.blob_offsets[self.current_position] + self.MAGIC_NUMBER_SIZE  # Skip magic number
        blob_length = self.blob_lengths[self.current_position] - self.METADATA_OVERHEAD

        # Create BlobDescriptor for this blob
        descriptor = BlobDescriptor(self.file_path, blob_offset, blob_length)
        blob = BlobRef(descriptor)

        self.current_position += 1

        # Return as GenericRow with single blob field
        from pypaimon.schema.data_types import DataField, AtomicType
        from pypaimon.table.row.row_kind import RowKind

        fields = [DataField(0, self.field_name, AtomicType("BLOB"))]
        return GenericRow([blob], fields, RowKind.INSERT)

    def returned_position(self) -> int:
        """Get current position in the iterator."""
        return self.current_position
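
Taken together, _read_index and BlobRecordIterator imply the v1 on-disk layout: each blob is stored as a 4-byte magic number, the payload, and 12 trailing bytes (an 8-byte length plus a 4-byte CRC, so 16 bytes of overhead per blob); after the last blob comes a delta-varint index of the per-blob total lengths, and the file ends with a 5-byte footer holding the index length (4 bytes, little endian) and the version byte. A minimal sketch that walks this layout from a local file, assuming plain open() access instead of pypaimon's FileIO, might look like:

import struct

from pypaimon.common.delta_varint_compressor import DeltaVarintCompressor


def list_blob_extents(path: str):
    """Yield (offset, length) of each blob payload in a v1 blob file."""
    with open(path, 'rb') as f:
        file_size = f.seek(0, 2)
        # 5-byte footer: 4-byte little-endian index length + 1-byte version.
        f.seek(file_size - 5)
        index_length, version = struct.unpack('<IB', f.read(5))
        if version != 1:
            raise IOError(f"Unsupported blob file version: {version}")
        # The index is a delta-varint-compressed list of total blob lengths.
        f.seek(file_size - 5 - index_length)
        blob_lengths = DeltaVarintCompressor.decompress(f.read(index_length))
        offset = 0
        for length in blob_lengths:
            # Skip the 4-byte magic number and drop the trailing
            # 8-byte length + 4-byte CRC (16 bytes of overhead per blob).
            yield offset + 4, length - 16
            offset += length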
