Skip to content

Commit a6bc9c1

Browse files
Fix: Handle bytes values in string column statistics from Parquet
Problem: When using `add_files()` with Parquet files written by DuckDB, PyIceberg fails with `AttributeError: 'bytes' object has no attribute 'encode'`. Root Cause: The Parquet format stores column statistics (min_value, max_value) as binary data in the Statistics struct (see parquet.thrift). When PyArrow reads these statistics from Parquet files, it may return them as Python `bytes` objects rather than decoded `str` values. This is valid per the Parquet specification: struct Statistics { 5: optional binary max_value; 6: optional binary min_value; } PyIceberg's StatsAggregator expected string statistics to always be `str`, causing failures when processing Parquet files from writers like DuckDB that expose this binary representation. Fix: 1. In `StatsAggregator.min_as_bytes()`: Add handling for bytes values by decoding to UTF-8 string before truncation and serialization. 2. In `StatsAggregator.max_as_bytes()`: Update existing string handling to decode bytes values before processing (was raising ValueError). 3. In `to_bytes()` for StringType: Add defensive isinstance check to handle bytes values as a safety fallback. 4. Add unit tests for both StatsAggregator bytes handling and to_bytes.
1 parent c0e7c6d commit a6bc9c1

File tree

3 files changed

+134
-4
lines changed

3 files changed

+134
-4
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2183,6 +2183,11 @@ def min_as_bytes(self) -> bytes | None:
21832183
if self.current_min is None:
21842184
return None
21852185

2186+
# The Parquet format stores column statistics as binary (see Statistics struct
2187+
# in parquet.thrift). PyArrow may return these as bytes instead of str.
2188+
if self.primitive_type == StringType() and isinstance(self.current_min, bytes):
2189+
self.current_min = self.current_min.decode("utf-8")
2190+
21862191
return self.serialize(
21872192
self.current_min
21882193
if self.trunc_length is None
@@ -2194,10 +2199,14 @@ def max_as_bytes(self) -> bytes | None:
21942199
return None
21952200

21962201
if self.primitive_type == StringType():
2197-
if not isinstance(self.current_max, str):
2198-
raise ValueError("Expected the current_max to be a string")
2199-
s_result = truncate_upper_bound_text_string(self.current_max, self.trunc_length)
2200-
return self.serialize(s_result) if s_result is not None else None
2202+
# The Parquet format stores column statistics as binary (see Statistics struct
2203+
# in parquet.thrift). PyArrow may return these as bytes instead of str.
2204+
if isinstance(self.current_max, bytes):
2205+
self.current_max = self.current_max.decode("utf-8")
2206+
if isinstance(self.current_max, str):
2207+
s_result = truncate_upper_bound_text_string(self.current_max, self.trunc_length)
2208+
return self.serialize(s_result) if s_result is not None else None
2209+
raise ValueError(f"Expected the current_max to be a str, got {type(self.current_max)}")
22012210
elif self.primitive_type == BinaryType():
22022211
if not isinstance(self.current_max, bytes):
22032212
raise ValueError("Expected the current_max to be bytes")

tests/io/test_pyarrow.py

Lines changed: 109 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2232,6 +2232,115 @@ def test_stats_aggregator_physical_type_does_not_match_expected_raise_error(
22322232
StatsAggregator(iceberg_type, physical_type_string)
22332233

22342234

2235+
def test_stats_aggregator_string_type_with_bytes_value() -> None:
2236+
"""Test that StatsAggregator handles bytes values for StringType.
2237+
2238+
The Parquet format stores column statistics as binary data (see Statistics
2239+
struct in parquet.thrift). When PyArrow reads these statistics, it may
2240+
return them as Python bytes objects. This test ensures we handle bytes
2241+
values correctly, as produced by writers like DuckDB.
2242+
"""
2243+
stats = StatsAggregator(StringType(), "BYTE_ARRAY")
2244+
2245+
# Simulate bytes values as returned by PyArrow from Parquet statistics
2246+
# These represent realistic min/max values for a "source" column
2247+
stats.update_min(b"/docs/readme.md")
2248+
stats.update_max(b"/docs/tutorial.md")
2249+
2250+
assert stats.current_min == b"/docs/readme.md"
2251+
assert stats.current_max == b"/docs/tutorial.md"
2252+
2253+
# Verify serialization decodes bytes to str and encodes back to bytes
2254+
min_bytes = stats.min_as_bytes()
2255+
max_bytes = stats.max_as_bytes()
2256+
2257+
assert min_bytes == b"/docs/readme.md"
2258+
assert max_bytes == b"/docs/tutorial.md"
2259+
2260+
2261+
def test_stats_aggregator_string_type_with_str_value() -> None:
2262+
"""Test that StatsAggregator handles str values for StringType."""
2263+
stats = StatsAggregator(StringType(), "BYTE_ARRAY")
2264+
2265+
# Standard str values (the common case when PyArrow decodes statistics)
2266+
stats.update_min("2024-01-01")
2267+
stats.update_max("2024-12-31")
2268+
2269+
assert stats.current_min == "2024-01-01"
2270+
assert stats.current_max == "2024-12-31"
2271+
2272+
# Verify serialization encodes str to bytes
2273+
min_bytes = stats.min_as_bytes()
2274+
max_bytes = stats.max_as_bytes()
2275+
2276+
assert min_bytes == b"2024-01-01"
2277+
assert max_bytes == b"2024-12-31"
2278+
2279+
2280+
def test_stats_aggregator_integer_type() -> None:
2281+
"""Test that StatsAggregator handles IntegerType min/max statistics."""
2282+
stats = StatsAggregator(IntegerType(), "INT32")
2283+
2284+
stats.update_min(1)
2285+
stats.update_max(1000)
2286+
2287+
assert stats.current_min == 1
2288+
assert stats.current_max == 1000
2289+
2290+
min_bytes = stats.min_as_bytes()
2291+
max_bytes = stats.max_as_bytes()
2292+
2293+
# INT32 is stored as 4 bytes little-endian
2294+
assert min_bytes == (1).to_bytes(4, byteorder="little", signed=True)
2295+
assert max_bytes == (1000).to_bytes(4, byteorder="little", signed=True)
2296+
2297+
2298+
def test_stats_aggregator_long_type() -> None:
2299+
"""Test that StatsAggregator handles LongType min/max statistics."""
2300+
stats = StatsAggregator(LongType(), "INT64")
2301+
2302+
stats.update_min(-9223372036854775808) # Long min
2303+
stats.update_max(9223372036854775807) # Long max
2304+
2305+
min_bytes = stats.min_as_bytes()
2306+
max_bytes = stats.max_as_bytes()
2307+
2308+
# INT64 is stored as 8 bytes little-endian
2309+
assert min_bytes == (-9223372036854775808).to_bytes(8, byteorder="little", signed=True)
2310+
assert max_bytes == (9223372036854775807).to_bytes(8, byteorder="little", signed=True)
2311+
2312+
2313+
def test_stats_aggregator_double_type() -> None:
2314+
"""Test that StatsAggregator handles DoubleType min/max statistics."""
2315+
import struct
2316+
2317+
stats = StatsAggregator(DoubleType(), "DOUBLE")
2318+
2319+
stats.update_min(-273.15) # Absolute zero in Celsius
2320+
stats.update_max(1000000.0)
2321+
2322+
min_bytes = stats.min_as_bytes()
2323+
max_bytes = stats.max_as_bytes()
2324+
2325+
# DOUBLE is stored as 8 bytes IEEE 754
2326+
assert min_bytes == struct.pack("<d", -273.15)
2327+
assert max_bytes == struct.pack("<d", 1000000.0)
2328+
2329+
2330+
def test_stats_aggregator_binary_type() -> None:
2331+
"""Test that StatsAggregator handles BinaryType min/max statistics."""
2332+
stats = StatsAggregator(BinaryType(), "BYTE_ARRAY")
2333+
2334+
stats.update_min(b"\x00\x01\x02")
2335+
stats.update_max(b"\xff\xfe\xfd")
2336+
2337+
min_bytes = stats.min_as_bytes()
2338+
max_bytes = stats.max_as_bytes()
2339+
2340+
assert min_bytes == b"\x00\x01\x02"
2341+
assert max_bytes == b"\xff\xfe\xfd"
2342+
2343+
22352344
def test_bin_pack_arrow_table(arrow_table_with_null: pa.Table) -> None:
22362345
# default packs to 1 bin since the table is small
22372346
bin_packed = bin_pack_arrow_table(

tests/test_conversions.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -603,3 +603,15 @@ def test_json_single_serialization(primitive_type: PrimitiveType, value: Any, ex
603603
)
604604
def test_json_serialize_roundtrip(primitive_type: PrimitiveType, value: Any) -> None:
605605
assert value == conversions.from_json(primitive_type, conversions.to_json(primitive_type, value))
606+
607+
608+
def test_string_type_to_bytes_with_str() -> None:
609+
"""Test that to_bytes works with str values for StringType."""
610+
result = conversions.to_bytes(StringType(), "hello")
611+
assert result == b"hello"
612+
613+
614+
def test_string_type_to_bytes_with_unicode() -> None:
615+
"""Test that to_bytes works with unicode str values for StringType."""
616+
result = conversions.to_bytes(StringType(), "héllo wörld")
617+
assert result == "héllo wörld".encode()

0 commit comments

Comments (0)