Skip to content

Commit a572b1d

Browse files
Fix: Handle bytes values in string column statistics from Parquet
Problem: When using `add_files()` with Parquet files written by DuckDB, PyIceberg fails with `AttributeError: 'bytes' object has no attribute 'encode'`. Root Cause: The Parquet format stores column statistics (min_value, max_value) as binary data in the Statistics struct (see parquet.thrift). When PyArrow reads these statistics from Parquet files, it may return them as Python `bytes` objects rather than decoded `str` values. This is valid per the Parquet specification: struct Statistics { 5: optional binary max_value; 6: optional binary min_value; } PyIceberg's StatsAggregator expected string statistics to always be `str`, causing failures when processing Parquet files from writers like DuckDB that expose this binary representation. Fix: 1. In `StatsAggregator.min_as_bytes()`: Add handling for bytes values by decoding to UTF-8 string before truncation and serialization. 2. In `StatsAggregator.max_as_bytes()`: Update existing string handling to decode bytes values before processing (was raising ValueError). 3. In `to_bytes()` for StringType: Add defensive isinstance check to handle bytes values as a safety fallback. 4. Add unit tests for both StatsAggregator bytes handling and to_bytes.
1 parent c0e7c6d commit a572b1d

File tree

3 files changed

+64
-1
lines changed

3 files changed

+64
-1
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2183,6 +2183,11 @@ def min_as_bytes(self) -> bytes | None:
21832183
if self.current_min is None:
21842184
return None
21852185

2186+
# The Parquet format stores column statistics as binary (see Statistics struct
2187+
# in parquet.thrift). PyArrow may return these as bytes instead of str.
2188+
if self.primitive_type == StringType() and isinstance(self.current_min, bytes):
2189+
self.current_min = self.current_min.decode("utf-8")
2190+
21862191
return self.serialize(
21872192
self.current_min
21882193
if self.trunc_length is None
@@ -2194,8 +2199,12 @@ def max_as_bytes(self) -> bytes | None:
21942199
return None
21952200

21962201
if self.primitive_type == StringType():
2202+
# The Parquet format stores column statistics as binary (see Statistics struct
2203+
# in parquet.thrift). PyArrow may return these as bytes instead of str.
2204+
if isinstance(self.current_max, bytes):
2205+
self.current_max = self.current_max.decode("utf-8")
21972206
if not isinstance(self.current_max, str):
2198-
raise ValueError("Expected the current_max to be a string")
2207+
raise ValueError("Expected the current_max to be a string or bytes")
21992208
s_result = truncate_upper_bound_text_string(self.current_max, self.trunc_length)
22002209
return self.serialize(s_result) if s_result is not None else None
22012210
elif self.primitive_type == BinaryType():

tests/io/test_pyarrow.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2232,6 +2232,48 @@ def test_stats_aggregator_physical_type_does_not_match_expected_raise_error(
22322232
StatsAggregator(iceberg_type, physical_type_string)
22332233

22342234

2235+
def test_stats_aggregator_string_type_with_bytes_value() -> None:
2236+
"""Test that StatsAggregator handles bytes values for StringType.
2237+
2238+
Some Parquet writers (e.g., DuckDB) provide string statistics as bytes
2239+
instead of str. This test ensures we handle both formats correctly.
2240+
"""
2241+
stats = StatsAggregator(StringType(), "BYTE_ARRAY")
2242+
2243+
# Update with bytes values (as DuckDB might provide)
2244+
stats.update_min(b"apple")
2245+
stats.update_max(b"zebra")
2246+
2247+
assert stats.current_min == b"apple"
2248+
assert stats.current_max == b"zebra"
2249+
2250+
# Verify serialization works with bytes values
2251+
min_bytes = stats.min_as_bytes()
2252+
max_bytes = stats.max_as_bytes()
2253+
2254+
assert min_bytes == b"apple"
2255+
assert max_bytes == b"zebra"
2256+
2257+
2258+
def test_stats_aggregator_string_type_with_mixed_str_and_bytes() -> None:
2259+
"""Test that StatsAggregator handles mixed str and bytes for StringType."""
2260+
stats = StatsAggregator(StringType(), "BYTE_ARRAY")
2261+
2262+
# Update with string values (normal case)
2263+
stats.update_min("apple")
2264+
stats.update_max("zebra")
2265+
2266+
assert stats.current_min == "apple"
2267+
assert stats.current_max == "zebra"
2268+
2269+
# Verify serialization works with string values
2270+
min_bytes = stats.min_as_bytes()
2271+
max_bytes = stats.max_as_bytes()
2272+
2273+
assert min_bytes == b"apple"
2274+
assert max_bytes == b"zebra"
2275+
2276+
22352277
def test_bin_pack_arrow_table(arrow_table_with_null: pa.Table) -> None:
22362278
# default packs to 1 bin since the table is small
22372279
bin_packed = bin_pack_arrow_table(

tests/test_conversions.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -603,3 +603,15 @@ def test_json_single_serialization(primitive_type: PrimitiveType, value: Any, ex
603603
)
604604
def test_json_serialize_roundtrip(primitive_type: PrimitiveType, value: Any) -> None:
605605
assert value == conversions.from_json(primitive_type, conversions.to_json(primitive_type, value))
606+
607+
608+
def test_string_type_to_bytes_with_str() -> None:
609+
"""Test that to_bytes works with str values for StringType."""
610+
result = conversions.to_bytes(StringType(), "hello")
611+
assert result == b"hello"
612+
613+
614+
def test_string_type_to_bytes_with_unicode() -> None:
615+
"""Test that to_bytes works with unicode str values for StringType."""
616+
result = conversions.to_bytes(StringType(), "héllo wörld")
617+
assert result == "héllo wörld".encode("utf-8")

0 commit comments

Comments
 (0)