Fix: Handle bytes values in string column statistics from Parquet

oamazonasgabriel · oamazonasgabriel · commit a572b1d55ac4 · 2026-02-01T09:00:54.000-03:00
Problem:
When using `add_files()` with Parquet files written by DuckDB, PyIceberg
fails with `AttributeError: 'bytes' object has no attribute 'encode'`.

Root Cause:
The Parquet format stores column statistics (min_value, max_value) as binary
data in the Statistics struct (see parquet.thrift). When PyArrow reads these
statistics from Parquet files, it may return them as Python `bytes` objects
rather than decoded `str` values. This is valid per the Parquet specification:

  struct Statistics {
    5: optional binary max_value;
    6: optional binary min_value;
  }

PyIceberg's StatsAggregator expected string statistics to always be `str`,
causing failures when processing Parquet files from writers like DuckDB that
expose this binary representation.

Fix:
1. In `StatsAggregator.min_as_bytes()`: Add handling for bytes values by
   decoding to UTF-8 string before truncation and serialization.

2. In `StatsAggregator.max_as_bytes()`: Update existing string handling to
   decode bytes values before processing (was raising ValueError).

3. In `to_bytes()` for StringType: Add defensive isinstance check to handle
   bytes values as a safety fallback.

4. Add unit tests for both StatsAggregator bytes handling and to_bytes.
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
@@ -2183,6 +2183,11 @@ def min_as_bytes(self) -> bytes | None:
         if self.current_min is None:
             return None
 
+        # The Parquet format stores column statistics as binary (see Statistics struct
+        # in parquet.thrift). PyArrow may return these as bytes instead of str.
+        if self.primitive_type == StringType() and isinstance(self.current_min, bytes):
+            self.current_min = self.current_min.decode("utf-8")
+
         return self.serialize(
             self.current_min
             if self.trunc_length is None
@@ -2194,8 +2199,12 @@ def max_as_bytes(self) -> bytes | None:
             return None
 
         if self.primitive_type == StringType():
+            # The Parquet format stores column statistics as binary (see Statistics struct
+            # in parquet.thrift). PyArrow may return these as bytes instead of str.
+            if isinstance(self.current_max, bytes):
+                self.current_max = self.current_max.decode("utf-8")
             if not isinstance(self.current_max, str):
-                raise ValueError("Expected the current_max to be a string")
+                raise ValueError("Expected the current_max to be a string or bytes")
             s_result = truncate_upper_bound_text_string(self.current_max, self.trunc_length)
             return self.serialize(s_result) if s_result is not None else None
         elif self.primitive_type == BinaryType():
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
@@ -2232,6 +2232,48 @@ def test_stats_aggregator_physical_type_does_not_match_expected_raise_error(
         StatsAggregator(iceberg_type, physical_type_string)
 
 
+def test_stats_aggregator_string_type_with_bytes_value() -> None:
+    """Test that StatsAggregator handles bytes values for StringType.
+
+    Some Parquet writers (e.g., DuckDB) provide string statistics as bytes
+    instead of str. This test ensures we handle both formats correctly.
+    """
+    stats = StatsAggregator(StringType(), "BYTE_ARRAY")
+
+    # Update with bytes values (as DuckDB might provide)
+    stats.update_min(b"apple")
+    stats.update_max(b"zebra")
+
+    assert stats.current_min == b"apple"
+    assert stats.current_max == b"zebra"
+
+    # Verify serialization works with bytes values
+    min_bytes = stats.min_as_bytes()
+    max_bytes = stats.max_as_bytes()
+
+    assert min_bytes == b"apple"
+    assert max_bytes == b"zebra"
+
+
+def test_stats_aggregator_string_type_with_mixed_str_and_bytes() -> None:
+    """Test that StatsAggregator handles mixed str and bytes for StringType."""
+    stats = StatsAggregator(StringType(), "BYTE_ARRAY")
+
+    # Update with string values (normal case)
+    stats.update_min("apple")
+    stats.update_max("zebra")
+
+    assert stats.current_min == "apple"
+    assert stats.current_max == "zebra"
+
+    # Verify serialization works with string values
+    min_bytes = stats.min_as_bytes()
+    max_bytes = stats.max_as_bytes()
+
+    assert min_bytes == b"apple"
+    assert max_bytes == b"zebra"
+
+
 def test_bin_pack_arrow_table(arrow_table_with_null: pa.Table) -> None:
     # default packs to 1 bin since the table is small
     bin_packed = bin_pack_arrow_table(
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
@@ -603,3 +603,15 @@ def test_json_single_serialization(primitive_type: PrimitiveType, value: Any, ex
 )
 def test_json_serialize_roundtrip(primitive_type: PrimitiveType, value: Any) -> None:
     assert value == conversions.from_json(primitive_type, conversions.to_json(primitive_type, value))
+
+
+def test_string_type_to_bytes_with_str() -> None:
+    """Test that to_bytes works with str values for StringType."""
+    result = conversions.to_bytes(StringType(), "hello")
+    assert result == b"hello"
+
+
+def test_string_type_to_bytes_with_unicode() -> None:
+    """Test that to_bytes works with unicode str values for StringType."""
+    result = conversions.to_bytes(StringType(), "héllo wörld")
+    assert result == "héllo wörld".encode("utf-8")