Skip to content

Commit bad9cda

Browse files
authored
fix: Cast smaller integer types to int32/int64 on write for Spark compatibility (#2799)
## Summary

- Fixes #2791: writing smaller integer types (uint8, int8, int16, uint16) to Iceberg `IntegerType` columns now correctly casts to int32/int64.
- PyIceberg was preserving the original Arrow types in Parquet files, causing Spark to fail with `Unsupported logical type: UINT_8`.
- Added integer type widening logic in `ArrowProjectionVisitor._cast_if_needed()`, following the same pattern as the existing timestamp handling.
- Only widening conversions are allowed (e.g. uint8 → int32, int32 → int64); narrowing conversions continue to be rejected via `promote()`.

## Test plan

- [x] All 3041 unit tests pass
- [x] Lint passes
- [x] New parameterized test covers: uint8, int8, int16, uint16 → int32 and uint32, int32 → int64
- [x] Existing `test_projection_filter_add_column_demote` still works (narrowing rejection)
- [x] Manual verification: uint8 data written to an IntegerType column produces int32 in the Parquet file

Closes #2791

Co-authored-by: Somasundaram Sekar <somasundaramsekar.1986@gmail.com>
1 parent a26fca7 commit bad9cda

File tree

2 files changed

+41
-0
lines changed

2 files changed

+41
-0
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1910,6 +1910,15 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
19101910
elif target_type.unit == "us" and values.type.unit in {"s", "ms", "us"}:
19111911
return values.cast(target_type)
19121912
raise ValueError(f"Unsupported schema projection from {values.type} to {target_type}")
1913+
elif isinstance(field.field_type, (IntegerType, LongType)):
1914+
# Cast smaller integer types to target type for cross-platform compatibility
1915+
# Only allow widening conversions (smaller bit width to larger)
1916+
# Narrowing conversions fall through to promote() handling below
1917+
if pa.types.is_integer(values.type):
1918+
source_width = values.type.bit_width
1919+
target_width = target_type.bit_width
1920+
if source_width < target_width:
1921+
return values.cast(target_type)
19131922

19141923
if field.field_type != file_field.field_type:
19151924
target_schema = schema_to_pyarrow(

tests/io/test_pyarrow.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2752,6 +2752,38 @@ def test__to_requested_schema_timestamps_without_downcast_raises_exception(
27522752
assert "Unsupported schema projection from timestamp[ns] to timestamp[us]" in str(exc_info.value)
27532753

27542754

2755+
@pytest.mark.parametrize(
    "arrow_type,iceberg_type,expected_arrow_type",
    [
        # Widening promotions into Iceberg's 32-bit IntegerType.
        (pa.uint8(), IntegerType(), pa.int32()),
        (pa.int8(), IntegerType(), pa.int32()),
        (pa.int16(), IntegerType(), pa.int32()),
        (pa.uint16(), IntegerType(), pa.int32()),
        # Widening promotions into Iceberg's 64-bit LongType.
        (pa.uint32(), LongType(), pa.int64()),
        (pa.int32(), LongType(), pa.int64()),
    ],
)
def test__to_requested_schema_integer_promotion(
    arrow_type: pa.DataType,
    iceberg_type: PrimitiveType,
    expected_arrow_type: pa.DataType,
) -> None:
    """Test that smaller integer types are cast to target Iceberg type during write."""
    # The file schema and the requested schema agree on the Iceberg side; only
    # the physical Arrow type of the incoming batch is narrower than the target.
    requested_schema = Schema(NestedField(1, "col", iceberg_type, required=False))
    file_schema = requested_schema

    source_field = pa.field("col", arrow_type)
    source_values = pa.array([1, 2, 3, None], type=arrow_type)
    batch = pa.RecordBatch.from_arrays([source_values], schema=pa.schema([source_field]))

    result = _to_requested_schema(
        requested_schema, file_schema, batch, downcast_ns_timestamp_to_us=False, include_field_ids=False
    )

    # The projected batch must carry the widened Arrow type while preserving
    # all values, including the trailing null.
    assert result.schema[0].type == expected_arrow_type
    assert result.column(0).to_pylist() == [1, 2, 3, None]
2785+
2786+
27552787
def test_pyarrow_file_io_fs_by_scheme_cache() -> None:
27562788
# It's better to set up multi-region minio servers for an integration test once `endpoint_url` argument becomes available for `resolve_s3_region`
27572789
# Refer to: https://github.com/apache/arrow/issues/43713

0 commit comments

Comments (0)