Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions pyiceberg/io/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,7 +636,13 @@ def visit_fixed(self, fixed_type: FixedType) -> pa.DataType:
return pa.binary(len(fixed_type))

def visit_decimal(self, decimal_type: DecimalType) -> pa.DataType:
    """Map an Iceberg decimal to the narrowest Arrow decimal type that can hold it.

    Mirrors the Parquet physical-type mapping for decimals: precision <= 9
    fits in an int32-backed decimal32, precision <= 18 in an int64-backed
    decimal64, and anything wider falls back to decimal128.

    Args:
        decimal_type: The Iceberg decimal type carrying precision and scale.

    Returns:
        The smallest matching pyarrow decimal DataType.
    """
    # Guard returns instead of a nested conditional expression: same tiering,
    # easier to read and extend.
    if decimal_type.precision <= 9:
        return pa.decimal32(decimal_type.precision, decimal_type.scale)
    if decimal_type.precision <= 18:
        return pa.decimal64(decimal_type.precision, decimal_type.scale)
    return pa.decimal128(decimal_type.precision, decimal_type.scale)

def visit_boolean(self, _: BooleanType) -> pa.DataType:
    """Map an Iceberg boolean type to Arrow's bool_ type; the field itself is unused."""
    return pa.bool_()
Expand Down Expand Up @@ -2442,7 +2448,9 @@ def write_parquet(task: WriteTask) -> DataFile:
)
fo = io.new_output(file_path)
with fo.create(overwrite=True) as fos:
with pq.ParquetWriter(fos, schema=arrow_table.schema, **parquet_writer_kwargs) as writer:
with pq.ParquetWriter(
fos, schema=arrow_table.schema, store_decimal_as_integer=True, **parquet_writer_kwargs
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"""
By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. When enabled, the writer will use the following physical types to store decimals:

  • int32: for 1 <= precision <= 9.
  • int64: for 10 <= precision <= 18.
  • fixed_len_byte_array: for precision > 18.
    """

from https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html
Screenshot 2025-05-09 at 8 35 41 AM

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this matches the parquet data type mapping for decimal
https://iceberg.apache.org/spec/#parquet

Screenshot 2025-05-09 at 8 36 40 AM

) as writer:
writer.write(arrow_table, row_group_size=row_group_size)
statistics = data_file_statistics_from_parquet_metadata(
parquet_metadata=writer.writer.metadata,
Expand Down
31 changes: 31 additions & 0 deletions tests/integration/test_writes/test_writes.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import random
import time
from datetime import date, datetime, timedelta
from decimal import Decimal
from pathlib import Path
from typing import Any, Dict
from urllib.parse import urlparse
Expand Down Expand Up @@ -50,6 +51,7 @@
from pyiceberg.transforms import DayTransform, HourTransform, IdentityTransform
from pyiceberg.types import (
DateType,
DecimalType,
DoubleType,
IntegerType,
ListType,
Expand Down Expand Up @@ -1810,3 +1812,32 @@ def test_evolve_and_write(
)

assert session_catalog.load_table(identifier).scan().to_arrow().column(0).combine_chunks() == numbers


@pytest.mark.integration
def test_read_write_decimals(session_catalog: Catalog) -> None:
    """Roundtrip decimal types to make sure that we correctly write them as ints"""
    identifier = "default.test_read_write_decimals"

    # One entry per precision tier: (values, arrow type) keyed by column name.
    decimal_columns = {
        "decimal8": ([Decimal("123.45"), Decimal("678.91")], pa.decimal128(8, 2)),
        "decimal16": ([Decimal("12345679.123456"), Decimal("67891234.678912")], pa.decimal128(16, 6)),
        "decimal19": ([Decimal("1234567890123.123456"), Decimal("9876543210703.654321")], pa.decimal128(19, 6)),
    }
    arrow_table = pa.Table.from_pydict(
        {name: pa.array(values, dtype) for name, (values, dtype) in decimal_columns.items()}
    )

    expected_schema = Schema(
        NestedField(1, "decimal8", DecimalType(8, 2)),
        NestedField(2, "decimal16", DecimalType(16, 6)),
        NestedField(3, "decimal19", DecimalType(19, 6)),
    )
    tbl = _create_table(
        session_catalog,
        identifier,
        properties={"format-version": 2},
        schema=expected_schema,
    )

    tbl.append(arrow_table)

    # Reading the data back must yield exactly what was written.
    assert tbl.scan().to_arrow() == arrow_table
Loading