146146 visit ,
147147 visit_with_partner ,
148148)
149+ from pyiceberg .table import TableProperties
149150from pyiceberg .table .locations import load_location_provider
150151from pyiceberg .table .metadata import TableMetadata
151152from pyiceberg .table .name_mapping import NameMapping , apply_name_mapping
152153from pyiceberg .table .puffin import PuffinFile
153154from pyiceberg .transforms import IdentityTransform , TruncateTransform
154- from pyiceberg .typedef import EMPTY_DICT , Properties , Record
155+ from pyiceberg .typedef import EMPTY_DICT , Properties , Record , TableVersion
155156from pyiceberg .types import (
156157 BinaryType ,
157158 BooleanType ,
@@ -1018,22 +1019,22 @@ def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], start
10181019
10191020
def pyarrow_to_schema(
    schema: pa.Schema,
    name_mapping: Optional[NameMapping] = None,
    downcast_ns_timestamp_to_us: bool = False,
    format_version: TableVersion = TableProperties.DEFAULT_FORMAT_VERSION,
) -> Schema:
    """Convert a PyArrow schema into an Iceberg Schema.

    If the Arrow schema already carries Iceberg field IDs, it is converted
    directly. Otherwise a name mapping is required so IDs can be applied to
    the ID-less conversion.

    Args:
        schema: The PyArrow schema to convert.
        name_mapping: Mapping used to assign field IDs when the Arrow schema
            does not carry them.
        downcast_ns_timestamp_to_us: When True, 'ns' timestamps are downcast
            to 'us' instead of raising.
        format_version: Target Iceberg table format version; version 3 allows
            nanosecond timestamp types to be preserved.

    Returns:
        The equivalent Iceberg Schema.

    Raises:
        ValueError: If the schema has no field IDs and no name mapping is given.
    """
    has_ids = visit_pyarrow(schema, _HasIds())
    if has_ids:
        return visit_pyarrow(
            schema,
            _ConvertToIceberg(downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us, format_version=format_version),
        )
    if name_mapping is not None:
        schema_without_ids = _pyarrow_to_schema_without_ids(
            schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us, format_version=format_version
        )
        return apply_name_mapping(schema_without_ids, name_mapping)
    raise ValueError(
        "Parquet file does not have field-ids and the Iceberg table does not have 'schema.name-mapping.default' defined"
    )
10331034
10341035
def _pyarrow_to_schema_without_ids(
    schema: pa.Schema,
    downcast_ns_timestamp_to_us: bool = False,
    format_version: TableVersion = TableProperties.DEFAULT_FORMAT_VERSION,
) -> Schema:
    """Convert a PyArrow schema to an Iceberg Schema without assigning field IDs.

    Callers (e.g. ``pyarrow_to_schema``) are expected to apply a name mapping
    afterwards to obtain field IDs.

    Args:
        schema: The PyArrow schema to convert.
        downcast_ns_timestamp_to_us: When True, 'ns' timestamps are downcast to 'us'.
        format_version: Target Iceberg table format version; version 3 allows
            nanosecond timestamp types to be preserved.
    """
    return visit_pyarrow(
        schema,
        _ConvertToIcebergWithoutIDs(
            downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us, format_version=format_version
        ),
    )
10371038
10381039
10391040def _pyarrow_schema_ensure_large_types (schema : pa .Schema ) -> pa .Schema :
@@ -1111,6 +1112,8 @@ def _(obj: pa.Field, visitor: PyArrowSchemaVisitor[T]) -> T:
11111112
 11121113 visitor .before_field (obj )
 11131114 try :
# FIXME(review): lines below are leftover debug code (a hard-coded column-name
# check that prints a developer's name). It must be removed before merge — it
# spams stdout on every visited field named "timestamp_ns" and serves no
# functional purpose in the conversion.
 1115+ if obj .name == "timestamp_ns" :
 1116+ print ('alexstephen' )
 11141117 result = visit_pyarrow (field_type , visitor )
 11151118 except TypeError as e :
 11161119 raise UnsupportedPyArrowTypeException (obj , f"Column '{ obj .name } ' has an unsupported type: { field_type } " ) from e
@@ -1215,9 +1218,10 @@ class _ConvertToIceberg(PyArrowSchemaVisitor[Union[IcebergType, Schema]]):
12151218
12161219 _field_names : List [str ]
12171220
1218- def __init__ (self , downcast_ns_timestamp_to_us : bool = False ) -> None :
1221+ def __init__ (self , downcast_ns_timestamp_to_us : bool = False , format_version : TableVersion = TableProperties . DEFAULT_FORMAT_VERSION ) -> None : # noqa: F821
12191222 self ._field_names = []
12201223 self ._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
1224+ self ._format_version = format_version
12211225
12221226 def _field_id (self , field : pa .Field ) -> int :
12231227 if (field_id := _get_field_id (field )) is not None :
@@ -1288,6 +1292,11 @@ def primitive(self, primitive: pa.DataType) -> PrimitiveType:
12881292 elif primitive .unit == "ns" :
12891293 if self ._downcast_ns_timestamp_to_us :
12901294 logger .warning ("Iceberg does not yet support 'ns' timestamp precision. Downcasting to 'us'." )
1295+ elif self ._format_version == 3 :
1296+ if primitive .tz in UTC_ALIASES :
1297+ return TimestamptzNanoType ()
1298+ else :
1299+ return TimestampNanoType ()
12911300 else :
12921301 raise TypeError (
12931302 "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write." ,
@@ -2540,7 +2549,8 @@ def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[List[
25402549
25412550
25422551def _check_pyarrow_schema_compatible (
2543- requested_schema : Schema , provided_schema : pa .Schema , downcast_ns_timestamp_to_us : bool = False
2552+ requested_schema : Schema , provided_schema : pa .Schema , downcast_ns_timestamp_to_us : bool = False ,
2553+ format_version : TableVersion = TableProperties .DEFAULT_FORMAT_VERSION
25442554) -> None :
25452555 """
25462556 Check if the `requested_schema` is compatible with `provided_schema`.
@@ -2553,10 +2563,10 @@ def _check_pyarrow_schema_compatible(
25532563 name_mapping = requested_schema .name_mapping
25542564 try :
25552565 provided_schema = pyarrow_to_schema (
2556- provided_schema , name_mapping = name_mapping , downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
2566+ provided_schema , name_mapping = name_mapping , downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us , format_version = format_version
25572567 )
25582568 except ValueError as e :
2559- provided_schema = _pyarrow_to_schema_without_ids (provided_schema , downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us )
2569+ provided_schema = _pyarrow_to_schema_without_ids (provided_schema , downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us , format_version = format_version )
25602570 additional_names = set (provided_schema ._name_to_id .keys ()) - set (requested_schema ._name_to_id .keys ())
25612571 raise ValueError (
25622572 f"PyArrow table contains more columns: { ', ' .join (sorted (additional_names ))} . Update the schema first (hint, use union_by_name)."
@@ -2582,7 +2592,7 @@ def parquet_file_to_data_file(io: FileIO, table_metadata: TableMetadata, file_pa
25822592 )
25832593
25842594 schema = table_metadata .schema ()
2585- _check_pyarrow_schema_compatible (schema , arrow_schema )
2595+ _check_pyarrow_schema_compatible (schema , arrow_schema , format_version = table_metadata . format_version )
25862596
25872597 statistics = data_file_statistics_from_parquet_metadata (
25882598 parquet_metadata = parquet_metadata ,
@@ -2673,7 +2683,7 @@ def _dataframe_to_data_files(
26732683 )
26742684 name_mapping = table_metadata .schema ().name_mapping
26752685 downcast_ns_timestamp_to_us = Config ().get_bool (DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE ) or False
2676- task_schema = pyarrow_to_schema (df .schema , name_mapping = name_mapping , downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us )
2686+ task_schema = pyarrow_to_schema (df .schema , name_mapping = name_mapping , downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us , format_version = table_metadata . format_version )
26772687
26782688 if table_metadata .spec ().is_unpartitioned ():
26792689 yield from write_file (
0 commit comments