@@ -1019,22 +1019,36 @@ def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], start
10191019
10201020
10211021def pyarrow_to_schema (
1022- schema : pa .Schema , name_mapping : Optional [NameMapping ] = None , downcast_ns_timestamp_to_us : bool = False , format_version : TableVersion = TableProperties .DEFAULT_FORMAT_VERSION
1022+ schema : pa .Schema ,
1023+ name_mapping : Optional [NameMapping ] = None ,
1024+ downcast_ns_timestamp_to_us : bool = False ,
1025+ format_version : TableVersion = TableProperties .DEFAULT_FORMAT_VERSION ,
10231026) -> Schema :
10241027 has_ids = visit_pyarrow (schema , _HasIds ())
10251028 if has_ids :
1026- return visit_pyarrow (schema , _ConvertToIceberg (downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us , format_version = format_version ))
1029+ return visit_pyarrow (
1030+ schema , _ConvertToIceberg (downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us , format_version = format_version )
1031+ )
10271032 elif name_mapping is not None :
1028- schema_without_ids = _pyarrow_to_schema_without_ids (schema , downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us , format_version = format_version )
1033+ schema_without_ids = _pyarrow_to_schema_without_ids (
1034+ schema , downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us , format_version = format_version
1035+ )
10291036 return apply_name_mapping (schema_without_ids , name_mapping )
10301037 else :
10311038 raise ValueError (
10321039 "Parquet file does not have field-ids and the Iceberg table does not have 'schema.name-mapping.default' defined"
10331040 )
10341041
10351042
1036- def _pyarrow_to_schema_without_ids (schema : pa .Schema , downcast_ns_timestamp_to_us : bool = False , format_version : TableVersion = TableProperties .DEFAULT_FORMAT_VERSION ) -> Schema :
1037- return visit_pyarrow (schema , _ConvertToIcebergWithoutIDs (downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us , format_version = format_version ))
1043+ def _pyarrow_to_schema_without_ids (
1044+ schema : pa .Schema ,
1045+ downcast_ns_timestamp_to_us : bool = False ,
1046+ format_version : TableVersion = TableProperties .DEFAULT_FORMAT_VERSION ,
1047+ ) -> Schema :
1048+ return visit_pyarrow (
1049+ schema ,
1050+ _ConvertToIcebergWithoutIDs (downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us , format_version = format_version ),
1051+ )
10381052
10391053
10401054def _pyarrow_schema_ensure_large_types (schema : pa .Schema ) -> pa .Schema :
@@ -1113,7 +1127,7 @@ def _(obj: pa.Field, visitor: PyArrowSchemaVisitor[T]) -> T:
11131127 visitor .before_field (obj )
11141128 try :
11151129 if obj .name == "timestamp_ns" :
1116- print (' alexstephen' )
1130+ print (" alexstephen" )
11171131 result = visit_pyarrow (field_type , visitor )
11181132 except TypeError as e :
11191133 raise UnsupportedPyArrowTypeException (obj , f"Column '{ obj .name } ' has an unsupported type: { field_type } " ) from e
@@ -1218,7 +1232,9 @@ class _ConvertToIceberg(PyArrowSchemaVisitor[Union[IcebergType, Schema]]):
12181232
12191233 _field_names : List [str ]
12201234
1221- def __init__ (self , downcast_ns_timestamp_to_us : bool = False , format_version : TableVersion = TableProperties .DEFAULT_FORMAT_VERSION ) -> None : # noqa: F821
1235+ def __init__ (
1236+ self , downcast_ns_timestamp_to_us : bool = False , format_version : TableVersion = TableProperties .DEFAULT_FORMAT_VERSION
1237+ ) -> None : # noqa: F821
12221238 self ._field_names = []
12231239 self ._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
12241240 self ._format_version = format_version
@@ -2549,8 +2565,10 @@ def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[List[
25492565
25502566
25512567def _check_pyarrow_schema_compatible (
2552- requested_schema : Schema , provided_schema : pa .Schema , downcast_ns_timestamp_to_us : bool = False ,
2553- format_version : TableVersion = TableProperties .DEFAULT_FORMAT_VERSION
2568+ requested_schema : Schema ,
2569+ provided_schema : pa .Schema ,
2570+ downcast_ns_timestamp_to_us : bool = False ,
2571+ format_version : TableVersion = TableProperties .DEFAULT_FORMAT_VERSION ,
25542572) -> None :
25552573 """
25562574 Check if the `requested_schema` is compatible with `provided_schema`.
@@ -2563,10 +2581,15 @@ def _check_pyarrow_schema_compatible(
25632581 name_mapping = requested_schema .name_mapping
25642582 try :
25652583 provided_schema = pyarrow_to_schema (
2566- provided_schema , name_mapping = name_mapping , downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us , format_version = format_version
2584+ provided_schema ,
2585+ name_mapping = name_mapping ,
2586+ downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us ,
2587+ format_version = format_version ,
25672588 )
25682589 except ValueError as e :
2569- provided_schema = _pyarrow_to_schema_without_ids (provided_schema , downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us , format_version = format_version )
2590+ provided_schema = _pyarrow_to_schema_without_ids (
2591+ provided_schema , downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us , format_version = format_version
2592+ )
25702593 additional_names = set (provided_schema ._name_to_id .keys ()) - set (requested_schema ._name_to_id .keys ())
25712594 raise ValueError (
25722595 f"PyArrow table contains more columns: { ', ' .join (sorted (additional_names ))} . Update the schema first (hint, use union_by_name)."
@@ -2683,7 +2706,12 @@ def _dataframe_to_data_files(
26832706 )
26842707 name_mapping = table_metadata .schema ().name_mapping
26852708 downcast_ns_timestamp_to_us = Config ().get_bool (DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE ) or False
2686- task_schema = pyarrow_to_schema (df .schema , name_mapping = name_mapping , downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us , format_version = table_metadata .format_version )
2709+ task_schema = pyarrow_to_schema (
2710+ df .schema ,
2711+ name_mapping = name_mapping ,
2712+ downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us ,
2713+ format_version = table_metadata .format_version ,
2714+ )
26872715
26882716 if table_metadata .spec ().is_unpartitioned ():
26892717 yield from write_file (
0 commit comments