Skip to content

Commit 345827e

Browse files
committed
push down to when parquet writes
1 parent 06dd647 commit 345827e

2 files changed

Lines changed: 3 additions & 4 deletions

File tree

pyiceberg/io/pyarrow.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1772,12 +1772,13 @@ def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteT
17721772
)
17731773

17741774
def write_parquet(task: WriteTask) -> DataFile:
1775+
df = pa.Table.from_batches(task.record_batches)
1776+
df = df.rename_columns(schema.column_names)
17751777
file_path = f'{table_metadata.location}/data/{task.generate_data_file_path("parquet")}'
17761778
fo = io.new_output(file_path)
17771779
with fo.create(overwrite=True) as fos:
17781780
with pq.ParquetWriter(fos, schema=arrow_file_schema, **parquet_writer_kwargs) as writer:
1779-
writer.write(pa.Table.from_batches(task.record_batches), row_group_size=row_group_size)
1780-
1781+
writer.write(df, row_group_size=row_group_size)
17811782
statistics = data_file_statistics_from_parquet_metadata(
17821783
parquet_metadata=writer.writer.metadata,
17831784
stats_columns=compute_statistics_plan(schema, table_metadata.properties),

pyiceberg/table/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2703,8 +2703,6 @@ def _dataframe_to_data_files(
27032703
property_name=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES,
27042704
default=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT,
27052705
)
2706-
sanitized_arrow_schema = sanitize_column_names(table_metadata.schema()).as_arrow()
2707-
df = df.rename_columns(sanitized_arrow_schema.names)
27082706

27092707
if len(table_metadata.spec().fields) > 0:
27102708
partitions = _determine_partitions(spec=table_metadata.spec(), schema=table_metadata.schema(), arrow_table=df)

0 commit comments

Comments
 (0)