Skip to content

Commit bc6bea1

Browse files
HonahXFokko
authored andcommitted
Read: fetch file_schema directly from pyarrow_to_schema (#597)
1 parent 20093c3 commit bc6bea1

3 files changed

Lines changed: 8 additions & 12 deletions

File tree

pyiceberg/io/pyarrow.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,6 @@
120120
pre_order_visit,
121121
promote,
122122
prune_columns,
123-
sanitize_column_names,
124123
visit,
125124
visit_with_partner,
126125
)
@@ -950,20 +949,15 @@ def _task_to_table(
950949
with fs.open_input_file(path) as fin:
951950
fragment = arrow_format.make_fragment(fin)
952951
physical_schema = fragment.physical_schema
953-
schema_raw = None
954-
if metadata := physical_schema.metadata:
955-
schema_raw = metadata.get(ICEBERG_SCHEMA)
956-
file_schema = (
957-
Schema.model_validate_json(schema_raw) if schema_raw is not None else pyarrow_to_schema(physical_schema, name_mapping)
958-
)
952+
file_schema = pyarrow_to_schema(physical_schema, name_mapping)
959953

960954
pyarrow_filter = None
961955
if bound_row_filter is not AlwaysTrue():
962956
translated_row_filter = translate_column_names(bound_row_filter, file_schema, case_sensitive=case_sensitive)
963957
bound_file_filter = bind(file_schema, translated_row_filter, case_sensitive=case_sensitive)
964958
pyarrow_filter = expression_to_pyarrow(bound_file_filter)
965959

966-
file_project_schema = sanitize_column_names(prune_columns(file_schema, projected_field_ids, select_full_types=False))
960+
file_project_schema = prune_columns(file_schema, projected_field_ids, select_full_types=False)
967961

968962
if file_schema is None:
969963
raise ValueError(f"Missing Iceberg schema in Metadata for file: {path}")

tests/conftest.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1913,9 +1913,11 @@ def data_file(table_schema_simple: Schema, tmp_path: str) -> str:
19131913
import pyarrow as pa
19141914
from pyarrow import parquet as pq
19151915

1916+
from pyiceberg.io.pyarrow import schema_to_pyarrow
1917+
19161918
table = pa.table(
19171919
{"foo": ["a", "b", "c"], "bar": [1, 2, 3], "baz": [True, False, None]},
1918-
metadata={"iceberg.schema": table_schema_simple.model_dump_json()},
1920+
schema=schema_to_pyarrow(table_schema_simple),
19191921
)
19201922

19211923
file_path = f"{tmp_path}/0000-data.parquet"

tests/io/test_pyarrow.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1383,7 +1383,7 @@ def test_delete(deletes_file: str, example_task: FileScanTask, table_schema_simp
13831383
str(with_deletes)
13841384
== """pyarrow.Table
13851385
foo: string
1386-
bar: int64 not null
1386+
bar: int32 not null
13871387
baz: bool
13881388
----
13891389
foo: [["a","c"]]
@@ -1426,7 +1426,7 @@ def test_delete_duplicates(deletes_file: str, example_task: FileScanTask, table_
14261426
str(with_deletes)
14271427
== """pyarrow.Table
14281428
foo: string
1429-
bar: int64 not null
1429+
bar: int32 not null
14301430
baz: bool
14311431
----
14321432
foo: [["a","c"]]
@@ -1462,7 +1462,7 @@ def test_pyarrow_wrap_fsspec(example_task: FileScanTask, table_schema_simple: Sc
14621462
str(projection)
14631463
== """pyarrow.Table
14641464
foo: string
1465-
bar: int64 not null
1465+
bar: int32 not null
14661466
baz: bool
14671467
----
14681468
foo: [["a","b","c"]]

0 commit comments

Comments
 (0)