@@ -35,51 +35,27 @@ def format_model(request: pytest.FixtureRequest) -> FileFormatModel:
3535 return FileFormatFactory .get (request .param )
3636
3737
38- @pytest .fixture
39- def simple_table () -> pa .Table :
40- return pa .table (
41- {
42- "foo" : ["a" , "b" , "c" ],
43- "bar" : pa .array ([1 , 2 , 3 ], type = pa .int32 ()),
44- "baz" : [True , False , True ],
45- }
46- )
47-
48-
4938def test_parquet_registered () -> None :
5039 """ParquetFormatModel is registered in the factory."""
5140 model = FileFormatFactory .get (FileFormat .PARQUET )
5241 assert model .format == FileFormat .PARQUET
5342 assert model .file_extension () == "parquet"
5443
5544
56- def test_round_trip (format_model : FileFormatModel , table_schema_simple : Schema , simple_table : pa .Table , tmp_path : Path ) -> None :
45+ def test_round_trip (
46+ format_model : FileFormatModel , table_schema_simple : Schema , arrow_table_simple : pa .Table , tmp_path : Path
47+ ) -> None :
5748 """Write a table and read it back, to verify equality and record count."""
5849 file_path = str (tmp_path / f"test.{ format_model .file_extension ()} " )
5950 writer = format_model .create_writer (PyArrowFileIO ().new_output (file_path ), table_schema_simple , {})
60- writer .write (simple_table )
51+ writer .write (arrow_table_simple )
6152 statistics = writer .close ()
6253
6354 result = ds .dataset (file_path ).to_table ()
64- assert result .equals (simple_table )
55+ assert result .equals (arrow_table_simple )
6556 assert statistics .record_count == 3
6657
6758
68- def test_statistics_record_count (format_model : FileFormatModel , table_schema_simple : Schema , tmp_path : Path ) -> None :
69- """close() returns DataFileStatistics with correct record count."""
70- table = pa .table (
71- {
72- "foo" : ["a" , "b" , "c" , "d" , "e" ],
73- "bar" : pa .array ([10 , 20 , 30 , 40 , 50 ], type = pa .int32 ()),
74- "baz" : [True ] * 5 ,
75- }
76- )
77- file_path = str (tmp_path / f"test.{ format_model .file_extension ()} " )
78- writer = format_model .create_writer (PyArrowFileIO ().new_output (file_path ), table_schema_simple , {})
79- writer .write (table )
80- assert writer .close ().record_count == 5
81-
82-
8359def test_null_handling (format_model : FileFormatModel , table_schema_simple : Schema , tmp_path : Path ) -> None :
8460 """Nullable columns produce correct null_value_counts in statistics."""
8561 table = pa .table (
@@ -98,23 +74,23 @@ def test_null_handling(format_model: FileFormatModel, table_schema_simple: Schem
9874
9975
10076def test_context_manager_caches_result (
101- format_model : FileFormatModel , table_schema_simple : Schema , simple_table : pa .Table , tmp_path : Path
77+ format_model : FileFormatModel , table_schema_simple : Schema , arrow_table_simple : pa .Table , tmp_path : Path
10278) -> None :
10379 """writer.result() returns cached statistics after context manager exit."""
10480 file_path = str (tmp_path / f"test.{ format_model .file_extension ()} " )
10581 writer = format_model .create_writer (PyArrowFileIO ().new_output (file_path ), table_schema_simple , {})
10682 with writer :
107- writer .write (simple_table )
83+ writer .write (arrow_table_simple )
10884 assert writer .result ().record_count == 3
10985
11086
11187def test_close_is_idempotent (
112- format_model : FileFormatModel , table_schema_simple : Schema , simple_table : pa .Table , tmp_path : Path
88+ format_model : FileFormatModel , table_schema_simple : Schema , arrow_table_simple : pa .Table , tmp_path : Path
11389) -> None :
11490 """Calling close() twice returns the same cached statistics object."""
11591 file_path = str (tmp_path / f"test.{ format_model .file_extension ()} " )
11692 writer = format_model .create_writer (PyArrowFileIO ().new_output (file_path ), table_schema_simple , {})
117- writer .write (simple_table )
93+ writer .write (arrow_table_simple )
11894 stats1 = writer .close ()
11995 stats2 = writer .close ()
12096 assert stats1 is stats2
@@ -128,28 +104,16 @@ def test_close_without_write_raises(format_model: FileFormatModel, table_schema_
128104 writer .close ()
129105
130106
131- def test_construct_field_uses_orc_field_id_key () -> None :
132- """ArrowProjectionVisitor uses ORC field ID and required keys when file_format is ORC."""
133- from pyiceberg .io .pyarrow import (
134- ORC_FIELD_ID_KEY ,
135- ORC_FIELD_REQUIRED_KEY ,
136- PYARROW_PARQUET_FIELD_ID_KEY ,
137- ArrowProjectionVisitor ,
138- )
107+ def test_parquet_format_model_adds_field_id_metadata () -> None :
108+ """ParquetFormatModel.add_field_metadata writes the Parquet field-id key when requested."""
109+ from pyiceberg .io .pyarrow import PYARROW_PARQUET_FIELD_ID_KEY , ParquetFormatModel
110+
111+ field = NestedField (field_id = 1 , name = "x" , field_type = LongType (), required = True )
112+
113+ metadata : dict [bytes , bytes ] = {}
114+ ParquetFormatModel ().add_field_metadata (field , metadata , include_field_ids = True )
115+ assert metadata == {PYARROW_PARQUET_FIELD_ID_KEY : b"1" }
139116
140- schema = Schema (NestedField (field_id = 1 , name = "x" , field_type = LongType (), required = True ))
141-
142- visitor = ArrowProjectionVisitor (schema , include_field_ids = True , file_format = FileFormat .ORC )
143- field = visitor ._construct_field (schema .find_field (1 ), pa .int64 ())
144- assert field .metadata is not None
145- assert ORC_FIELD_ID_KEY in field .metadata
146- assert ORC_FIELD_REQUIRED_KEY in field .metadata
147- assert field .metadata [ORC_FIELD_REQUIRED_KEY ] == b"true"
148- assert PYARROW_PARQUET_FIELD_ID_KEY not in field .metadata
149-
150- visitor_pq = ArrowProjectionVisitor (schema , include_field_ids = True , file_format = FileFormat .PARQUET )
151- field_pq = visitor_pq ._construct_field (schema .find_field (1 ), pa .int64 ())
152- assert field_pq .metadata is not None
153- assert PYARROW_PARQUET_FIELD_ID_KEY in field_pq .metadata
154- assert ORC_FIELD_ID_KEY not in field_pq .metadata
155- assert ORC_FIELD_REQUIRED_KEY not in field_pq .metadata
117+ metadata_no_ids : dict [bytes , bytes ] = {}
118+ ParquetFormatModel ().add_field_metadata (field , metadata_no_ids , include_field_ids = False )
119+ assert metadata_no_ids == {}
0 commit comments