Skip to content

Commit 4593208

Browse files
authored
Arrow: Set field-id with prefix (#227)
'PARQUET:' prefix is specific to Parquet, with 'PARQUET:field_id' setting the 'field_id'. Removed the non-prefixed alternative for 'field_id'. Removed the prefixed alternative for 'doc'.
1 parent 5085d28 commit 4593208

File tree

3 files changed

+45
-50
lines changed

3 files changed

+45
-50
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -156,10 +156,9 @@
156156
ONE_MEGABYTE = 1024 * 1024
157157
BUFFER_SIZE = "buffer-size"
158158
ICEBERG_SCHEMA = b"iceberg.schema"
159-
FIELD_ID = "field_id"
160-
DOC = "doc"
161-
PYARROW_FIELD_ID_KEYS = [b"PARQUET:field_id", b"field_id"]
162-
PYARROW_FIELD_DOC_KEYS = [b"PARQUET:field_doc", b"field_doc", b"doc"]
159+
# The PARQUET: in front means that it is Parquet specific, in this case the field_id
160+
PYARROW_PARQUET_FIELD_ID_KEY = b"PARQUET:field_id"
161+
PYARROW_FIELD_DOC_KEY = b"doc"
163162

164163
T = TypeVar("T")
165164

@@ -461,7 +460,9 @@ def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field:
461460
name=field.name,
462461
type=field_result,
463462
nullable=field.optional,
464-
metadata={DOC: field.doc, FIELD_ID: str(field.field_id)} if field.doc else {FIELD_ID: str(field.field_id)},
463+
metadata={PYARROW_FIELD_DOC_KEY: field.doc, PYARROW_PARQUET_FIELD_ID_KEY: str(field.field_id)}
464+
if field.doc
465+
else {PYARROW_PARQUET_FIELD_ID_KEY: str(field.field_id)},
465466
)
466467

467468
def list(self, list_type: ListType, element_result: pa.DataType) -> pa.DataType:
@@ -725,25 +726,19 @@ def primitive(self, primitive: pa.DataType) -> Optional[T]:
725726

726727

727728
def _get_field_id(field: pa.Field) -> Optional[int]:
728-
for pyarrow_field_id_key in PYARROW_FIELD_ID_KEYS:
729-
if field_id_str := field.metadata.get(pyarrow_field_id_key):
730-
return int(field_id_str.decode())
731-
return None
732-
733-
734-
def _get_field_doc(field: pa.Field) -> Optional[str]:
735-
for pyarrow_doc_key in PYARROW_FIELD_DOC_KEYS:
736-
if doc_str := field.metadata.get(pyarrow_doc_key):
737-
return doc_str.decode()
738-
return None
729+
return (
730+
int(field_id_str.decode())
731+
if (field.metadata and (field_id_str := field.metadata.get(PYARROW_PARQUET_FIELD_ID_KEY)))
732+
else None
733+
)
739734

740735

741736
class _ConvertToIceberg(PyArrowSchemaVisitor[Union[IcebergType, Schema]]):
742737
def _convert_fields(self, arrow_fields: Iterable[pa.Field], field_results: List[Optional[IcebergType]]) -> List[NestedField]:
743738
fields = []
744739
for i, field in enumerate(arrow_fields):
745740
field_id = _get_field_id(field)
746-
field_doc = _get_field_doc(field)
741+
field_doc = doc_str.decode() if (field.metadata and (doc_str := field.metadata.get(PYARROW_FIELD_DOC_KEY))) else None
747742
field_type = field_results[i]
748743
if field_type is not None and field_id is not None:
749744
fields.append(NestedField(field_id, field.name, field_type, required=not field.nullable, doc=field_doc))

tests/io/test_pyarrow.py

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -324,57 +324,57 @@ def test_schema_to_pyarrow_schema(table_schema_nested: Schema) -> None:
324324
actual = schema_to_pyarrow(table_schema_nested)
325325
expected = """foo: string
326326
-- field metadata --
327-
field_id: '1'
327+
PARQUET:field_id: '1'
328328
bar: int32 not null
329329
-- field metadata --
330-
field_id: '2'
330+
PARQUET:field_id: '2'
331331
baz: bool
332332
-- field metadata --
333-
field_id: '3'
333+
PARQUET:field_id: '3'
334334
qux: list<element: string not null> not null
335335
child 0, element: string not null
336336
-- field metadata --
337-
field_id: '5'
337+
PARQUET:field_id: '5'
338338
-- field metadata --
339-
field_id: '4'
339+
PARQUET:field_id: '4'
340340
quux: map<string, map<string, int32>> not null
341341
child 0, entries: struct<key: string not null, value: map<string, int32> not null> not null
342342
child 0, key: string not null
343343
-- field metadata --
344-
field_id: '7'
344+
PARQUET:field_id: '7'
345345
child 1, value: map<string, int32> not null
346346
child 0, entries: struct<key: string not null, value: int32 not null> not null
347347
child 0, key: string not null
348348
-- field metadata --
349-
field_id: '9'
349+
PARQUET:field_id: '9'
350350
child 1, value: int32 not null
351351
-- field metadata --
352-
field_id: '10'
352+
PARQUET:field_id: '10'
353353
-- field metadata --
354-
field_id: '8'
354+
PARQUET:field_id: '8'
355355
-- field metadata --
356-
field_id: '6'
356+
PARQUET:field_id: '6'
357357
location: list<element: struct<latitude: float, longitude: float> not null> not null
358358
child 0, element: struct<latitude: float, longitude: float> not null
359359
child 0, latitude: float
360360
-- field metadata --
361-
field_id: '13'
361+
PARQUET:field_id: '13'
362362
child 1, longitude: float
363363
-- field metadata --
364-
field_id: '14'
364+
PARQUET:field_id: '14'
365365
-- field metadata --
366-
field_id: '12'
366+
PARQUET:field_id: '12'
367367
-- field metadata --
368-
field_id: '11'
368+
PARQUET:field_id: '11'
369369
person: struct<name: string, age: int32 not null>
370370
child 0, name: string
371371
-- field metadata --
372-
field_id: '16'
372+
PARQUET:field_id: '16'
373373
child 1, age: int32 not null
374374
-- field metadata --
375-
field_id: '17'
375+
PARQUET:field_id: '17'
376376
-- field metadata --
377-
field_id: '15'"""
377+
PARQUET:field_id: '15'"""
378378
assert repr(actual) == expected
379379

380380

@@ -888,22 +888,22 @@ def test_projection_add_column(file_int: str) -> None:
888888
list: list<element: int32>
889889
child 0, element: int32
890890
-- field metadata --
891-
field_id: '21'
891+
PARQUET:field_id: '21'
892892
map: map<int32, string>
893893
child 0, entries: struct<key: int32 not null, value: string> not null
894894
child 0, key: int32 not null
895895
-- field metadata --
896-
field_id: '31'
896+
PARQUET:field_id: '31'
897897
child 1, value: string
898898
-- field metadata --
899-
field_id: '32'
899+
PARQUET:field_id: '32'
900900
location: struct<lat: double, lon: double>
901901
child 0, lat: double
902902
-- field metadata --
903-
field_id: '41'
903+
PARQUET:field_id: '41'
904904
child 1, lon: double
905905
-- field metadata --
906-
field_id: '42'"""
906+
PARQUET:field_id: '42'"""
907907
)
908908

909909

@@ -953,10 +953,10 @@ def test_projection_add_column_struct(schema_int: Schema, file_int: str) -> None
953953
child 0, entries: struct<key: int32 not null, value: string> not null
954954
child 0, key: int32 not null
955955
-- field metadata --
956-
field_id: '3'
956+
PARQUET:field_id: '3'
957957
child 1, value: string
958958
-- field metadata --
959-
field_id: '4'"""
959+
PARQUET:field_id: '4'"""
960960
)
961961

962962

@@ -1004,7 +1004,7 @@ def test_projection_filter(schema_int: Schema, file_int: str) -> None:
10041004
repr(result_table.schema)
10051005
== """id: int32
10061006
-- field metadata --
1007-
field_id: '1'"""
1007+
PARQUET:field_id: '1'"""
10081008
)
10091009

10101010

@@ -1182,10 +1182,10 @@ def test_projection_nested_struct_different_parent_id(file_struct: str) -> None:
11821182
== """location: struct<lat: double, long: double>
11831183
child 0, lat: double
11841184
-- field metadata --
1185-
field_id: '41'
1185+
PARQUET:field_id: '41'
11861186
child 1, long: double
11871187
-- field metadata --
1188-
field_id: '42'"""
1188+
PARQUET:field_id: '42'"""
11891189
)
11901190

11911191

tests/io/test_pyarrow_visitor.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -208,9 +208,9 @@ def test_pyarrow_variable_binary_to_iceberg() -> None:
208208

209209
def test_pyarrow_struct_to_iceberg() -> None:
210210
pyarrow_struct = pa.struct([
211-
pa.field("foo", pa.string(), nullable=True, metadata={"field_id": "1", "doc": "foo doc"}),
212-
pa.field("bar", pa.int32(), nullable=False, metadata={"field_id": "2"}),
213-
pa.field("baz", pa.bool_(), nullable=True, metadata={"field_id": "3"}),
211+
pa.field("foo", pa.string(), nullable=True, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}),
212+
pa.field("bar", pa.int32(), nullable=False, metadata={"PARQUET:field_id": "2"}),
213+
pa.field("baz", pa.bool_(), nullable=True, metadata={"PARQUET:field_id": "3"}),
214214
])
215215
expected = StructType(
216216
NestedField(field_id=1, name="foo", field_type=StringType(), required=False, doc="foo doc"),
@@ -221,7 +221,7 @@ def test_pyarrow_struct_to_iceberg() -> None:
221221

222222

223223
def test_pyarrow_list_to_iceberg() -> None:
224-
pyarrow_list = pa.list_(pa.field("element", pa.int32(), nullable=False, metadata={"field_id": "1"}))
224+
pyarrow_list = pa.list_(pa.field("element", pa.int32(), nullable=False, metadata={"PARQUET:field_id": "1"}))
225225
expected = ListType(
226226
element_id=1,
227227
element_type=IntegerType(),
@@ -232,8 +232,8 @@ def test_pyarrow_list_to_iceberg() -> None:
232232

233233
def test_pyarrow_map_to_iceberg() -> None:
234234
pyarrow_map = pa.map_(
235-
pa.field("key", pa.int32(), nullable=False, metadata={"field_id": "1"}),
236-
pa.field("value", pa.string(), nullable=False, metadata={"field_id": "2"}),
235+
pa.field("key", pa.int32(), nullable=False, metadata={"PARQUET:field_id": "1"}),
236+
pa.field("value", pa.string(), nullable=False, metadata={"PARQUET:field_id": "2"}),
237237
)
238238
expected = MapType(
239239
key_id=1,

0 commit comments

Comments
 (0)