Skip to content

Pyarrow type error #541

@dev-goyal

Description

@dev-goyal

Apache Iceberg version

0.6.0 (latest release)

Please describe the bug 🐞

Given a table like so:

In [36]: table
Out[36]:
matches(
 ...
  14: player_last_session: optional timestamptz,
...
  30: subject_last_session: optional timestamptz,
),
partition by: [run_date, player_agg_cluster_name, initiating_at],
sort order: [],
snapshot: Operation.APPEND: id=6595288807809068528, schema_id=0

I get the following error:

In [25]: table.scan().to_arrow()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[25], line 1
----> 1 table.scan().to_arrow()

File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/table/__init__.py:1418, in DataScan.to_arrow(self)
   1415 def to_arrow(self) -> pa.Table:
   1416     from pyiceberg.io.pyarrow import project_table
-> 1418     return project_table(
   1419         self.plan_files(),
   1420         self.table,
   1421         self.row_filter,
   1422         self.projection(),
   1423         case_sensitive=self.case_sensitive,
   1424         limit=self.limit,
   1425     )

File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/io/pyarrow.py:1114, in project_table(tasks, table, row_filter, projected_schema, case_sensitive, limit)
   1111 if limit is not None:
   1112     _ = [f.cancel() for f in futures if not f.done()]
-> 1114 tables = [f.result() for f in completed_futures if f.result()]
   1116 if len(tables) < 1:
   1117     return pa.Table.from_batches([], schema=schema_to_pyarrow(projected_schema))

File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/io/pyarrow.py:1114, in <listcomp>(.0)
   1111 if limit is not None:
   1112     _ = [f.cancel() for f in futures if not f.done()]
-> 1114 tables = [f.result() for f in completed_futures if f.result()]
   1116 if len(tables) < 1:
   1117     return pa.Table.from_batches([], schema=schema_to_pyarrow(projected_schema))

File ~/.pyenv/versions/3.11.7/lib/python3.11/concurrent/futures/_base.py:449, in Future.result(self, timeout)
    447     raise CancelledError()
    448 elif self._state == FINISHED:
--> 449     return self.__get_result()
    451 self._condition.wait(timeout)
    453 if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:

File ~/.pyenv/versions/3.11.7/lib/python3.11/concurrent/futures/_base.py:401, in Future.__get_result(self)
    399 if self._exception:
    400     try:
--> 401         raise self._exception
    402     finally:
    403         # Break a reference cycle with the exception in self._exception
    404         self = None

File ~/.pyenv/versions/3.11.7/lib/python3.11/concurrent/futures/thread.py:58, in _WorkItem.run(self)
     55     return
     57 try:
---> 58     result = self.fn(*self.args, **self.kwargs)
     59 except BaseException as exc:
     60     self.future.set_exception(exc)

File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/io/pyarrow.py:957, in _task_to_table(fs, task, bound_row_filter, projected_schema, projected_field_ids, positional_deletes, case_sensitive, row_counts, limit, name_mapping)
    954 if metadata := physical_schema.metadata:
    955     schema_raw = metadata.get(ICEBERG_SCHEMA)
    956 file_schema = (
--> 957     Schema.model_validate_json(schema_raw) if schema_raw is not None else pyarrow_to_schema(physical_schema, name_mapping)
    958 )
    960 pyarrow_filter = None
    961 if bound_row_filter is not AlwaysTrue():

File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/io/pyarrow.py:655, in pyarrow_to_schema(schema, name_mapping)
    651 else:
    652     raise ValueError(
    653         "Parquet file does not have field-ids and the Iceberg table does not have 'schema.name-mapping.default' defined"
    654     )
--> 655 return visit_pyarrow(schema, visitor)

File ~/.pyenv/versions/3.11.7/lib/python3.11/functools.py:909, in singledispatch.<locals>.wrapper(*args, **kw)
    905 if not args:
    906     raise TypeError(f'{funcname} requires at least '
    907                     '1 positional argument')
--> 909 return dispatch(args[0].__class__)(*args, **kw)

File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/io/pyarrow.py:676, in _(obj, visitor)
    674 @visit_pyarrow.register(pa.Schema)
    675 def _(obj: pa.Schema, visitor: PyArrowSchemaVisitor[T]) -> T:
--> 676     return visitor.schema(obj, visit_pyarrow(pa.struct(obj), visitor))

File ~/.pyenv/versions/3.11.7/lib/python3.11/functools.py:909, in singledispatch.<locals>.wrapper(*args, **kw)
    905 if not args:
    906     raise TypeError(f'{funcname} requires at least '
    907                     '1 positional argument')
--> 909 return dispatch(args[0].__class__)(*args, **kw)

File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/io/pyarrow.py:685, in _(obj, visitor)
    683 for field in obj:
    684     visitor.before_field(field)
--> 685     result = visit_pyarrow(field.type, visitor)
    686     results.append(visitor.field(field, result))
    687     visitor.after_field(field)

File ~/.pyenv/versions/3.11.7/lib/python3.11/functools.py:909, in singledispatch.<locals>.wrapper(*args, **kw)
    905 if not args:
    906     raise TypeError(f'{funcname} requires at least '
    907                     '1 positional argument')
--> 909 return dispatch(args[0].__class__)(*args, **kw)

File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/io/pyarrow.py:718, in _(obj, visitor)
    716 if pa.types.is_nested(obj):
    717     raise TypeError(f"Expected primitive type, got: {type(obj)}")
--> 718 return visitor.primitive(obj)

File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/io/pyarrow.py:891, in _ConvertToIceberg.primitive(self, primitive)
    888     primitive = cast(pa.FixedSizeBinaryType, primitive)
    889     return FixedType(primitive.byte_width)
--> 891 raise TypeError(f"Unsupported type: {primitive}")

TypeError: Unsupported type: timestamp[ns]

After some debugging, I find the following at this line:

ipdb> physical_schema
player_last_session: timestamp[ns]
...
subject_last_session: timestamp[ns]

I imagine the fix is to do something like this on this line, but currently those overrides are not exposed. Am I on the right track?

I believe that this issue is somewhat similar to #520

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions