Skip to content

Commit d0eb360

Browse files
committed
UNPICK changes to review
1 parent d881cf3 commit d0eb360

File tree

23 files changed

+204
-701
lines changed

23 files changed

+204
-701
lines changed

docs/source/conf.py

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,6 @@ def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool: # noqa
8383
# Duplicate modules (skip module-level docs to avoid duplication)
8484
("module", "datafusion.col"),
8585
("module", "datafusion.udf"),
86-
# Private variables causing duplicate documentation
87-
("data", "datafusion.utils._PYARROW_DATASET_TYPES"),
88-
("variable", "datafusion.utils._PYARROW_DATASET_TYPES"),
8986
# Deprecated
9087
("class", "datafusion.substrait.serde"),
9188
("class", "datafusion.substrait.plan"),
@@ -94,28 +91,9 @@ def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool: # noqa
9491
("method", "datafusion.context.SessionContext.tables"),
9592
("method", "datafusion.dataframe.DataFrame.unnest_column"),
9693
]
97-
# Explicitly skip certain members listed above. These are either
98-
# re-exports, duplicate module-level documentation, deprecated
99-
# API surfaces, or private variables that would otherwise appear
100-
# in the generated docs and cause confusing duplication.
101-
# Keeping this explicit list avoids surprising entries in the
102-
# AutoAPI output and gives us a single place to opt-out items
103-
# when we intentionally hide them from the docs.
10494
if (what, name) in skip_contents:
10595
skip = True
10696

107-
# Skip private module-level names (those whose final component
108-
# starts with an underscore) when AutoAPI is rendering data or
109-
# variable entries. Many internal module-level constants are
110-
# implementation details (for example private pyarrow dataset type
111-
# mappings) that would otherwise be emitted as top-level "data"
112-
# or "variable" docs. Filtering them here avoids noisy,
113-
# duplicate, or implementation-specific entries in the public
114-
# documentation while still allowing public members and types to
115-
# be documented normally.
116-
if name.split(".")[-1].startswith("_") and what in ("data", "variable"):
117-
skip = True
118-
11997
return skip
12098

12199

docs/source/contributor-guide/ffi.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ as performant as possible and to utilize the features of DataFusion, you may dec
3434
your source in Rust and then expose it through `PyO3 <https://pyo3.rs>`_ as a Python library.
3535

3636
At first glance, it may appear the best way to do this is to add the ``datafusion-python``
37-
crate as a dependency, produce a DataFusion table in Rust, and then register it with the
37+
crate as a dependency, provide a ``PyTable``, and then to register it with the
3838
``SessionContext``. Unfortunately, this will not work.
3939

4040
When you produce your code as a Python library and it needs to interact with the DataFusion

docs/source/user-guide/data-sources.rst

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -152,26 +152,13 @@ as Delta Lake. This will require a recent version of
152152
.. code-block:: python
153153
154154
from deltalake import DeltaTable
155-
from datafusion import Table
156155
157156
delta_table = DeltaTable("path_to_table")
158-
table = Table.from_capsule(delta_table.__datafusion_table_provider__())
159-
ctx.register_table("my_delta_table", table)
157+
ctx.register_table_provider("my_delta_table", delta_table)
160158
df = ctx.table("my_delta_table")
161159
df.show()
162160
163-
Objects that implement ``__datafusion_table_provider__`` are supported directly by
164-
:py:meth:`~datafusion.context.SessionContext.register_table`, making it easy to
165-
work with custom table providers from Python libraries such as Delta Lake.
166-
167-
.. note::
168-
169-
:py:meth:`~datafusion.context.SessionContext.register_table_provider` is
170-
deprecated. Use
171-
:py:meth:`~datafusion.context.SessionContext.register_table` with a
172-
:py:class:`~datafusion.Table` instead.
173-
174-
On older versions of ``deltalake`` (prior to 0.22) you can use the
161+
On older versions of ``deltalake`` (prior to 0.22) you can use the
175162
`Arrow DataSet <https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html>`_
176163
interface to import to DataFusion, but this does not support features such as filter push down
177164
which can lead to a significant performance difference.

docs/source/user-guide/io/table_provider.rst

Lines changed: 7 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -39,47 +39,20 @@ A complete example can be found in the `examples folder <https://github.com/apac
3939
) -> PyResult<Bound<'py, PyCapsule>> {
4040
let name = CString::new("datafusion_table_provider").unwrap();
4141
42-
let provider = Arc::new(self.clone());
43-
let provider = FFI_TableProvider::new(provider, false, None);
42+
let provider = Arc::new(self.clone())
43+
.map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
44+
let provider = FFI_TableProvider::new(Arc::new(provider), false);
4445
4546
PyCapsule::new_bound(py, provider, Some(name.clone()))
4647
}
4748
}
4849
49-
Once you have this library available, you can construct a
50-
:py:class:`~datafusion.Table` in Python and register it with the
51-
``SessionContext``. Tables can be created either from the PyCapsule exposed by your
52-
Rust provider or from an existing :py:class:`~datafusion.dataframe.DataFrame`.
53-
Call the provider's ``__datafusion_table_provider__()`` method to obtain the capsule
54-
before constructing a ``Table``. The ``Table.from_view()`` helper is
55-
deprecated; instead use ``Table.from_dataframe()`` or ``DataFrame.into_view()``.
56-
57-
.. note::
58-
59-
:py:meth:`~datafusion.context.SessionContext.register_table_provider` is
60-
deprecated. Use
61-
:py:meth:`~datafusion.context.SessionContext.register_table` with the
62-
resulting :py:class:`~datafusion.Table` instead.
50+
Once you have this library available, in python you can register your table provider
51+
to the ``SessionContext``.
6352

6453
.. code-block:: python
6554
66-
from datafusion import SessionContext, Table
67-
68-
ctx = SessionContext()
6955
provider = MyTableProvider()
56+
ctx.register_table_provider("my_table", provider)
7057
71-
capsule = provider.__datafusion_table_provider__()
72-
capsule_table = Table.from_capsule(capsule)
73-
74-
df = ctx.from_pydict({"a": [1]})
75-
view_table = Table.from_dataframe(df)
76-
# or: view_table = df.into_view()
77-
78-
ctx.register_table("capsule_table", capsule_table)
79-
ctx.register_table("view_table", view_table)
80-
81-
ctx.table("capsule_table").show()
82-
ctx.table("view_table").show()
83-
84-
Both ``Table.from_capsule()`` and ``Table.from_dataframe()`` create
85-
table providers that can be registered with the SessionContext using ``register_table()``.
58+
ctx.table("my_table").show()

examples/datafusion-ffi-example/python/tests/_test_table_function.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def test_ffi_table_function_call_directly():
5353
table_udtf = udtf(table_func, "my_table_func")
5454

5555
my_table = table_udtf()
56-
ctx.register_table("t", my_table)
56+
ctx.register_table_provider("t", my_table)
5757
result = ctx.table("t").collect()
5858

5959
assert len(result) == 2

examples/datafusion-ffi-example/python/tests/_test_table_provider.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,14 @@
1818
from __future__ import annotations
1919

2020
import pyarrow as pa
21-
from datafusion import SessionContext, TableProvider
21+
from datafusion import SessionContext
2222
from datafusion_ffi_example import MyTableProvider
2323

2424

2525
def test_table_loading():
2626
ctx = SessionContext()
2727
table = MyTableProvider(3, 2, 4)
28-
ctx.register_table(
29-
"t", TableProvider.from_capsule(table.__datafusion_table_provider__())
30-
)
28+
ctx.register_table_provider("t", table)
3129
result = ctx.table("t").collect()
3230

3331
assert len(result) == 4

python/datafusion/__init__.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,16 +28,17 @@
2828
try:
2929
import importlib.metadata as importlib_metadata
3030
except ImportError:
31-
import importlib_metadata # type: ignore[import]
31+
import importlib_metadata
3232

33-
# Public submodules
3433
from . import functions, object_store, substrait, unparser
3534

3635
# The following imports are okay to remain as opaque to the user.
37-
from ._internal import EXPECTED_PROVIDER_MSG, Config
36+
from ._internal import Config
3837
from .catalog import Catalog, Database, Table
3938
from .col import col, column
40-
from .common import DFSchema
39+
from .common import (
40+
DFSchema,
41+
)
4142
from .context import (
4243
RuntimeEnvBuilder,
4344
SessionConfig,
@@ -46,7 +47,10 @@
4647
)
4748
from .dataframe import DataFrame, ParquetColumnOptions, ParquetWriterOptions
4849
from .dataframe_formatter import configure_formatter
49-
from .expr import Expr, WindowFrame
50+
from .expr import (
51+
Expr,
52+
WindowFrame,
53+
)
5054
from .io import read_avro, read_csv, read_json, read_parquet
5155
from .plan import ExecutionPlan, LogicalPlan
5256
from .record_batch import RecordBatch, RecordBatchStream
@@ -65,7 +69,6 @@
6569
__version__ = importlib_metadata.version(__name__)
6670

6771
__all__ = [
68-
"EXPECTED_PROVIDER_MSG",
6972
"Accumulator",
7073
"AggregateUDF",
7174
"Catalog",

python/datafusion/catalog.py

Lines changed: 21 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -19,19 +19,14 @@
1919

2020
from __future__ import annotations
2121

22-
import warnings
2322
from abc import ABC, abstractmethod
24-
from typing import TYPE_CHECKING, Any, Protocol
23+
from typing import TYPE_CHECKING, Protocol
2524

2625
import datafusion._internal as df_internal
27-
from datafusion._internal import EXPECTED_PROVIDER_MSG
28-
from datafusion.utils import _normalize_table_provider
2926

3027
if TYPE_CHECKING:
3128
import pyarrow as pa
3229

33-
from datafusion.context import TableProviderExportable
34-
3530
try:
3631
from warnings import deprecated # Python 3.13+
3732
except ImportError:
@@ -87,11 +82,7 @@ def database(self, name: str = "public") -> Schema:
8782
"""Returns the database with the given ``name`` from this catalog."""
8883
return self.schema(name)
8984

90-
def register_schema(
91-
self,
92-
name: str,
93-
schema: Schema | SchemaProvider | SchemaProviderExportable,
94-
) -> Schema | None:
85+
def register_schema(self, name, schema) -> Schema | None:
9586
"""Register a schema with this catalog."""
9687
if isinstance(schema, Schema):
9788
return self.catalog.register_schema(name, schema._raw_schema)
@@ -131,16 +122,11 @@ def table(self, name: str) -> Table:
131122
"""Return the table with the given ``name`` from this schema."""
132123
return Table(self._raw_schema.table(name))
133124

134-
def register_table(
135-
self, name: str, table: Table | TableProviderExportable | Any
136-
) -> None:
137-
"""Register a table or table provider in this schema.
138-
139-
Objects implementing ``__datafusion_table_provider__`` are also supported
140-
and treated as table provider instances.
141-
"""
142-
provider = _normalize_table_provider(table)
143-
return self._raw_schema.register_table(name, provider)
125+
def register_table(self, name, table) -> None:
126+
"""Register a table provider in this schema."""
127+
if isinstance(table, Table):
128+
return self._raw_schema.register_table(name, table.table)
129+
return self._raw_schema.register_table(name, table)
144130

145131
def deregister_table(self, name: str) -> None:
146132
"""Deregister a table provider from this schema."""
@@ -152,101 +138,31 @@ class Database(Schema):
152138
"""See `Schema`."""
153139

154140

155-
_InternalRawTable = df_internal.catalog.RawTable
156-
_InternalTableProvider = df_internal.TableProvider
157-
158-
# Keep in sync with ``datafusion._internal.TableProvider.from_view``.
159-
_FROM_VIEW_WARN_STACKLEVEL = 2
160-
161-
162141
class Table:
163-
"""DataFusion table or table provider wrapper."""
164-
165-
__slots__ = ("_table",)
142+
"""DataFusion table."""
166143

167-
def __init__(
168-
self,
169-
table: _InternalRawTable | _InternalTableProvider | Table,
170-
) -> None:
171-
"""Wrap a low level table or table provider."""
172-
if isinstance(table, Table):
173-
table = table.table
174-
175-
if not isinstance(table, (_InternalRawTable, _InternalTableProvider)):
176-
raise TypeError(EXPECTED_PROVIDER_MSG)
177-
178-
self._table = table
179-
180-
def __getattribute__(self, name: str) -> Any:
181-
"""Restrict provider-specific helpers to compatible tables."""
182-
if name == "__datafusion_table_provider__":
183-
table = object.__getattribute__(self, "_table")
184-
if not hasattr(table, "__datafusion_table_provider__"):
185-
raise AttributeError(name)
186-
return object.__getattribute__(self, name)
144+
def __init__(self, table: df_internal.catalog.RawTable) -> None:
145+
"""This constructor is not typically called by the end user."""
146+
self.table = table
187147

188148
def __repr__(self) -> str:
189149
"""Print a string representation of the table."""
190-
return repr(self._table)
150+
return self.table.__repr__()
191151

192-
@property
193-
def table(self) -> _InternalRawTable | _InternalTableProvider:
194-
"""Return the wrapped low level table object."""
195-
return self._table
196-
197-
@classmethod
198-
def from_dataset(cls, dataset: pa.dataset.Dataset) -> Table:
199-
"""Turn a :mod:`pyarrow.dataset` ``Dataset`` into a :class:`Table`."""
200-
return cls(_InternalRawTable.from_dataset(dataset))
201-
202-
@classmethod
203-
def from_capsule(cls, capsule: Any) -> Table:
204-
"""Create a :class:`Table` from a PyCapsule exported provider."""
205-
provider = _InternalTableProvider.from_capsule(capsule)
206-
return cls(provider)
207-
208-
@classmethod
209-
def from_dataframe(cls, df: Any) -> Table:
210-
"""Create a :class:`Table` from tabular data."""
211-
from datafusion.dataframe import DataFrame as DataFrameWrapper
212-
213-
dataframe = df if isinstance(df, DataFrameWrapper) else DataFrameWrapper(df)
214-
return dataframe.into_view()
215-
216-
@classmethod
217-
def from_view(cls, df: Any) -> Table:
218-
"""Deprecated helper for constructing tables from views."""
219-
from datafusion.dataframe import DataFrame as DataFrameWrapper
220-
221-
if isinstance(df, DataFrameWrapper):
222-
df = df.df
223-
224-
provider = _InternalTableProvider.from_view(df)
225-
warnings.warn(
226-
"Table.from_view is deprecated; use DataFrame.into_view or "
227-
"Table.from_dataframe instead.",
228-
category=DeprecationWarning,
229-
stacklevel=_FROM_VIEW_WARN_STACKLEVEL,
230-
)
231-
return cls(provider)
152+
@staticmethod
153+
def from_dataset(dataset: pa.dataset.Dataset) -> Table:
154+
"""Turn a pyarrow Dataset into a Table."""
155+
return Table(df_internal.catalog.RawTable.from_dataset(dataset))
232156

233157
@property
234158
def schema(self) -> pa.Schema:
235159
"""Returns the schema associated with this table."""
236-
return self._table.schema
160+
return self.table.schema
237161

238162
@property
239163
def kind(self) -> str:
240164
"""Returns the kind of table."""
241-
return self._table.kind
242-
243-
def __datafusion_table_provider__(self) -> Any:
244-
"""Expose the wrapped provider for FFI integrations."""
245-
exporter = getattr(self._table, "__datafusion_table_provider__", None)
246-
if exporter is None:
247-
msg = "Underlying object does not export __datafusion_table_provider__()"
248-
raise AttributeError(msg)
249-
return exporter()
165+
return self.table.kind
250166

251167

252168
class CatalogProvider(ABC):
@@ -303,19 +219,14 @@ def table(self, name: str) -> Table | None:
303219
"""Retrieve a specific table from this schema."""
304220
...
305221

306-
def register_table( # noqa: B027
307-
self, name: str, table: Table | TableProviderExportable | Any
308-
) -> None:
309-
"""Add a table to this schema.
222+
def register_table(self, name: str, table: Table) -> None: # noqa: B027
223+
"""Add a table from this schema.
310224
311225
This method is optional. If your schema provides a fixed list of tables, you do
312226
not need to implement this method.
313-
314-
Objects implementing ``__datafusion_table_provider__`` are also supported
315-
and treated as table provider instances.
316227
"""
317228

318-
def deregister_table(self, name: str, cascade: bool) -> None: # noqa: B027
229+
def deregister_table(self, name, cascade: bool) -> None: # noqa: B027
319230
"""Remove a table from this schema.
320231
321232
This method is optional. If your schema provides a fixed list of tables, you do

0 commit comments

Comments
 (0)