diff --git a/docs/api-reference/expr_struct.md b/docs/api-reference/expr_struct.md index bfc093ed8a..2a00bd185e 100644 --- a/docs/api-reference/expr_struct.md +++ b/docs/api-reference/expr_struct.md @@ -5,5 +5,6 @@ options: members: - field + - unnest show_source: false show_bases: false diff --git a/docs/api-reference/series_struct.md b/docs/api-reference/series_struct.md index 638376dad3..cbdcb302ed 100644 --- a/docs/api-reference/series_struct.md +++ b/docs/api-reference/series_struct.md @@ -5,5 +5,6 @@ options: members: - field + - unnest show_source: false show_bases: false diff --git a/narwhals/_arrow/series_struct.py b/narwhals/_arrow/series_struct.py index 906725ba7b..c91a547770 100644 --- a/narwhals/_arrow/series_struct.py +++ b/narwhals/_arrow/series_struct.py @@ -2,15 +2,33 @@ from typing import TYPE_CHECKING +import pyarrow as pa import pyarrow.compute as pc from narwhals._arrow.utils import ArrowSeriesNamespace from narwhals._compliant.any_namespace import StructNamespace if TYPE_CHECKING: + from narwhals._arrow.dataframe import ArrowDataFrame from narwhals._arrow.series import ArrowSeries class ArrowSeriesStructNamespace(ArrowSeriesNamespace, StructNamespace["ArrowSeries"]): def field(self, name: str) -> ArrowSeries: return self.with_native(pc.struct_field(self.native, name)).alias(name) + + def unnest(self) -> ArrowDataFrame: + from narwhals._arrow.dataframe import ArrowDataFrame + + native = self.native + struct_type: pa.StructType = native.type + + # NOTE: struct_type.names is not available until pyarrow 18.0.0 + n_fields = struct_type.num_fields + table = pa.table( + { + struct_type.field(idx).name: pc.struct_field(native, idx) + for idx in range(n_fields) + } + ) + return ArrowDataFrame.from_native(table, context=self.compliant) diff --git a/narwhals/_compliant/any_namespace.py b/narwhals/_compliant/any_namespace.py index 27354eb7ff..db59b939d0 100644 --- a/narwhals/_compliant/any_namespace.py +++ b/narwhals/_compliant/any_namespace.py @@ -2,7 
+2,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, ClassVar, Protocol, TypeVar +from typing import TYPE_CHECKING, Any, ClassVar, Protocol, TypeVar from narwhals._utils import CompliantT_co, _StoresCompliant @@ -115,3 +115,4 @@ class StructNamespace(_StoresCompliant[CompliantT_co], Protocol[CompliantT_co]): _accessor: ClassVar[Accessor] = "struct" def field(self, name: str) -> CompliantT_co: ... + def unnest(self) -> Any: ... diff --git a/narwhals/_compliant/expr.py b/narwhals/_compliant/expr.py index 1a138f35f3..cd57021406 100644 --- a/narwhals/_compliant/expr.py +++ b/narwhals/_compliant/expr.py @@ -43,7 +43,13 @@ from narwhals._compliant.namespace import CompliantNamespace, EagerNamespace from narwhals._compliant.series import CompliantSeries - from narwhals._compliant.typing import AliasNames, EvalNames, EvalSeries + from narwhals._compliant.typing import ( + AliasNames, + EagerDataFrameAny, + EagerSeriesAny, + EvalNames, + EvalSeries, + ) from narwhals._expression_parsing import ExprMetadata from narwhals._typing import NoDefault from narwhals._utils import Implementation, Version, _LimitedContext @@ -1184,3 +1190,27 @@ def field(self, name: str) -> EagerExprT: return self.compliant._reuse_series_namespace("struct", "field", name=name).alias( name ) + + def unnest(self) -> EagerExprT: + compliant = self.compliant + + def inner(df: EagerDataFrameAny) -> list[EagerSeriesAny]: + result: list[EagerSeriesAny] = [] + for series in compliant(df): + unnested_df: EagerDataFrameAny = series.struct.unnest() + result.extend( + unnested_df.get_column(col_name) for col_name in unnested_df.columns + ) + return result + + def evaluate_output_names(df: EagerDataFrameAny) -> Sequence[str]: + return [ + field.name for series in compliant(df) for field in series.dtype.fields + ] + + return self.compliant._from_callable( + inner, + evaluate_output_names=evaluate_output_names, + alias_output_names=None, + context=compliant, + ) diff --git 
a/narwhals/_duckdb/expr_struct.py b/narwhals/_duckdb/expr_struct.py index ac91d399d9..002539928f 100644 --- a/narwhals/_duckdb/expr_struct.py +++ b/narwhals/_duckdb/expr_struct.py @@ -1,13 +1,17 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast from narwhals._compliant import LazyExprNamespace from narwhals._compliant.any_namespace import StructNamespace from narwhals._duckdb.utils import F, lit if TYPE_CHECKING: + from duckdb import Expression + + from narwhals._duckdb.dataframe import DuckDBLazyFrame from narwhals._duckdb.expr import DuckDBExpr + from narwhals.dtypes import Struct class DuckDBExprStructNamespace( @@ -17,3 +21,31 @@ def field(self, name: str) -> DuckDBExpr: return self.compliant._with_elementwise( lambda expr: F("struct_extract", expr, lit(name)) ).alias(name) + + def unnest(self) -> DuckDBExpr: + compliant = self.compliant + + def func(df: DuckDBLazyFrame) -> list[Expression]: + schema = df.schema + return [ + F("struct_extract", native_expr, lit(field.name)).alias(field.name) + for native_expr, name in zip( + compliant(df), compliant._evaluate_output_names(df) + ) + for field in cast("Struct", schema[name]).fields + ] + + def evaluate_output_names(df: DuckDBLazyFrame) -> list[str]: + schema = df.schema + return [ + field.name + for name in compliant._evaluate_output_names(df) + for field in cast("Struct", schema[name]).fields + ] + + return compliant.__class__( + func, + evaluate_output_names=evaluate_output_names, + alias_output_names=None, + version=compliant._version, + ) diff --git a/narwhals/_ibis/expr_struct.py b/narwhals/_ibis/expr_struct.py index 25eedf04fc..f92cdeaba0 100644 --- a/narwhals/_ibis/expr_struct.py +++ b/narwhals/_ibis/expr_struct.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast from narwhals._compliant import LazyExprNamespace from narwhals._compliant.any_namespace import 
StructNamespace @@ -8,7 +8,9 @@ if TYPE_CHECKING: import ibis.expr.types as ir + from narwhals._ibis.dataframe import IbisLazyFrame from narwhals._ibis.expr import IbisExpr + from narwhals.dtypes import Struct class IbisExprStructNamespace(LazyExprNamespace["IbisExpr"], StructNamespace["IbisExpr"]): @@ -17,3 +19,32 @@ def func(expr: ir.StructColumn) -> ir.Column: return expr[name] return self.compliant._with_callable(func).alias(name) + + def unnest(self) -> IbisExpr: + compliant = self.compliant + + def func(df: IbisLazyFrame) -> list[ir.Column]: + schema = df.schema + return [ + cast("ir.StructColumn", native_expr)[field.name].name(field.name) + for native_expr, name in zip( + compliant(df), compliant._evaluate_output_names(df) + ) + for field in cast("Struct", schema[name]).fields + ] + + def evaluate_output_names(df: IbisLazyFrame) -> list[str]: + schema = df.schema + return [ + field.name + for name in compliant._evaluate_output_names(df) + for field in cast("Struct", schema[name]).fields + ] + + return compliant.__class__( + func, + evaluate_output_names=evaluate_output_names, + alias_output_names=None, + version=compliant._version, + implementation=compliant._implementation, + ) diff --git a/narwhals/_pandas_like/series_struct.py b/narwhals/_pandas_like/series_struct.py index dc80997533..bf9e26fef1 100644 --- a/narwhals/_pandas_like/series_struct.py +++ b/narwhals/_pandas_like/series_struct.py @@ -6,6 +6,9 @@ from narwhals._pandas_like.utils import PandasLikeSeriesNamespace if TYPE_CHECKING: + import pyarrow as pa + + from narwhals._pandas_like.dataframe import PandasLikeDataFrame from narwhals._pandas_like.series import PandasLikeSeries @@ -14,3 +17,21 @@ class PandasLikeSeriesStructNamespace( ): def field(self, name: str) -> PandasLikeSeries: return self.with_native(self.native.struct.field(name)).alias(name) + + def unnest(self) -> PandasLikeDataFrame: + from narwhals._pandas_like.dataframe import PandasLikeDataFrame + + native = self.native + 
struct_type: pa.StructType = native.dtype.pyarrow_dtype + + # NOTE: struct_type.names is not available until pyarrow 18.0.0 + n_fields = struct_type.num_fields + ns = self.implementation.to_native_namespace() + + result = ns.DataFrame( + { + struct_type.field(idx).name: native.struct.field(idx) + for idx in range(n_fields) + } + ) + return PandasLikeDataFrame.from_native(result, context=self.compliant) diff --git a/narwhals/_polars/expr.py b/narwhals/_polars/expr.py index 1e2b546014..8c70924214 100644 --- a/narwhals/_polars/expr.py +++ b/narwhals/_polars/expr.py @@ -521,4 +521,19 @@ def contains(self, item: Any) -> PolarsExpr: class PolarsExprStructNamespace( PolarsExprNamespace, PolarsStructNamespace[PolarsExpr, pl.Expr] -): ... +): + def unnest(self) -> PolarsExpr: + native = self.native + pl_version = self._expr._backend_version + if pl_version >= (1, 10, 0): + result = native.struct.unnest() + elif pl_version >= (0, 20, 30): # pragma: no cover + result = native.struct.field("*") + else: # pragma: no cover + found = ".".join(f"{d}" for d in pl_version) + msg = ( + "`Expr.struct.unnest` is only available in 'polars>=0.20.30',\n" + f"found version {found!r}." + ) + raise NotImplementedError(msg) + return self.compliant._with_native(result) diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index 5a8397522a..d976222b7f 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -852,4 +852,9 @@ def contains(self, item: NonNestedLiteral) -> PolarsSeries: class PolarsSeriesStructNamespace( PolarsSeriesNamespace, PolarsStructNamespace[PolarsSeries, pl.Series] -): ... 
+): + def unnest(self) -> PolarsDataFrame: + from narwhals._polars.dataframe import PolarsDataFrame + + result = self.native.struct.unnest() + return PolarsDataFrame(result, version=self.compliant._version) diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index 1011f7ce93..96a59bb075 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -378,3 +378,4 @@ def len(self) -> CompliantT: ... class PolarsStructNamespace(PolarsAnyNamespace[CompliantT, NativeT_co]): _accessor: ClassVar[Accessor] = "struct" field: Method[CompliantT] + unnest: Method[Any] diff --git a/narwhals/_spark_like/expr_struct.py b/narwhals/_spark_like/expr_struct.py index ac5202535e..35ab65ee38 100644 --- a/narwhals/_spark_like/expr_struct.py +++ b/narwhals/_spark_like/expr_struct.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast from narwhals._compliant import LazyExprNamespace from narwhals._compliant.any_namespace import StructNamespace @@ -8,7 +8,9 @@ if TYPE_CHECKING: from sqlframe.base.column import Column + from narwhals._spark_like.dataframe import SparkLikeLazyFrame from narwhals._spark_like.expr import SparkLikeExpr + from narwhals.dtypes import Struct class SparkLikeExprStructNamespace( @@ -19,3 +21,32 @@ def func(expr: Column) -> Column: return expr.getField(name) return self.compliant._with_elementwise(func).alias(name) + + def unnest(self) -> SparkLikeExpr: + compliant = self.compliant + + def func(df: SparkLikeLazyFrame) -> list[Column]: + schema = df.schema + return [ + native_expr.getField(field.name).alias(field.name) + for native_expr, name in zip( + compliant(df), compliant._evaluate_output_names(df) + ) + for field in cast("Struct", schema[name]).fields + ] + + def evaluate_output_names(df: SparkLikeLazyFrame) -> list[str]: + schema = df.schema + return [ + field.name + for name in compliant._evaluate_output_names(df) + for field in cast("Struct", 
schema[name]).fields + ] + + return compliant.__class__( + func, + evaluate_output_names=evaluate_output_names, + alias_output_names=None, + version=compliant._version, + implementation=compliant._implementation, + ) diff --git a/narwhals/expr_struct.py b/narwhals/expr_struct.py index 7d734732f9..d1c7bed9d3 100644 --- a/narwhals/expr_struct.py +++ b/narwhals/expr_struct.py @@ -45,3 +45,32 @@ def field(self, name: str) -> ExprT: return self._expr._append_node( ExprNode(ExprKind.ELEMENTWISE, "struct.field", name=name) ) + + def unnest(self) -> ExprT: + r"""Expand the struct column into individual fields as separate columns. + + Each field of the struct becomes a separate column in the result. + + Examples: + >>> import polars as pl + >>> import narwhals as nw + >>> df_native = pl.DataFrame( + ... {"user": [{"id": 0, "name": "john"}, {"id": 1, "name": "jane"}]} + ... ) + >>> df = nw.from_native(df_native) + >>> df.select(nw.col("user").struct.unnest()) + ┌──────────────────┐ + |Narwhals DataFrame| + |------------------| + | shape: (2, 2) | + | ┌─────┬──────┐ | + | │ id ┆ name │ | + | │ --- ┆ --- │ | + | │ i64 ┆ str │ | + | ╞═════╪══════╡ | + | │ 0 ┆ john │ | + | │ 1 ┆ jane │ | + | └─────┴──────┘ | + └──────────────────┘ + """ + return self._expr._append_node(ExprNode(ExprKind.ELEMENTWISE, "struct.unnest")) diff --git a/narwhals/series_struct.py b/narwhals/series_struct.py index 5a2851d6f0..5282eb505b 100644 --- a/narwhals/series_struct.py +++ b/narwhals/series_struct.py @@ -1,9 +1,12 @@ from __future__ import annotations -from typing import Generic +from typing import TYPE_CHECKING, Any, Generic from narwhals.typing import SeriesT +if TYPE_CHECKING: + from narwhals.dataframe import DataFrame + class SeriesStructNamespace(Generic[SeriesT]): def __init__(self, series: SeriesT) -> None: @@ -28,3 +31,35 @@ def field(self, name: str) -> SeriesT: return self._narwhals_series._with_compliant( self._narwhals_series._compliant_series.struct.field(name) ) + + def unnest(self) 
-> DataFrame[Any]: + r"""Convert this struct Series to a DataFrame with a separate column for each field. + + Each field of the struct becomes a column in the resulting DataFrame. + + Examples: + >>> import polars as pl + >>> import narwhals as nw + >>> s_native = pl.Series( + ... [{"id": 0, "name": "john"}, {"id": 1, "name": "jane"}] + ... ) + >>> s = nw.from_native(s_native, series_only=True) + >>> s.struct.unnest() + ┌──────────────────┐ + |Narwhals DataFrame| + |------------------| + | shape: (2, 2) | + | ┌─────┬──────┐ | + | │ id ┆ name │ | + | │ --- ┆ --- │ | + | │ i64 ┆ str │ | + | ╞═════╪══════╡ | + | │ 0 ┆ john │ | + | │ 1 ┆ jane │ | + | └─────┴──────┘ | + └──────────────────┘ + """ + return self._narwhals_series._dataframe( + self._narwhals_series._compliant_series.struct.unnest(), + level=self._narwhals_series._level, + ) diff --git a/tests/expr_and_series/struct_/unnest_test.py b/tests/expr_and_series/struct_/unnest_test.py new file mode 100644 index 0000000000..fe7cfcd6b6 --- /dev/null +++ b/tests/expr_and_series/struct_/unnest_test.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +from datetime import datetime + +import pytest + +import narwhals as nw +from tests.utils import ( + DUCKDB_VERSION, + PANDAS_VERSION, + POLARS_VERSION, + PYARROW_VERSION, + Constructor, + ConstructorEager, + assert_equal_data, +) + + +def skip_if_old_version(constructor: Constructor | ConstructorEager) -> None: + if ( + ( + "pandas" in str(constructor) + and (PANDAS_VERSION < (2, 2, 0) or PYARROW_VERSION == (0, 0, 0)) + ) + or ("polars" in str(constructor) and POLARS_VERSION < (0, 20, 30)) + or ("duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3)) + ): + pytest.skip() + + +data = { + "id": [0, 1], + "name": ["john", "jane"], + "hash": ["fake-hash-1", "fake-hash-2"], + "ts": [datetime(2026, 1, 1, 0, 0), datetime(2026, 1, 2, 0, 0)], +} + +user_dtype = nw.Struct({"id": nw.Int16(), "name": nw.String()}) +psw_dtype = nw.Struct({"hash": nw.String(), "ts": 
nw.Datetime()}) + +user_expr = nw.struct("id", "name").cast(user_dtype).alias("user") +psw_expr = nw.struct("hash", "ts").cast(psw_dtype).alias("psw") + + +def test_unnest_expr(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if any(backend in str(constructor) for backend in ("dask",)): + request.applymarker(pytest.mark.xfail) + + skip_if_old_version(constructor) + + df = nw.from_native(constructor(data)).select(user=user_expr, psw=psw_expr) + + result = df.select(nw.col("user").struct.unnest()) + expected = {"id": [0, 1], "name": ["john", "jane"]} + assert_equal_data(result, expected) + + +def test_unnest_expr_multi( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if any(backend in str(constructor) for backend in ("dask",)): + request.applymarker(pytest.mark.xfail) + + skip_if_old_version(constructor) + + df = nw.from_native(constructor(data)).select(user=user_expr, psw=psw_expr) + + result = df.select(nw.col("user", "psw").struct.unnest()) + expected = { + "id": [0, 1], + "name": ["john", "jane"], + "hash": ["fake-hash-1", "fake-hash-2"], + "ts": [datetime(2026, 1, 1, 0, 0), datetime(2026, 1, 2, 0, 0)], + } + assert_equal_data(result, expected) + + +def test_unnest_series(constructor_eager: ConstructorEager) -> None: + skip_if_old_version(constructor_eager) + + df = nw.from_native(constructor_eager(data), eager_only=True).select(user=user_expr) + + result = df.get_column("user").struct.unnest() + expected = {"id": [0, 1], "name": ["john", "jane"]} + assert_equal_data(result, expected)