Skip to content

Commit 1243498

Browse files
timsaucer and claude committed
Add GroupingSet.rollup, .cube, and .grouping_sets factory methods
Expose ROLLUP, CUBE, and GROUPING SETS via the DataFrame API by adding static methods on GroupingSet that construct the corresponding Expr variants. Update grouping() docstring and tests to use the new API. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 0303716 commit 1243498

File tree

4 files changed

+251
-34
lines changed

4 files changed

+251
-34
lines changed

crates/core/src/expr/grouping_set.rs

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,11 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use datafusion::logical_expr::GroupingSet;
18+
use datafusion::logical_expr::{Expr, GroupingSet};
1919
use pyo3::prelude::*;
2020

21+
use crate::expr::PyExpr;
22+
2123
#[pyclass(
2224
from_py_object,
2325
frozen,
@@ -30,6 +32,39 @@ pub struct PyGroupingSet {
3032
grouping_set: GroupingSet,
3133
}
3234

35+
#[pymethods]
36+
impl PyGroupingSet {
37+
#[staticmethod]
38+
#[pyo3(signature = (*exprs))]
39+
fn rollup(exprs: Vec<PyExpr>) -> PyExpr {
40+
Expr::GroupingSet(GroupingSet::Rollup(
41+
exprs.into_iter().map(|e| e.expr).collect(),
42+
))
43+
.into()
44+
}
45+
46+
#[staticmethod]
47+
#[pyo3(signature = (*exprs))]
48+
fn cube(exprs: Vec<PyExpr>) -> PyExpr {
49+
Expr::GroupingSet(GroupingSet::Cube(
50+
exprs.into_iter().map(|e| e.expr).collect(),
51+
))
52+
.into()
53+
}
54+
55+
#[staticmethod]
56+
#[pyo3(signature = (*expr_lists))]
57+
fn grouping_sets(expr_lists: Vec<Vec<PyExpr>>) -> PyExpr {
58+
Expr::GroupingSet(GroupingSet::GroupingSets(
59+
expr_lists
60+
.into_iter()
61+
.map(|list| list.into_iter().map(|e| e.expr).collect())
62+
.collect(),
63+
))
64+
.into()
65+
}
66+
}
67+
3368
impl From<PyGroupingSet> for GroupingSet {
3469
fn from(grouping_set: PyGroupingSet) -> Self {
3570
grouping_set.grouping_set

python/datafusion/expr.py

Lines changed: 133 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@
9191
Extension = expr_internal.Extension
9292
FileType = expr_internal.FileType
9393
Filter = expr_internal.Filter
94-
GroupingSet = expr_internal.GroupingSet
94+
_GroupingSetInternal = expr_internal.GroupingSet
9595
Join = expr_internal.Join
9696
ILike = expr_internal.ILike
9797
InList = expr_internal.InList
@@ -1430,3 +1430,135 @@ def __repr__(self) -> str:
14301430

14311431

14321432
SortKey = Expr | SortExpr | str
1433+
1434+
1435+
class GroupingSet:
1436+
"""Factory for creating grouping set expressions.
1437+
1438+
Grouping sets control how
1439+
:py:meth:`~datafusion.dataframe.DataFrame.aggregate` groups rows.
1440+
Instead of a single ``GROUP BY``, they produce multiple grouping
1441+
levels in one pass — subtotals, cross-tabulations, or arbitrary
1442+
column subsets.
1443+
1444+
Use :py:func:`~datafusion.functions.grouping` in the aggregate list
1445+
to tell which columns are aggregated across in each result row.
1446+
"""
1447+
1448+
@staticmethod
1449+
def rollup(*exprs: Expr) -> Expr:
1450+
"""Create a ``ROLLUP`` grouping set for use with ``aggregate()``.
1451+
1452+
``ROLLUP`` generates all prefixes of the given column list as
1453+
grouping sets. For example, ``rollup(a, b)`` produces grouping
1454+
sets ``(a, b)``, ``(a)``, and ``()`` (grand total).
1455+
1456+
This is equivalent to ``GROUP BY ROLLUP(a, b)`` in SQL.
1457+
1458+
Args:
1459+
*exprs: Column expressions to include in the rollup.
1460+
1461+
Examples:
1462+
>>> import pyarrow as pa
1463+
>>> import datafusion as dfn
1464+
>>> from datafusion.expr import GroupingSet
1465+
>>> ctx = dfn.SessionContext()
1466+
>>> df = ctx.from_pydict({"a": [1, 1, 2], "b": [10, 20, 30]})
1467+
>>> result = df.aggregate(
1468+
... [GroupingSet.rollup(dfn.col("a"))],
1469+
... [dfn.functions.sum(dfn.col("b")).alias("s"),
1470+
... dfn.functions.grouping(dfn.col("a"))],
1471+
... ).sort(dfn.col("a").sort(nulls_first=False))
1472+
>>> batches = result.collect()
1473+
>>> pa.concat_arrays([b.column("s") for b in batches]).to_pylist()
1474+
[30, 30, 60]
1475+
1476+
See Also:
1477+
:py:meth:`cube`, :py:meth:`grouping_sets`,
1478+
:py:func:`~datafusion.functions.grouping`
1479+
"""
1480+
args = [e.expr for e in exprs]
1481+
return Expr(_GroupingSetInternal.rollup(*args))
1482+
1483+
@staticmethod
1484+
def cube(*exprs: Expr) -> Expr:
1485+
"""Create a ``CUBE`` grouping set for use with ``aggregate()``.
1486+
1487+
``CUBE`` generates all possible subsets of the given column list
1488+
as grouping sets. For example, ``cube(a, b)`` produces grouping
1489+
sets ``(a, b)``, ``(a)``, ``(b)``, and ``()`` (grand total).
1490+
1491+
This is equivalent to ``GROUP BY CUBE(a, b)`` in SQL.
1492+
1493+
Args:
1494+
*exprs: Column expressions to include in the cube.
1495+
1496+
Examples:
1497+
With a single column, ``cube`` behaves identically to
1498+
:py:meth:`rollup`:
1499+
1500+
>>> import pyarrow as pa
1501+
>>> import datafusion as dfn
1502+
>>> from datafusion.expr import GroupingSet
1503+
>>> ctx = dfn.SessionContext()
1504+
>>> df = ctx.from_pydict({"a": [1, 1, 2], "b": [10, 20, 30]})
1505+
>>> result = df.aggregate(
1506+
... [GroupingSet.cube(dfn.col("a"))],
1507+
... [dfn.functions.sum(dfn.col("b")).alias("s"),
1508+
... dfn.functions.grouping(dfn.col("a"))],
1509+
... ).sort(dfn.col("a").sort(nulls_first=False))
1510+
>>> batches = result.collect()
1511+
>>> pa.concat_arrays([b.column(2) for b in batches]).to_pylist()
1512+
[0, 0, 1]
1513+
1514+
See Also:
1515+
:py:meth:`rollup`, :py:meth:`grouping_sets`,
1516+
:py:func:`~datafusion.functions.grouping`
1517+
"""
1518+
args = [e.expr for e in exprs]
1519+
return Expr(_GroupingSetInternal.cube(*args))
1520+
1521+
@staticmethod
1522+
def grouping_sets(*expr_lists: list[Expr]) -> Expr:
1523+
"""Create explicit grouping sets for use with ``aggregate()``.
1524+
1525+
Each argument is a list of column expressions representing one
1526+
grouping set. For example, ``grouping_sets([a], [b])`` groups
1527+
by ``a`` alone and by ``b`` alone in a single query.
1528+
1529+
This is equivalent to ``GROUP BY GROUPING SETS ((a), (b))`` in
1530+
SQL.
1531+
1532+
Args:
1533+
*expr_lists: Each positional argument is a list of
1534+
expressions forming one grouping set.
1535+
1536+
Examples:
1537+
>>> import pyarrow as pa
1538+
>>> import datafusion as dfn
1539+
>>> from datafusion.expr import GroupingSet
1540+
>>> ctx = dfn.SessionContext()
1541+
>>> df = ctx.from_pydict(
1542+
... {"a": ["x", "x", "y"], "b": ["m", "n", "m"],
1543+
... "c": [1, 2, 3]})
1544+
>>> result = df.aggregate(
1545+
... [GroupingSet.grouping_sets(
1546+
... [dfn.col("a")], [dfn.col("b")])],
1547+
... [dfn.functions.sum(dfn.col("c")).alias("s"),
1548+
... dfn.functions.grouping(dfn.col("a")),
1549+
... dfn.functions.grouping(dfn.col("b"))],
1550+
... ).sort(
1551+
... dfn.col("a").sort(nulls_first=False),
1552+
... dfn.col("b").sort(nulls_first=False),
1553+
... )
1554+
>>> batches = result.collect()
1555+
>>> pa.concat_arrays(
1556+
... [b.column("s") for b in batches]).to_pylist()
1557+
[3, 3, 4, 2]
1558+
1559+
See Also:
1560+
:py:meth:`rollup`, :py:meth:`cube`,
1561+
:py:func:`~datafusion.functions.grouping`
1562+
"""
1563+
raw_lists = [[e.expr for e in lst] for lst in expr_lists]
1564+
return Expr(_GroupingSetInternal.grouping_sets(*raw_lists))

python/datafusion/functions.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4413,37 +4413,47 @@ def grouping(
44134413
distinct: bool = False,
44144414
filter: Expr | None = None,
44154415
) -> Expr:
4416-
"""Returns 1 if the data is aggregated across the specified column, or 0 otherwise.
4416+
"""Indicates whether a column is aggregated across in the current row.
44174417
4418-
This function is used with ``GROUPING SETS``, ``CUBE``, or ``ROLLUP`` to
4419-
distinguish between aggregated and non-aggregated rows. In a regular
4420-
``GROUP BY`` without grouping sets, it always returns 0.
4418+
Returns 0 when the column is part of the grouping key for that row
4419+
(i.e., the row contains per-group results for that column). Returns 1
4420+
when the column is *not* part of the grouping key (i.e., the row's
4421+
aggregate spans all values of that column).
44214422
4422-
Note: The ``grouping`` aggregate function is rewritten by the query
4423-
optimizer before execution, so it works correctly even though its
4424-
physical plan is not directly implemented.
4423+
This function is meaningful with
4424+
:py:meth:`GroupingSet.rollup <datafusion.expr.GroupingSet.rollup>`,
4425+
:py:meth:`GroupingSet.cube <datafusion.expr.GroupingSet.cube>`, or
4426+
:py:meth:`GroupingSet.grouping_sets <datafusion.expr.GroupingSet.grouping_sets>`,
4427+
where different rows are grouped by different subsets of columns. In a
4428+
regular ``GROUP BY`` without grouping sets every column is always part
4429+
of the key, so ``grouping()`` always returns 0.
44254430
44264431
Args:
44274432
expression: The column to check grouping status for
44284433
distinct: If True, compute on distinct values only
44294434
filter: If provided, only compute against rows for which the filter is True
44304435
44314436
Examples:
4432-
In a simple ``GROUP BY`` (no grouping sets), ``grouping()`` always
4433-
returns 0, indicating the column is part of the grouping key:
4437+
With :py:meth:`~datafusion.expr.GroupingSet.rollup`, the result
4438+
includes both per-group rows (``grouping(a) = 0``) and a
4439+
grand-total row where ``a`` is aggregated across
4440+
(``grouping(a) = 1``):
44344441
44354442
>>> import pyarrow as pa
4443+
>>> from datafusion.expr import GroupingSet
44364444
>>> ctx = dfn.SessionContext()
44374445
>>> df = ctx.from_pydict({"a": [1, 1, 2], "b": [10, 20, 30]})
44384446
>>> result = df.aggregate(
4439-
... [dfn.col("a")],
4440-
... [dfn.functions.grouping(dfn.col("a")),
4441-
... dfn.functions.sum(dfn.col("b")).alias("s")])
4447+
... [GroupingSet.rollup(dfn.col("a"))],
4448+
... [dfn.functions.sum(dfn.col("b")).alias("s"),
4449+
... dfn.functions.grouping(dfn.col("a"))],
4450+
... ).sort(dfn.col("a").sort(nulls_first=False))
44424451
>>> batches = result.collect()
4443-
>>> grouping_vals = pa.concat_arrays(
4444-
... [batch.column(1) for batch in batches]).to_pylist()
4445-
>>> all(v == 0 for v in grouping_vals)
4446-
True
4452+
>>> pa.concat_arrays([b.column(2) for b in batches]).to_pylist()
4453+
[0, 0, 1]
4454+
4455+
See Also:
4456+
:py:class:`~datafusion.expr.GroupingSet`
44474457
"""
44484458
filter_raw = filter.expr if filter is not None else None
44494459
return Expr(f.grouping(expression.expr, distinct=distinct, filter=filter_raw))

python/tests/test_functions.py

Lines changed: 56 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import pytest
2323
from datafusion import SessionContext, column, literal
2424
from datafusion import functions as f
25+
from datafusion.expr import GroupingSet
2526

2627
np.seterr(invalid="ignore")
2728

@@ -1837,33 +1838,72 @@ def test_percentile_cont(filter_expr, expected):
18371838
assert result.column(0)[0].as_py() == expected
18381839

18391840

1840-
def test_grouping():
1841+
def test_rollup():
1842+
# With ROLLUP, per-group rows have grouping()=0 and the grand-total row
1843+
# (where the column is aggregated across) has grouping()=1.
18411844
ctx = SessionContext()
18421845
df = ctx.from_pydict({"a": [1, 1, 2], "b": [10, 20, 30]})
1843-
# In a simple GROUP BY (no grouping sets), grouping() returns 0 for all rows.
18441846
result = df.aggregate(
1845-
[column("a")], [f.grouping(column("a")), f.sum(column("b")).alias("s")]
1846-
).collect()
1847-
grouping_col = pa.concat_arrays([batch.column(1) for batch in result]).to_pylist()
1848-
assert all(v == 0 for v in grouping_col)
1847+
[GroupingSet.rollup(column("a"))],
1848+
[f.sum(column("b")).alias("s"), f.grouping(column("a"))],
1849+
).sort(column("a").sort(ascending=True, nulls_first=False))
1850+
batches = result.collect()
1851+
g = pa.concat_arrays([b.column(2) for b in batches]).to_pylist()
1852+
s = pa.concat_arrays([b.column("s") for b in batches]).to_pylist()
1853+
# Two per-group rows (g=0) plus one grand-total row (g=1)
1854+
assert g == [0, 0, 1]
1855+
assert s == [30, 30, 60]
1856+
1857+
1858+
def test_rollup_multi_column():
1859+
# rollup(a, b) produces grouping sets (a, b), (a), ().
1860+
ctx = SessionContext()
1861+
df = ctx.from_pydict({"a": [1, 1, 2], "b": ["x", "y", "x"], "c": [10, 20, 30]})
1862+
result = df.aggregate(
1863+
[GroupingSet.rollup(column("a"), column("b"))],
1864+
[f.sum(column("c")).alias("s")],
1865+
)
1866+
total_rows = sum(b.num_rows for b in result.collect())
1867+
# 3 detail (a,b) + 2 subtotal (a) + 1 grand total = 6
1868+
assert total_rows == 6
18491869

18501870

1851-
def test_grouping_multiple_columns():
1852-
# Verify grouping() works when multiple columns are in the GROUP BY clause.
1871+
def test_cube():
1872+
# cube(a, b) produces all subsets: (a,b), (a), (b), ().
18531873
ctx = SessionContext()
1854-
df = ctx.from_pydict({"a": [1, 1, 2], "b": [10, 10, 30], "c": [100, 200, 300]})
1874+
df = ctx.from_pydict({"a": [1, 1, 2], "b": ["x", "y", "x"], "c": [10, 20, 30]})
18551875
result = df.aggregate(
1856-
[column("a"), column("b")],
1876+
[GroupingSet.cube(column("a"), column("b"))],
1877+
[f.sum(column("c")).alias("s")],
1878+
)
1879+
total_rows = sum(b.num_rows for b in result.collect())
1880+
# 3 (a,b) + 2 (a) + 2 (b) + 1 () = 8
1881+
assert total_rows == 8
1882+
1883+
1884+
def test_grouping_sets():
1885+
# GROUPING SETS lets you choose exactly which column subsets to group by.
1886+
# Each row's grouping() value tells you which columns are aggregated across.
1887+
ctx = SessionContext()
1888+
df = ctx.from_pydict({"a": ["x", "x", "y"], "b": ["m", "n", "m"], "c": [1, 2, 3]})
1889+
result = df.aggregate(
1890+
[GroupingSet.grouping_sets([column("a")], [column("b")])],
18571891
[
1892+
f.sum(column("c")).alias("s"),
18581893
f.grouping(column("a")),
18591894
f.grouping(column("b")),
1860-
f.sum(column("c")).alias("s"),
18611895
],
1862-
).collect()
1863-
grouping_a = pa.concat_arrays([batch.column(2) for batch in result]).to_pylist()
1864-
grouping_b = pa.concat_arrays([batch.column(3) for batch in result]).to_pylist()
1865-
assert all(v == 0 for v in grouping_a)
1866-
assert all(v == 0 for v in grouping_b)
1896+
).sort(
1897+
column("a").sort(ascending=True, nulls_first=False),
1898+
column("b").sort(ascending=True, nulls_first=False),
1899+
)
1900+
batches = result.collect()
1901+
ga = pa.concat_arrays([b.column(3) for b in batches]).to_pylist()
1902+
gb = pa.concat_arrays([b.column(4) for b in batches]).to_pylist()
1903+
# Rows grouped by (a): ga=0 (a is a key), gb=1 (b is aggregated across)
1904+
# Rows grouped by (b): ga=1 (a is aggregated across), gb=0 (b is a key)
1905+
assert ga == [0, 0, 1, 1]
1906+
assert gb == [1, 1, 0, 0]
18671907

18681908

18691909
def test_var_population():

0 commit comments

Comments
 (0)