Skip to content

Commit 74c1485

Browse files
timsaucer and claude
committed
Address PR review feedback: add quantile_cont alias and simplify examples
- Add quantile_cont as alias for percentile_cont (matches upstream) - Replace pa.concat_arrays batch pattern with collect_column() in docstrings - Add percentile_cont, quantile_cont, var_population to docs function list Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 18ab457 commit 74c1485

File tree

4 files changed

+31
-22
lines changed

4 files changed

+31
-22
lines changed

docs/source/user-guide/common-operations/aggregations.rst

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,7 @@ The available aggregate functions are:
354354
- :py:func:`datafusion.functions.stddev_pop`
355355
- :py:func:`datafusion.functions.var_samp`
356356
- :py:func:`datafusion.functions.var_pop`
357+
- :py:func:`datafusion.functions.var_population`
357358
6. Linear Regression Functions
358359
- :py:func:`datafusion.functions.regr_count`
359360
- :py:func:`datafusion.functions.regr_slope`
@@ -370,7 +371,9 @@ The available aggregate functions are:
370371
- :py:func:`datafusion.functions.nth_value`
371372
8. String Functions
372373
- :py:func:`datafusion.functions.string_agg`
373-
9. Approximation Functions
374+
9. Percentile Functions
375+
- :py:func:`datafusion.functions.percentile_cont`
376+
- :py:func:`datafusion.functions.quantile_cont`
374377
- :py:func:`datafusion.functions.approx_distinct`
375378
- :py:func:`datafusion.functions.approx_median`
376379
- :py:func:`datafusion.functions.approx_percentile_cont`

python/datafusion/expr.py

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1458,7 +1458,6 @@ def rollup(*exprs: Expr) -> Expr:
14581458
*exprs: Column expressions to include in the rollup.
14591459
14601460
Examples:
1461-
>>> import pyarrow as pa
14621461
>>> import datafusion as dfn
14631462
>>> from datafusion.expr import GroupingSet
14641463
>>> ctx = dfn.SessionContext()
@@ -1468,8 +1467,7 @@ def rollup(*exprs: Expr) -> Expr:
14681467
... [dfn.functions.sum(dfn.col("b")).alias("s"),
14691468
... dfn.functions.grouping(dfn.col("a"))],
14701469
... ).sort(dfn.col("a").sort(nulls_first=False))
1471-
>>> batches = result.collect()
1472-
>>> pa.concat_arrays([b.column("s") for b in batches]).to_pylist()
1470+
>>> result.collect_column("s").to_pylist()
14731471
[30, 30, 60]
14741472
14751473
See Also:
@@ -1496,7 +1494,6 @@ def cube(*exprs: Expr) -> Expr:
14961494
With a single column, ``cube`` behaves identically to
14971495
:py:meth:`rollup`:
14981496
1499-
>>> import pyarrow as pa
15001497
>>> import datafusion as dfn
15011498
>>> from datafusion.expr import GroupingSet
15021499
>>> ctx = dfn.SessionContext()
@@ -1506,9 +1503,8 @@ def cube(*exprs: Expr) -> Expr:
15061503
... [dfn.functions.sum(dfn.col("b")).alias("s"),
15071504
... dfn.functions.grouping(dfn.col("a"))],
15081505
... ).sort(dfn.col("a").sort(nulls_first=False))
1509-
>>> batches = result.collect()
1510-
>>> pa.concat_arrays([b.column(2) for b in batches]).to_pylist()
1511-
[0, 0, 1]
1506+
>>> result.collect_column("s").to_pylist()
1507+
[30, 30, 60]
15121508
15131509
See Also:
15141510
:py:meth:`rollup`, :py:meth:`grouping_sets`,
@@ -1533,7 +1529,6 @@ def grouping_sets(*expr_lists: list[Expr]) -> Expr:
15331529
expressions forming one grouping set.
15341530
15351531
Examples:
1536-
>>> import pyarrow as pa
15371532
>>> import datafusion as dfn
15381533
>>> from datafusion.expr import GroupingSet
15391534
>>> ctx = dfn.SessionContext()
@@ -1550,9 +1545,7 @@ def grouping_sets(*expr_lists: list[Expr]) -> Expr:
15501545
... dfn.col("a").sort(nulls_first=False),
15511546
... dfn.col("b").sort(nulls_first=False),
15521547
... )
1553-
>>> batches = result.collect()
1554-
>>> pa.concat_arrays(
1555-
... [b.column("s") for b in batches]).to_pylist()
1548+
>>> result.collect_column("s").to_pylist()
15561549
[3, 3, 4, 2]
15571550
15581551
See Also:

python/datafusion/functions.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,7 @@
261261
"pi",
262262
"pow",
263263
"power",
264+
"quantile_cont",
264265
"radians",
265266
"random",
266267
"range",
@@ -4350,6 +4351,19 @@ def percentile_cont(
43504351
return Expr(f.percentile_cont(sort_expr_raw, percentile, filter=filter_raw))
43514352

43524353

4354+
def quantile_cont(
    sort_expression: Expr | SortExpr,
    percentile: float,
    filter: Expr | None = None,
) -> Expr:
    """Compute the exact percentile of input values using continuous interpolation.

    This is an alias for :py:func:`percentile_cont`, provided to match the
    upstream DataFusion function name; all arguments are forwarded unchanged.

    Args:
        sort_expression: The values (optionally with a sort order) over which
            the percentile is computed.
        percentile: The percentile to compute (e.g. ``0.5`` for the median).
        filter: Optional filter expression forwarded to
            :py:func:`percentile_cont`.

    Returns:
        An :py:class:`Expr` representing the aggregate computation.

    See Also:
        :py:func:`percentile_cont`
    """
    return percentile_cont(sort_expression, percentile, filter)
4365+
4366+
43534367
def array_agg(
43544368
expression: Expr,
43554369
distinct: bool = False,
@@ -4449,7 +4463,6 @@ def grouping(
44494463
grand-total row where ``a`` is aggregated across
44504464
(``grouping(a) = 1``):
44514465
4452-
>>> import pyarrow as pa
44534466
>>> from datafusion.expr import GroupingSet
44544467
>>> ctx = dfn.SessionContext()
44554468
>>> df = ctx.from_pydict({"a": [1, 1, 2], "b": [10, 20, 30]})
@@ -4458,9 +4471,8 @@ def grouping(
44584471
... [dfn.functions.sum(dfn.col("b")).alias("s"),
44594472
... dfn.functions.grouping(dfn.col("a"))],
44604473
... ).sort(dfn.col("a").sort(nulls_first=False))
4461-
>>> batches = result.collect()
4462-
>>> pa.concat_arrays([b.column(2) for b in batches]).to_pylist()
4463-
[0, 0, 1]
4474+
>>> result.collect_column("s").to_pylist()
4475+
[30, 30, 60]
44644476
44654477
See Also:
44664478
:py:class:`~datafusion.expr.GroupingSet`

python/tests/test_functions.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1822,18 +1822,19 @@ def test_conditional_functions(df_with_nulls, expr, expected):
18221822

18231823

18241824
@pytest.mark.parametrize(
    ("func", "filter_expr", "expected"),
    [
        (f.percentile_cont, None, 3.0),
        (f.percentile_cont, column("a") > literal(1.0), 3.5),
        (f.quantile_cont, None, 3.0),
    ],
    ids=["no_filter", "with_filter", "quantile_cont_alias"],
)
def test_percentile_cont(func, filter_expr, expected):
    """Exact percentile over a small column, with and without a filter."""
    ctx = SessionContext()
    df = ctx.from_pydict({"a": [1.0, 2.0, 3.0, 4.0, 5.0]})
    # Build the aggregate expression once, then run a global (no group-by)
    # aggregation and inspect the first (only) record batch.
    agg_expr = func(column("a"), 0.5, filter=filter_expr).alias("v")
    batch = df.aggregate([], [agg_expr]).collect()[0]
    assert batch.column(0)[0].as_py() == expected
18391840

0 commit comments

Comments (0)