Skip to content
This repository was archived by the owner on Jun 9, 2026. It is now read-only.

Commit aca9d13

Browse files
ntjohnson1 and claude committed
Add docstring examples for Aggregate statistical and regression functions
Add example usage to docstrings for Aggregate statistical and regression functions to improve documentation. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1160d5a commit aca9d13

1 file changed

Lines changed: 180 additions & 0 deletions

File tree

python/datafusion/functions.py

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2055,6 +2055,15 @@ def corr(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr:
20552055
value_y: The dependent variable for correlation
20562056
value_x: The independent variable for correlation
20572057
filter: If provided, only compute against rows for which the filter is True
2058+
2059+
Examples:
2060+
---------
2061+
>>> ctx = dfn.SessionContext()
2062+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [1.0, 2.0, 3.0]})
2063+
>>> result = df.aggregate(
2064+
... [], [dfn.functions.corr(dfn.col("a"), dfn.col("b")).alias("v")])
2065+
>>> result.collect_column("v")[0].as_py()
2066+
1.0
20582067
"""
20592068
filter_raw = filter.expr if filter is not None else None
20602069
return Expr(f.corr(value_y.expr, value_x.expr, filter=filter_raw))
@@ -2101,6 +2110,22 @@ def covar_pop(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr:
21012110
value_y: The dependent variable for covariance
21022111
value_x: The independent variable for covariance
21032112
filter: If provided, only compute against rows for which the filter is True
2113+
2114+
Examples:
2115+
---------
2116+
>>> import builtins
2117+
>>> ctx = dfn.SessionContext()
2118+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
2119+
>>> result = df.aggregate(
2120+
... [],
2121+
... [dfn.functions.covar_pop(
2122+
... dfn.col("a"), dfn.col("b")
2123+
... ).alias("v")]
2124+
... )
2125+
>>> builtins.round(
2126+
... result.collect_column("v")[0].as_py(), 4
2127+
... )
2128+
0.6667
21042129
"""
21052130
filter_raw = filter.expr if filter is not None else None
21062131
return Expr(f.covar_pop(value_y.expr, value_x.expr, filter=filter_raw))
@@ -2118,6 +2143,15 @@ def covar_samp(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr
21182143
value_y: The dependent variable for covariance
21192144
value_x: The independent variable for covariance
21202145
filter: If provided, only compute against rows for which the filter is True
2146+
2147+
Examples:
2148+
---------
2149+
>>> ctx = dfn.SessionContext()
2150+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
2151+
>>> result = df.aggregate(
2152+
... [], [dfn.functions.covar_samp(dfn.col("a"), dfn.col("b")).alias("v")])
2153+
>>> result.collect_column("v")[0].as_py()
2154+
1.0
21212155
"""
21222156
filter_raw = filter.expr if filter is not None else None
21232157
return Expr(f.covar_samp(value_y.expr, value_x.expr, filter=filter_raw))
@@ -2127,6 +2161,15 @@ def covar(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr:
21272161
"""Computes the sample covariance.
21282162
21292163
This is an alias for :py:func:`covar_samp`.
2164+
2165+
Examples:
2166+
---------
2167+
>>> ctx = dfn.SessionContext()
2168+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
2169+
>>> result = df.aggregate(
2170+
... [], [dfn.functions.covar(dfn.col("a"), dfn.col("b")).alias("v")])
2171+
>>> result.collect_column("v")[0].as_py()
2172+
1.0
21302173
"""
21312174
return covar_samp(value_y, value_x, filter)
21322175

@@ -2215,6 +2258,14 @@ def stddev(expression: Expr, filter: Expr | None = None) -> Expr:
22152258
Args:
22162259
expression: The value to compute the sample standard deviation of
22172260
filter: If provided, only compute against rows for which the filter is True
2261+
2262+
Examples:
2263+
---------
2264+
>>> ctx = dfn.SessionContext()
2265+
>>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]})
2266+
>>> result = df.aggregate([], [dfn.functions.stddev(dfn.col("a")).alias("v")])
2267+
>>> result.collect_column("v")[0].as_py()
2268+
2.0
22182269
"""
22192270
filter_raw = filter.expr if filter is not None else None
22202271
return Expr(f.stddev(expression.expr, filter=filter_raw))
@@ -2229,6 +2280,14 @@ def stddev_pop(expression: Expr, filter: Expr | None = None) -> Expr:
22292280
Args:
22302281
expression: The value to compute the population standard deviation of
22312282
filter: If provided, only compute against rows for which the filter is True
2283+
2284+
Examples:
2285+
---------
2286+
>>> ctx = dfn.SessionContext()
2287+
>>> df = ctx.from_pydict({"a": [1.0, 3.0]})
2288+
>>> result = df.aggregate([], [dfn.functions.stddev_pop(dfn.col("a")).alias("v")])
2289+
>>> result.collect_column("v")[0].as_py()
2290+
1.0
22322291
"""
22332292
filter_raw = filter.expr if filter is not None else None
22342293
return Expr(f.stddev_pop(expression.expr, filter=filter_raw))
@@ -2238,6 +2297,14 @@ def stddev_samp(arg: Expr, filter: Expr | None = None) -> Expr:
22382297
"""Computes the sample standard deviation of the argument.
22392298
22402299
This is an alias for :py:func:`stddev`.
2300+
2301+
Examples:
2302+
---------
2303+
>>> ctx = dfn.SessionContext()
2304+
>>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]})
2305+
>>> result = df.aggregate([], [dfn.functions.stddev_samp(dfn.col("a")).alias("v")])
2306+
>>> result.collect_column("v")[0].as_py()
2307+
2.0
22412308
"""
22422309
return stddev(arg, filter=filter)
22432310

@@ -2246,6 +2313,14 @@ def var(expression: Expr, filter: Expr | None = None) -> Expr:
22462313
"""Computes the sample variance of the argument.
22472314
22482315
This is an alias for :py:func:`var_samp`.
2316+
2317+
Examples:
2318+
---------
2319+
>>> ctx = dfn.SessionContext()
2320+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
2321+
>>> result = df.aggregate([], [dfn.functions.var(dfn.col("a")).alias("v")])
2322+
>>> result.collect_column("v")[0].as_py()
2323+
1.0
22492324
"""
22502325
return var_samp(expression, filter)
22512326

@@ -2259,6 +2334,14 @@ def var_pop(expression: Expr, filter: Expr | None = None) -> Expr:
22592334
Args:
22602335
expression: The variable to compute the variance for
22612336
filter: If provided, only compute against rows for which the filter is True
2337+
2338+
Examples:
2339+
---------
2340+
>>> ctx = dfn.SessionContext()
2341+
>>> df = ctx.from_pydict({"a": [0.0, 2.0]})
2342+
>>> result = df.aggregate([], [dfn.functions.var_pop(dfn.col("a")).alias("v")])
2343+
>>> result.collect_column("v")[0].as_py()
2344+
1.0
22622345
"""
22632346
filter_raw = filter.expr if filter is not None else None
22642347
return Expr(f.var_pop(expression.expr, filter=filter_raw))
@@ -2273,6 +2356,14 @@ def var_samp(expression: Expr, filter: Expr | None = None) -> Expr:
22732356
Args:
22742357
expression: The variable to compute the variance for
22752358
filter: If provided, only compute against rows for which the filter is True
2359+
2360+
Examples:
2361+
---------
2362+
>>> ctx = dfn.SessionContext()
2363+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
2364+
>>> result = df.aggregate([], [dfn.functions.var_samp(dfn.col("a")).alias("v")])
2365+
>>> result.collect_column("v")[0].as_py()
2366+
1.0
22762367
"""
22772368
filter_raw = filter.expr if filter is not None else None
22782369
return Expr(f.var_sample(expression.expr, filter=filter_raw))
@@ -2282,6 +2373,14 @@ def var_sample(expression: Expr, filter: Expr | None = None) -> Expr:
22822373
"""Computes the sample variance of the argument.
22832374
22842375
This is an alias for :py:func:`var_samp`.
2376+
2377+
Examples:
2378+
---------
2379+
>>> ctx = dfn.SessionContext()
2380+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
2381+
>>> result = df.aggregate([], [dfn.functions.var_sample(dfn.col("a")).alias("v")])
2382+
>>> result.collect_column("v")[0].as_py()
2383+
1.0
22852384
"""
22862385
return var_samp(expression, filter)
22872386

@@ -2303,6 +2402,15 @@ def regr_avgx(
23032402
y: The linear regression dependent variable
23042403
x: The linear regression independent variable
23052404
filter: If provided, only compute against rows for which the filter is True
2405+
2406+
Examples:
2407+
---------
2408+
>>> ctx = dfn.SessionContext()
2409+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
2410+
>>> result = df.aggregate(
2411+
... [], [dfn.functions.regr_avgx(dfn.col("y"), dfn.col("x")).alias("v")])
2412+
>>> result.collect_column("v")[0].as_py()
2413+
5.0
23062414
"""
23072415
filter_raw = filter.expr if filter is not None else None
23082416

@@ -2326,6 +2434,15 @@ def regr_avgy(
23262434
y: The linear regression dependent variable
23272435
x: The linear regression independent variable
23282436
filter: If provided, only compute against rows for which the filter is True
2437+
2438+
Examples:
2439+
---------
2440+
>>> ctx = dfn.SessionContext()
2441+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
2442+
>>> result = df.aggregate(
2443+
... [], [dfn.functions.regr_avgy(dfn.col("y"), dfn.col("x")).alias("v")])
2444+
>>> result.collect_column("v")[0].as_py()
2445+
2.0
23292446
"""
23302447
filter_raw = filter.expr if filter is not None else None
23312448

@@ -2349,6 +2466,15 @@ def regr_count(
23492466
y: The linear regression dependent variable
23502467
x: The linear regression independent variable
23512468
filter: If provided, only compute against rows for which the filter is True
2469+
2470+
Examples:
2471+
---------
2472+
>>> ctx = dfn.SessionContext()
2473+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
2474+
>>> result = df.aggregate(
2475+
... [], [dfn.functions.regr_count(dfn.col("y"), dfn.col("x")).alias("v")])
2476+
>>> result.collect_column("v")[0].as_py()
2477+
3
23522478
"""
23532479
filter_raw = filter.expr if filter is not None else None
23542480

@@ -2372,6 +2498,15 @@ def regr_intercept(
23722498
y: The linear regression dependent variable
23732499
x: The linear regression independent variable
23742500
filter: If provided, only compute against rows for which the filter is True
2501+
2502+
Examples:
2503+
---------
2504+
>>> ctx = dfn.SessionContext()
2505+
>>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
2506+
>>> result = df.aggregate(
2507+
... [], [dfn.functions.regr_intercept(dfn.col("y"), dfn.col("x")).alias("v")])
2508+
>>> result.collect_column("v")[0].as_py()
2509+
0.0
23752510
"""
23762511
filter_raw = filter.expr if filter is not None else None
23772512

@@ -2395,6 +2530,15 @@ def regr_r2(
23952530
y: The linear regression dependent variable
23962531
x: The linear regression independent variable
23972532
filter: If provided, only compute against rows for which the filter is True
2533+
2534+
Examples:
2535+
---------
2536+
>>> ctx = dfn.SessionContext()
2537+
>>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
2538+
>>> result = df.aggregate(
2539+
... [], [dfn.functions.regr_r2(dfn.col("y"), dfn.col("x")).alias("v")])
2540+
>>> result.collect_column("v")[0].as_py()
2541+
1.0
23982542
"""
23992543
filter_raw = filter.expr if filter is not None else None
24002544

@@ -2418,6 +2562,15 @@ def regr_slope(
24182562
y: The linear regression dependent variable
24192563
x: The linear regression independent variable
24202564
filter: If provided, only compute against rows for which the filter is True
2565+
2566+
Examples:
2567+
---------
2568+
>>> ctx = dfn.SessionContext()
2569+
>>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
2570+
>>> result = df.aggregate(
2571+
... [], [dfn.functions.regr_slope(dfn.col("y"), dfn.col("x")).alias("v")])
2572+
>>> result.collect_column("v")[0].as_py()
2573+
2.0
24212574
"""
24222575
filter_raw = filter.expr if filter is not None else None
24232576

@@ -2441,6 +2594,15 @@ def regr_sxx(
24412594
y: The linear regression dependent variable
24422595
x: The linear regression independent variable
24432596
filter: If provided, only compute against rows for which the filter is True
2597+
2598+
Examples:
2599+
---------
2600+
>>> ctx = dfn.SessionContext()
2601+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
2602+
>>> result = df.aggregate(
2603+
... [], [dfn.functions.regr_sxx(dfn.col("y"), dfn.col("x")).alias("v")])
2604+
>>> result.collect_column("v")[0].as_py()
2605+
2.0
24442606
"""
24452607
filter_raw = filter.expr if filter is not None else None
24462608

@@ -2464,6 +2626,15 @@ def regr_sxy(
24642626
y: The linear regression dependent variable
24652627
x: The linear regression independent variable
24662628
filter: If provided, only compute against rows for which the filter is True
2629+
2630+
Examples:
2631+
---------
2632+
>>> ctx = dfn.SessionContext()
2633+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
2634+
>>> result = df.aggregate(
2635+
... [], [dfn.functions.regr_sxy(dfn.col("y"), dfn.col("x")).alias("v")])
2636+
>>> result.collect_column("v")[0].as_py()
2637+
2.0
24672638
"""
24682639
filter_raw = filter.expr if filter is not None else None
24692640

@@ -2487,6 +2658,15 @@ def regr_syy(
24872658
y: The linear regression dependent variable
24882659
x: The linear regression independent variable
24892660
filter: If provided, only compute against rows for which the filter is True
2661+
2662+
Examples:
2663+
---------
2664+
>>> ctx = dfn.SessionContext()
2665+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
2666+
>>> result = df.aggregate(
2667+
... [], [dfn.functions.regr_syy(dfn.col("y"), dfn.col("x")).alias("v")])
2668+
>>> result.collect_column("v")[0].as_py()
2669+
2.0
24902670
"""
24912671
filter_raw = filter.expr if filter is not None else None
24922672

0 commit comments

Comments
 (0)