Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 180 additions & 0 deletions python/datafusion/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2055,6 +2055,15 @@ def corr(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr:
value_y: The dependent variable for correlation
value_x: The independent variable for correlation
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
... [], [dfn.functions.corr(dfn.col("a"), dfn.col("b")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.corr(value_y.expr, value_x.expr, filter=filter_raw))
Expand Down Expand Up @@ -2101,6 +2110,22 @@ def covar_pop(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr:
value_y: The dependent variable for covariance
value_x: The independent variable for covariance
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> import builtins
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
>>> result = df.aggregate(
... [],
... [dfn.functions.covar_pop(
... dfn.col("a"), dfn.col("b")
... ).alias("v")]
... )
>>> builtins.round(
... result.collect_column("v")[0].as_py(), 4
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This example can be simplified by choosing input values with an exact covariance result instead of importing builtins just to round the output.

That would make covar_pop read more like the surrounding examples.

... )
0.6667
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.covar_pop(value_y.expr, value_x.expr, filter=filter_raw))
Expand All @@ -2118,6 +2143,15 @@ def covar_samp(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr
value_y: The dependent variable for covariance
value_x: The independent variable for covariance
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
>>> result = df.aggregate(
... [], [dfn.functions.covar_samp(dfn.col("a"), dfn.col("b")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.covar_samp(value_y.expr, value_x.expr, filter=filter_raw))
Expand All @@ -2127,6 +2161,15 @@ def covar(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr:
"""Computes the sample covariance.

This is an alias for :py:func:`covar_samp`.

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
>>> result = df.aggregate(
... [], [dfn.functions.covar(dfn.col("a"), dfn.col("b")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since covar() is an alias of covar_samp(), and the new example is a verbatim duplicate of the covar_samp function’s example, do you think keeping the example only on covar_samp() and leaving the alias docstring short would avoid doc drift?

"""
return covar_samp(value_y, value_x, filter)

Expand Down Expand Up @@ -2215,6 +2258,14 @@ def stddev(expression: Expr, filter: Expr | None = None) -> Expr:
Args:
expression: The value to compute the standard deviation of
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]})
>>> result = df.aggregate([], [dfn.functions.stddev(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.stddev(expression.expr, filter=filter_raw))
Expand All @@ -2229,6 +2280,14 @@ def stddev_pop(expression: Expr, filter: Expr | None = None) -> Expr:
Args:
expression: The value to compute the standard deviation of
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 3.0]})
>>> result = df.aggregate([], [dfn.functions.stddev_pop(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.stddev_pop(expression.expr, filter=filter_raw))
Expand All @@ -2238,6 +2297,14 @@ def stddev_samp(arg: Expr, filter: Expr | None = None) -> Expr:
"""Computes the sample standard deviation of the argument.

This is an alias for :py:func:`stddev`.

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]})
>>> result = df.aggregate([], [dfn.functions.stddev_samp(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
return stddev(arg, filter=filter)

Expand All @@ -2246,6 +2313,14 @@ def var(expression: Expr, filter: Expr | None = None) -> Expr:
"""Computes the sample variance of the argument.

This is an alias for :py:func:`var_samp`.

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
>>> result = df.aggregate([], [dfn.functions.var(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
return var_samp(expression, filter)

Expand All @@ -2259,6 +2334,14 @@ def var_pop(expression: Expr, filter: Expr | None = None) -> Expr:
Args:
expression: The variable to compute the variance for
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [0.0, 2.0]})
>>> result = df.aggregate([], [dfn.functions.var_pop(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.var_pop(expression.expr, filter=filter_raw))
Expand All @@ -2273,6 +2356,14 @@ def var_samp(expression: Expr, filter: Expr | None = None) -> Expr:
Args:
expression: The variable to compute the variance for
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
>>> result = df.aggregate([], [dfn.functions.var_samp(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.var_sample(expression.expr, filter=filter_raw))
Expand All @@ -2282,6 +2373,14 @@ def var_sample(expression: Expr, filter: Expr | None = None) -> Expr:
"""Computes the sample variance of the argument.

This is an alias for :py:func:`var_samp`.

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
>>> result = df.aggregate([], [dfn.functions.var_sample(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
return var_samp(expression, filter)

Expand All @@ -2303,6 +2402,15 @@ def regr_avgx(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_avgx(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
5.0
"""
filter_raw = filter.expr if filter is not None else None

Expand All @@ -2326,6 +2434,15 @@ def regr_avgy(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_avgy(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
filter_raw = filter.expr if filter is not None else None

Expand All @@ -2349,6 +2466,15 @@ def regr_count(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_count(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
3
"""
filter_raw = filter.expr if filter is not None else None

Expand All @@ -2372,6 +2498,15 @@ def regr_intercept(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_intercept(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
0.0
"""
filter_raw = filter.expr if filter is not None else None

Expand All @@ -2395,6 +2530,15 @@ def regr_r2(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_r2(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
filter_raw = filter.expr if filter is not None else None

Expand All @@ -2418,6 +2562,15 @@ def regr_slope(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_slope(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
filter_raw = filter.expr if filter is not None else None

Expand All @@ -2441,6 +2594,15 @@ def regr_sxx(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_sxx(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
filter_raw = filter.expr if filter is not None else None

Expand All @@ -2464,6 +2626,15 @@ def regr_sxy(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_sxy(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
filter_raw = filter.expr if filter is not None else None

Expand All @@ -2487,6 +2658,15 @@ def regr_syy(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_syy(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
filter_raw = filter.expr if filter is not None else None

Expand Down
Loading