-
Notifications
You must be signed in to change notification settings - Fork 151
Add docstring examples for Aggregate statistical and regression functions #1417
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 1 commit
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
aca9d13
Add docstring examples for Aggregate statistical and regression funct…
ntjohnson1 1b8f920
Simplify covar
ntjohnson1 e03b5e0
Make sure everything is google doc style
ntjohnson1 1f6cadc
Merge branch 'main' of github.com:apache/datafusion-python into nick/…
ntjohnson1 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2055,6 +2055,15 @@ def corr(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr: | |
| value_y: The dependent variable for correlation | ||
| value_x: The independent variable for correlation | ||
| filter: If provided, only compute against rows for which the filter is True | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [1.0, 2.0, 3.0]}) | ||
| >>> result = df.aggregate( | ||
| ... [], [dfn.functions.corr(dfn.col("a"), dfn.col("b")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 1.0 | ||
| """ | ||
| filter_raw = filter.expr if filter is not None else None | ||
| return Expr(f.corr(value_y.expr, value_x.expr, filter=filter_raw)) | ||
|
|
@@ -2101,6 +2110,22 @@ def covar_pop(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr: | |
| value_y: The dependent variable for covariance | ||
| value_x: The independent variable for covariance | ||
| filter: If provided, only compute against rows for which the filter is True | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> import builtins | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) | ||
| >>> result = df.aggregate( | ||
| ... [], | ||
| ... [dfn.functions.covar_pop( | ||
| ... dfn.col("a"), dfn.col("b") | ||
| ... ).alias("v")] | ||
| ... ) | ||
| >>> builtins.round( | ||
| ... result.collect_column("v")[0].as_py(), 4 | ||
| ... ) | ||
| 0.6667 | ||
| """ | ||
| filter_raw = filter.expr if filter is not None else None | ||
| return Expr(f.covar_pop(value_y.expr, value_x.expr, filter=filter_raw)) | ||
|
|
@@ -2118,6 +2143,15 @@ def covar_samp(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr | |
| value_y: The dependent variable for covariance | ||
| value_x: The independent variable for covariance | ||
| filter: If provided, only compute against rows for which the filter is True | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) | ||
| >>> result = df.aggregate( | ||
| ... [], [dfn.functions.covar_samp(dfn.col("a"), dfn.col("b")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 1.0 | ||
| """ | ||
| filter_raw = filter.expr if filter is not None else None | ||
| return Expr(f.covar_samp(value_y.expr, value_x.expr, filter=filter_raw)) | ||
|
|
@@ -2127,6 +2161,15 @@ def covar(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr: | |
| """Computes the sample covariance. | ||
|
|
||
| This is an alias for :py:func:`covar_samp`. | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) | ||
| >>> result = df.aggregate( | ||
| ... [], [dfn.functions.covar(dfn.col("a"), dfn.col("b")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 1.0 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since |
||
| """ | ||
| return covar_samp(value_y, value_x, filter) | ||
|
|
||
|
|
@@ -2215,6 +2258,14 @@ def stddev(expression: Expr, filter: Expr | None = None) -> Expr: | |
| Args: | ||
| expression: The value to find the minimum of | ||
| filter: If provided, only compute against rows for which the filter is True | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]}) | ||
| >>> result = df.aggregate([], [dfn.functions.stddev(dfn.col("a")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 2.0 | ||
| """ | ||
| filter_raw = filter.expr if filter is not None else None | ||
| return Expr(f.stddev(expression.expr, filter=filter_raw)) | ||
|
|
@@ -2229,6 +2280,14 @@ def stddev_pop(expression: Expr, filter: Expr | None = None) -> Expr: | |
| Args: | ||
| expression: The value to find the minimum of | ||
| filter: If provided, only compute against rows for which the filter is True | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"a": [1.0, 3.0]}) | ||
| >>> result = df.aggregate([], [dfn.functions.stddev_pop(dfn.col("a")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 1.0 | ||
| """ | ||
| filter_raw = filter.expr if filter is not None else None | ||
| return Expr(f.stddev_pop(expression.expr, filter=filter_raw)) | ||
|
|
@@ -2238,6 +2297,14 @@ def stddev_samp(arg: Expr, filter: Expr | None = None) -> Expr: | |
| """Computes the sample standard deviation of the argument. | ||
|
|
||
| This is an alias for :py:func:`stddev`. | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]}) | ||
| >>> result = df.aggregate([], [dfn.functions.stddev_samp(dfn.col("a")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 2.0 | ||
| """ | ||
| return stddev(arg, filter=filter) | ||
|
|
||
|
|
@@ -2246,6 +2313,14 @@ def var(expression: Expr, filter: Expr | None = None) -> Expr: | |
| """Computes the sample variance of the argument. | ||
|
|
||
| This is an alias for :py:func:`var_samp`. | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) | ||
| >>> result = df.aggregate([], [dfn.functions.var(dfn.col("a")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 1.0 | ||
| """ | ||
| return var_samp(expression, filter) | ||
|
|
||
|
|
@@ -2259,6 +2334,14 @@ def var_pop(expression: Expr, filter: Expr | None = None) -> Expr: | |
| Args: | ||
| expression: The variable to compute the variance for | ||
| filter: If provided, only compute against rows for which the filter is True | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"a": [0.0, 2.0]}) | ||
| >>> result = df.aggregate([], [dfn.functions.var_pop(dfn.col("a")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 1.0 | ||
| """ | ||
| filter_raw = filter.expr if filter is not None else None | ||
| return Expr(f.var_pop(expression.expr, filter=filter_raw)) | ||
|
|
@@ -2273,6 +2356,14 @@ def var_samp(expression: Expr, filter: Expr | None = None) -> Expr: | |
| Args: | ||
| expression: The variable to compute the variance for | ||
| filter: If provided, only compute against rows for which the filter is True | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) | ||
| >>> result = df.aggregate([], [dfn.functions.var_samp(dfn.col("a")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 1.0 | ||
| """ | ||
| filter_raw = filter.expr if filter is not None else None | ||
| return Expr(f.var_sample(expression.expr, filter=filter_raw)) | ||
|
|
@@ -2282,6 +2373,14 @@ def var_sample(expression: Expr, filter: Expr | None = None) -> Expr: | |
| """Computes the sample variance of the argument. | ||
|
|
||
| This is an alias for :py:func:`var_samp`. | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) | ||
| >>> result = df.aggregate([], [dfn.functions.var_sample(dfn.col("a")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 1.0 | ||
| """ | ||
| return var_samp(expression, filter) | ||
|
|
||
|
|
@@ -2303,6 +2402,15 @@ def regr_avgx( | |
| y: The linear regression dependent variable | ||
| x: The linear regression independent variable | ||
| filter: If provided, only compute against rows for which the filter is True | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]}) | ||
| >>> result = df.aggregate( | ||
| ... [], [dfn.functions.regr_avgx(dfn.col("y"), dfn.col("x")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 5.0 | ||
| """ | ||
| filter_raw = filter.expr if filter is not None else None | ||
|
|
||
|
|
@@ -2326,6 +2434,15 @@ def regr_avgy( | |
| y: The linear regression dependent variable | ||
| x: The linear regression independent variable | ||
| filter: If provided, only compute against rows for which the filter is True | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]}) | ||
| >>> result = df.aggregate( | ||
| ... [], [dfn.functions.regr_avgy(dfn.col("y"), dfn.col("x")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 2.0 | ||
| """ | ||
| filter_raw = filter.expr if filter is not None else None | ||
|
|
||
|
|
@@ -2349,6 +2466,15 @@ def regr_count( | |
| y: The linear regression dependent variable | ||
| x: The linear regression independent variable | ||
| filter: If provided, only compute against rows for which the filter is True | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]}) | ||
| >>> result = df.aggregate( | ||
| ... [], [dfn.functions.regr_count(dfn.col("y"), dfn.col("x")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 3 | ||
| """ | ||
| filter_raw = filter.expr if filter is not None else None | ||
|
|
||
|
|
@@ -2372,6 +2498,15 @@ def regr_intercept( | |
| y: The linear regression dependent variable | ||
| x: The linear regression independent variable | ||
| filter: If provided, only compute against rows for which the filter is True | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]}) | ||
| >>> result = df.aggregate( | ||
| ... [], [dfn.functions.regr_intercept(dfn.col("y"), dfn.col("x")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 0.0 | ||
| """ | ||
| filter_raw = filter.expr if filter is not None else None | ||
|
|
||
|
|
@@ -2395,6 +2530,15 @@ def regr_r2( | |
| y: The linear regression dependent variable | ||
| x: The linear regression independent variable | ||
| filter: If provided, only compute against rows for which the filter is True | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]}) | ||
| >>> result = df.aggregate( | ||
| ... [], [dfn.functions.regr_r2(dfn.col("y"), dfn.col("x")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 1.0 | ||
| """ | ||
| filter_raw = filter.expr if filter is not None else None | ||
|
|
||
|
|
@@ -2418,6 +2562,15 @@ def regr_slope( | |
| y: The linear regression dependent variable | ||
| x: The linear regression independent variable | ||
| filter: If provided, only compute against rows for which the filter is True | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]}) | ||
| >>> result = df.aggregate( | ||
| ... [], [dfn.functions.regr_slope(dfn.col("y"), dfn.col("x")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 2.0 | ||
| """ | ||
| filter_raw = filter.expr if filter is not None else None | ||
|
|
||
|
|
@@ -2441,6 +2594,15 @@ def regr_sxx( | |
| y: The linear regression dependent variable | ||
| x: The linear regression independent variable | ||
| filter: If provided, only compute against rows for which the filter is True | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]}) | ||
| >>> result = df.aggregate( | ||
| ... [], [dfn.functions.regr_sxx(dfn.col("y"), dfn.col("x")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 2.0 | ||
| """ | ||
| filter_raw = filter.expr if filter is not None else None | ||
|
|
||
|
|
@@ -2464,6 +2626,15 @@ def regr_sxy( | |
| y: The linear regression dependent variable | ||
| x: The linear regression independent variable | ||
| filter: If provided, only compute against rows for which the filter is True | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]}) | ||
| >>> result = df.aggregate( | ||
| ... [], [dfn.functions.regr_sxy(dfn.col("y"), dfn.col("x")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 2.0 | ||
| """ | ||
| filter_raw = filter.expr if filter is not None else None | ||
|
|
||
|
|
@@ -2487,6 +2658,15 @@ def regr_syy( | |
| y: The linear regression dependent variable | ||
| x: The linear regression independent variable | ||
| filter: If provided, only compute against rows for which the filter is True | ||
|
|
||
| Examples: | ||
| --------- | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]}) | ||
| >>> result = df.aggregate( | ||
| ... [], [dfn.functions.regr_syy(dfn.col("y"), dfn.col("x")).alias("v")]) | ||
| >>> result.collect_column("v")[0].as_py() | ||
| 2.0 | ||
| """ | ||
| filter_raw = filter.expr if filter is not None else None | ||
|
|
||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This example can be simplified by choosing input values with an exact covariance result instead of importing
`builtins` just to round the output. That would make
`covar_pop` read more like the surrounding examples.