Skip to content

Commit d62faef

Browse files
timsaucer and claude committed
Add doctest examples to new DataFrame method docstrings
Add >>> style usage examples for window, explain, except_distinct, intersect_distinct, union_by_name, union_by_name_distinct, distinct_on, sort_by, and unnest_columns to match existing docstring conventions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent bb18bf9 commit d62faef

File tree

1 file changed

+97
-0
lines changed

1 file changed

+97
-0
lines changed

python/datafusion/dataframe.py

Lines changed: 97 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -498,6 +498,21 @@ def window(self, *exprs: Expr) -> DataFrame:
498498
499499
Returns:
500500
DataFrame with new window function columns appended.
501+
502+
Examples:
503+
Add a row number within each group:
504+
505+
>>> import datafusion.functions as f
506+
>>> from datafusion import col
507+
>>> ctx = dfn.SessionContext()
508+
>>> df = ctx.from_pydict({"a": [1, 2, 3], "b": ["x", "x", "y"]})
509+
>>> df = df.window(
510+
... f.row_number(
511+
... partition_by=[col("b")], order_by=[col("a")]
512+
... ).alias("rn")
513+
... )
514+
>>> "rn" in df.schema().names
515+
True
501516
"""
502517
raw = expr_list_to_raw_expr_list(exprs)
503518
return DataFrame(self.df.window(*raw))
@@ -967,6 +982,18 @@ def explain(
967982
analyze: If ``True``, the plan will run and metrics reported.
968983
format: Output format for the plan. Defaults to
969984
:py:attr:`ExplainFormat.INDENT`.
985+
986+
Examples:
987+
Show the plan in tree format:
988+
989+
>>> from datafusion import ExplainFormat
990+
>>> ctx = dfn.SessionContext()
991+
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
992+
>>> df.explain(format=ExplainFormat.TREE) # doctest: +SKIP
993+
994+
Show plan with runtime metrics:
995+
996+
>>> df.explain(analyze=True) # doctest: +SKIP
970997
"""
971998
fmt = format.value if format is not None else None
972999
self.df.explain(verbose, analyze, fmt)
@@ -1092,6 +1119,15 @@ def except_distinct(self, other: DataFrame) -> DataFrame:
10921119
10931120
Returns:
10941121
DataFrame after set difference with deduplication.
1122+
1123+
Examples:
1124+
Remove rows present in ``df2`` and deduplicate:
1125+
1126+
>>> ctx = dfn.SessionContext()
1127+
>>> df1 = ctx.from_pydict({"a": [1, 2, 3, 1], "b": [10, 20, 30, 10]})
1128+
>>> df2 = ctx.from_pydict({"a": [1, 2], "b": [10, 20]})
1129+
>>> df1.except_distinct(df2).sort("a").to_pydict()
1130+
{'a': [3], 'b': [30]}
10951131
"""
10961132
return DataFrame(self.df.except_distinct(other.df))
10971133

@@ -1108,6 +1144,15 @@ def intersect_distinct(self, other: DataFrame) -> DataFrame:
11081144
11091145
Returns:
11101146
DataFrame after intersection with deduplication.
1147+
1148+
Examples:
1149+
Find rows common to both DataFrames:
1150+
1151+
>>> ctx = dfn.SessionContext()
1152+
>>> df1 = ctx.from_pydict({"a": [1, 2, 3], "b": [10, 20, 30]})
1153+
>>> df2 = ctx.from_pydict({"a": [1, 4], "b": [10, 40]})
1154+
>>> df1.intersect_distinct(df2).to_pydict()
1155+
{'a': [1], 'b': [10]}
11111156
"""
11121157
return DataFrame(self.df.intersect_distinct(other.df))
11131158

@@ -1123,6 +1168,15 @@ def union_by_name(self, other: DataFrame) -> DataFrame:
11231168
11241169
Returns:
11251170
DataFrame after union by name.
1171+
1172+
Examples:
1173+
Combine DataFrames with different column orders:
1174+
1175+
>>> ctx = dfn.SessionContext()
1176+
>>> df1 = ctx.from_pydict({"a": [1], "b": [10]})
1177+
>>> df2 = ctx.from_pydict({"b": [20], "a": [2]})
1178+
>>> df1.union_by_name(df2).sort("a").to_pydict()
1179+
{'a': [1, 2], 'b': [10, 20]}
11261180
"""
11271181
return DataFrame(self.df.union_by_name(other.df))
11281182

@@ -1136,6 +1190,15 @@ def union_by_name_distinct(self, other: DataFrame) -> DataFrame:
11361190
11371191
Returns:
11381192
DataFrame after union by name with deduplication.
1193+
1194+
Examples:
1195+
Union by name and remove duplicate rows:
1196+
1197+
>>> ctx = dfn.SessionContext()
1198+
>>> df1 = ctx.from_pydict({"a": [1, 1], "b": [10, 10]})
1199+
>>> df2 = ctx.from_pydict({"b": [10], "a": [1]})
1200+
>>> df1.union_by_name_distinct(df2).to_pydict()
1201+
{'a': [1], 'b': [10]}
11391202
"""
11401203
return DataFrame(self.df.union_by_name_distinct(other.df))
11411204

@@ -1158,6 +1221,19 @@ def distinct_on(
11581221
11591222
Returns:
11601223
DataFrame after deduplication.
1224+
1225+
Examples:
1226+
Keep the row with the smallest ``b`` for each unique ``a``:
1227+
1228+
>>> from datafusion import col
1229+
>>> ctx = dfn.SessionContext()
1230+
>>> df = ctx.from_pydict({"a": [1, 1, 2, 2], "b": [10, 20, 30, 40]})
1231+
>>> df.distinct_on(
1232+
... [col("a")],
1233+
... [col("a"), col("b")],
1234+
... [col("a").sort(ascending=True), col("b").sort(ascending=True)],
1235+
... ).sort("a").to_pydict()
1236+
{'a': [1, 2], 'b': [10, 30]}
11611237
"""
11621238
on_raw = expr_list_to_raw_expr_list(on_expr)
11631239
select_raw = expr_list_to_raw_expr_list(select_expr)
@@ -1176,6 +1252,14 @@ def sort_by(self, *exprs: Expr | str) -> DataFrame:
11761252
11771253
Returns:
11781254
DataFrame after sorting.
1255+
1256+
Examples:
1257+
Sort by a single column:
1258+
1259+
>>> ctx = dfn.SessionContext()
1260+
>>> df = ctx.from_pydict({"a": [3, 1, 2]})
1261+
>>> df.sort_by("a").to_pydict()
1262+
{'a': [1, 2, 3]}
11791263
"""
11801264
exprs = [self.parse_sql_expr(e) if isinstance(e, str) else e for e in exprs]
11811265
raw = expr_list_to_raw_expr_list(exprs)
@@ -1472,6 +1556,19 @@ def unnest_columns(
14721556
14731557
Returns:
14741558
A DataFrame with the columns expanded.
1559+
1560+
Examples:
1561+
Unnest an array column:
1562+
1563+
>>> ctx = dfn.SessionContext()
1564+
>>> df = ctx.from_pydict({"a": [[1, 2], [3]], "b": ["x", "y"]})
1565+
>>> df.unnest_columns("a").to_pydict()
1566+
{'a': [1, 2, 3], 'b': ['x', 'x', 'y']}
1567+
1568+
With explicit recursion depth:
1569+
1570+
>>> df.unnest_columns("a", recursions=[("a", "a", 1)]).to_pydict()
1571+
{'a': [1, 2, 3], 'b': ['x', 'x', 'y']}
14751572
"""
14761573
columns = list(columns)
14771574
return DataFrame(

0 commit comments

Comments
 (0)