Skip to content

Commit b46987e

Browse files
timsaucerclaude
andcommitted
Address PR review feedback for DataFrame operations
- Use upstream parse error for explain format instead of hardcoded options - Fix sort_by to use column name resolution consistent with sort() - Use ExplainFormat enum members directly in tests instead of string lookup - Merge union_by_name_distinct into union_by_name(distinct=False) for a more Pythonic API - Update check-upstream skill to note union_by_name_distinct coverage Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3849301 commit b46987e

File tree

4 files changed

+32
-43
lines changed

4 files changed

+32
-43
lines changed

.ai/skills/check-upstream/SKILL.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ The user may specify an area via `$ARGUMENTS`. If no area is specified or "all"
109109
**Evaluated and not requiring separate Python exposure:**
110110
- `show_limit` — already covered by `DataFrame.show()`, which provides the same functionality with a simpler API
111111
- `with_param_values` — already covered by the `param_values` argument on `SessionContext.sql()`, which accomplishes the same thing more robustly
112+
- `union_by_name_distinct` — already covered by `DataFrame.union_by_name(distinct=True)`, which provides a more Pythonic API
112113

113114
**How to check:**
114115
1. Fetch the upstream DataFrame documentation page listing all methods

crates/core/src/dataframe.rs

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -823,11 +823,8 @@ impl PyDataFrame {
823823
let explain_format = match format {
824824
Some(f) => f
825825
.parse::<datafusion::common::format::ExplainFormat>()
826-
.map_err(|_| {
827-
PyDataFusionError::Common(format!(
828-
"Invalid explain format: '{}'. Valid options: indent, tree, pgjson, graphviz",
829-
f
830-
))
826+
.map_err(|e| {
827+
PyDataFusionError::Common(format!("Invalid explain format '{}': {}", f, e))
831828
})?,
832829
None => datafusion::common::format::ExplainFormat::Indent,
833830
};

python/datafusion/dataframe.py

Lines changed: 9 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
Expr,
4545
SortExpr,
4646
SortKey,
47+
_to_raw_expr,
4748
ensure_expr,
4849
ensure_expr_list,
4950
expr_list_to_raw_expr_list,
@@ -1170,7 +1171,7 @@ def intersect_distinct(self, other: DataFrame) -> DataFrame:
11701171
"""
11711172
return DataFrame(self.df.intersect_distinct(other.df))
11721173

1173-
def union_by_name(self, other: DataFrame) -> DataFrame:
1174+
def union_by_name(self, other: DataFrame, distinct: bool = False) -> DataFrame:
11741175
"""Union two :py:class:`DataFrame` matching columns by name.
11751176
11761177
Unlike :py:meth:`union` which matches columns by position, this method
@@ -1179,6 +1180,7 @@ def union_by_name(self, other: DataFrame) -> DataFrame:
11791180
11801181
Args:
11811182
other: DataFrame to union with.
1183+
distinct: If ``True``, duplicate rows are removed from the result.
11821184
11831185
Returns:
11841186
DataFrame after union by name.
@@ -1191,30 +1193,17 @@ def union_by_name(self, other: DataFrame) -> DataFrame:
11911193
>>> df2 = ctx.from_pydict({"b": [20], "a": [2]})
11921194
>>> df1.union_by_name(df2).sort("a").to_pydict()
11931195
{'a': [1, 2], 'b': [10, 20]}
1194-
"""
1195-
return DataFrame(self.df.union_by_name(other.df))
1196-
1197-
def union_by_name_distinct(self, other: DataFrame) -> DataFrame:
1198-
"""Union two :py:class:`DataFrame` by name with deduplication.
1199-
1200-
Combines :py:meth:`union_by_name` with deduplication of rows.
1201-
1202-
Args:
1203-
other: DataFrame to union with.
12041196
1205-
Returns:
1206-
DataFrame after union by name with deduplication.
1207-
1208-
Examples:
1209-
Union by name and remove duplicate rows:
1197+
Union by name with deduplication:
12101198
1211-
>>> ctx = dfn.SessionContext()
12121199
>>> df1 = ctx.from_pydict({"a": [1, 1], "b": [10, 10]})
12131200
>>> df2 = ctx.from_pydict({"b": [10], "a": [1]})
1214-
>>> df1.union_by_name_distinct(df2).to_pydict()
1201+
>>> df1.union_by_name(df2, distinct=True).to_pydict()
12151202
{'a': [1], 'b': [10]}
12161203
"""
1217-
return DataFrame(self.df.union_by_name_distinct(other.df))
1204+
if distinct:
1205+
return DataFrame(self.df.union_by_name_distinct(other.df))
1206+
return DataFrame(self.df.union_by_name(other.df))
12181207

12191208
def distinct_on(
12201209
self,
@@ -1275,8 +1264,7 @@ def sort_by(self, *exprs: Expr | str) -> DataFrame:
12751264
>>> df.sort_by("a").to_pydict()
12761265
{'a': [1, 2, 3]}
12771266
"""
1278-
exprs = [self.parse_sql_expr(e) if isinstance(e, str) else e for e in exprs]
1279-
raw = expr_list_to_raw_expr_list(exprs)
1267+
raw = [_to_raw_expr(e) for e in exprs]
12801268
return DataFrame(self.df.sort_by(raw))
12811269

12821270
def write_csv(

python/tests/test_dataframe.py

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import pytest
3030
from datafusion import (
3131
DataFrame,
32+
ExplainFormat,
3233
InsertOp,
3334
ParquetColumnOptions,
3435
ParquetWriterOptions,
@@ -3598,14 +3599,6 @@ def test_read_parquet_file_sort_order(tmp_path, file_sort_order):
35983599
[10, 20],
35993600
id="union_by_name: matches columns by name not position",
36003601
),
3601-
pytest.param(
3602-
{"a": [1, 1], "b": [10, 10]},
3603-
{"b": [10], "a": [1]}, # reversed column order with duplicates
3604-
"union_by_name_distinct",
3605-
[1],
3606-
[10],
3607-
id="union_by_name_distinct: matches by name and deduplicates",
3608-
),
36093602
],
36103603
)
36113604
def test_set_operations_distinct(df1_data, df2_data, method, expected_a, expected_b):
@@ -3619,6 +3612,15 @@ def test_set_operations_distinct(df1_data, df2_data, method, expected_a, expecte
36193612
assert result.column(1).to_pylist() == expected_b
36203613

36213614

3615+
def test_union_by_name_distinct():
3616+
ctx = SessionContext()
3617+
df1 = ctx.from_pydict({"a": [1, 1], "b": [10, 10]})
3618+
df2 = ctx.from_pydict({"b": [10], "a": [1]})
3619+
result = df1.union_by_name(df2, distinct=True).collect()[0]
3620+
assert result.column(0).to_pylist() == [1]
3621+
assert result.column(1).to_pylist() == [10]
3622+
3623+
36223624
def test_distinct_on():
36233625
ctx = SessionContext()
36243626
df = ctx.from_pydict({"a": [1, 1, 2, 2], "b": [10, 20, 30, 40]})
@@ -3655,20 +3657,21 @@ def test_sort_by(input_values, expected):
36553657
@pytest.mark.parametrize(
36563658
("fmt", "verbose", "analyze", "expected_substring"),
36573659
[
3658-
(None, False, False, None),
3659-
("TREE", False, False, "---"),
3660-
("INDENT", True, True, None),
3661-
("PGJSON", False, False, '"Plan"'),
3662-
("GRAPHVIZ", False, False, "digraph"),
3660+
pytest.param(None, False, False, None, id="default format"),
3661+
pytest.param(ExplainFormat.TREE, False, False, "---", id="tree format"),
3662+
pytest.param(
3663+
ExplainFormat.INDENT, True, True, None, id="indent verbose+analyze"
3664+
),
3665+
pytest.param(ExplainFormat.PGJSON, False, False, '"Plan"', id="pgjson format"),
3666+
pytest.param(
3667+
ExplainFormat.GRAPHVIZ, False, False, "digraph", id="graphviz format"
3668+
),
36633669
],
36643670
)
36653671
def test_explain_with_format(capsys, fmt, verbose, analyze, expected_substring):
3666-
from datafusion import ExplainFormat
3667-
36683672
ctx = SessionContext()
36693673
df = ctx.from_pydict({"a": [1]})
3670-
explain_fmt = ExplainFormat[fmt] if fmt is not None else None
3671-
df.explain(verbose=verbose, analyze=analyze, format=explain_fmt)
3674+
df.explain(verbose=verbose, analyze=analyze, format=fmt)
36723675
captured = capsys.readouterr()
36733676
assert "plan_type" in captured.out
36743677
if expected_substring is not None:

0 commit comments

Comments
 (0)