Address PR review feedback for DataFrame operations

timsaucer · claude · timsaucer · commit b46987ea6386 · 2026-04-07T09:24:48.000-04:00
- Use upstream parse error for explain format instead of hardcoded options
- Fix sort_by to use column name resolution consistent with sort()
- Use ExplainFormat enum members directly in tests instead of string lookup
- Merge union_by_name_distinct into union_by_name via a new ``distinct``
  parameter (default False) for a more Pythonic API
- Update check-upstream skill to note union_by_name_distinct coverage

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/.ai/skills/check-upstream/SKILL.md b/.ai/skills/check-upstream/SKILL.md
@@ -109,6 +109,7 @@ The user may specify an area via `$ARGUMENTS`. If no area is specified or "all"
 **Evaluated and not requiring separate Python exposure:**
 - `show_limit` — already covered by `DataFrame.show()`, which provides the same functionality with a simpler API
 - `with_param_values` — already covered by the `param_values` argument on `SessionContext.sql()`, which accomplishes the same thing more robustly
+- `union_by_name_distinct` — already covered by `DataFrame.union_by_name(distinct=True)`, which provides a more Pythonic API
 
 **How to check:**
 1. Fetch the upstream DataFrame documentation page listing all methods
diff --git a/crates/core/src/dataframe.rs b/crates/core/src/dataframe.rs
@@ -823,11 +823,8 @@ impl PyDataFrame {
         let explain_format = match format {
             Some(f) => f
                 .parse::<datafusion::common::format::ExplainFormat>()
-                .map_err(|_| {
-                    PyDataFusionError::Common(format!(
-                        "Invalid explain format: '{}'. Valid options: indent, tree, pgjson, graphviz",
-                        f
-                    ))
+                .map_err(|e| {
+                    PyDataFusionError::Common(format!("Invalid explain format '{}': {}", f, e))
                 })?,
             None => datafusion::common::format::ExplainFormat::Indent,
         };
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -44,6 +44,7 @@
     Expr,
     SortExpr,
     SortKey,
+    _to_raw_expr,
     ensure_expr,
     ensure_expr_list,
     expr_list_to_raw_expr_list,
@@ -1170,7 +1171,7 @@ def intersect_distinct(self, other: DataFrame) -> DataFrame:
         """
         return DataFrame(self.df.intersect_distinct(other.df))
 
-    def union_by_name(self, other: DataFrame) -> DataFrame:
+    def union_by_name(self, other: DataFrame, distinct: bool = False) -> DataFrame:
         """Union two :py:class:`DataFrame` matching columns by name.
 
         Unlike :py:meth:`union` which matches columns by position, this method
@@ -1179,6 +1180,7 @@ def union_by_name(self, other: DataFrame) -> DataFrame:
 
         Args:
             other: DataFrame to union with.
+            distinct: If ``True``, duplicate rows are removed from the result.
 
         Returns:
             DataFrame after union by name.
@@ -1191,30 +1193,17 @@ def union_by_name(self, other: DataFrame) -> DataFrame:
             >>> df2 = ctx.from_pydict({"b": [20], "a": [2]})
             >>> df1.union_by_name(df2).sort("a").to_pydict()
             {'a': [1, 2], 'b': [10, 20]}
-        """
-        return DataFrame(self.df.union_by_name(other.df))
-
-    def union_by_name_distinct(self, other: DataFrame) -> DataFrame:
-        """Union two :py:class:`DataFrame` by name with deduplication.
-
-        Combines :py:meth:`union_by_name` with deduplication of rows.
-
-        Args:
-            other: DataFrame to union with.
 
-        Returns:
-            DataFrame after union by name with deduplication.
-
-        Examples:
-            Union by name and remove duplicate rows:
+            Union by name with deduplication:
 
-            >>> ctx = dfn.SessionContext()
             >>> df1 = ctx.from_pydict({"a": [1, 1], "b": [10, 10]})
             >>> df2 = ctx.from_pydict({"b": [10], "a": [1]})
-            >>> df1.union_by_name_distinct(df2).to_pydict()
+            >>> df1.union_by_name(df2, distinct=True).to_pydict()
             {'a': [1], 'b': [10]}
         """
-        return DataFrame(self.df.union_by_name_distinct(other.df))
+        if distinct:
+            return DataFrame(self.df.union_by_name_distinct(other.df))
+        return DataFrame(self.df.union_by_name(other.df))
 
     def distinct_on(
         self,
@@ -1275,8 +1264,7 @@ def sort_by(self, *exprs: Expr | str) -> DataFrame:
             >>> df.sort_by("a").to_pydict()
             {'a': [1, 2, 3]}
         """
-        exprs = [self.parse_sql_expr(e) if isinstance(e, str) else e for e in exprs]
-        raw = expr_list_to_raw_expr_list(exprs)
+        raw = [_to_raw_expr(e) for e in exprs]
         return DataFrame(self.df.sort_by(raw))
 
     def write_csv(
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
@@ -29,6 +29,7 @@
 import pytest
 from datafusion import (
     DataFrame,
+    ExplainFormat,
     InsertOp,
     ParquetColumnOptions,
     ParquetWriterOptions,
@@ -3598,14 +3599,6 @@ def test_read_parquet_file_sort_order(tmp_path, file_sort_order):
             [10, 20],
             id="union_by_name: matches columns by name not position",
         ),
-        pytest.param(
-            {"a": [1, 1], "b": [10, 10]},
-            {"b": [10], "a": [1]},  # reversed column order with duplicates
-            "union_by_name_distinct",
-            [1],
-            [10],
-            id="union_by_name_distinct: matches by name and deduplicates",
-        ),
     ],
 )
 def test_set_operations_distinct(df1_data, df2_data, method, expected_a, expected_b):
@@ -3619,6 +3612,15 @@ def test_set_operations_distinct(df1_data, df2_data, method, expected_a, expecte
     assert result.column(1).to_pylist() == expected_b
 
 
+def test_union_by_name_distinct():
+    ctx = SessionContext()
+    df1 = ctx.from_pydict({"a": [1, 1], "b": [10, 10]})
+    df2 = ctx.from_pydict({"b": [10], "a": [1]})
+    result = df1.union_by_name(df2, distinct=True).collect()[0]
+    assert result.column(0).to_pylist() == [1]
+    assert result.column(1).to_pylist() == [10]
+
+
 def test_distinct_on():
     ctx = SessionContext()
     df = ctx.from_pydict({"a": [1, 1, 2, 2], "b": [10, 20, 30, 40]})
@@ -3655,20 +3657,21 @@ def test_sort_by(input_values, expected):
 @pytest.mark.parametrize(
     ("fmt", "verbose", "analyze", "expected_substring"),
     [
-        (None, False, False, None),
-        ("TREE", False, False, "---"),
-        ("INDENT", True, True, None),
-        ("PGJSON", False, False, '"Plan"'),
-        ("GRAPHVIZ", False, False, "digraph"),
+        pytest.param(None, False, False, None, id="default format"),
+        pytest.param(ExplainFormat.TREE, False, False, "---", id="tree format"),
+        pytest.param(
+            ExplainFormat.INDENT, True, True, None, id="indent verbose+analyze"
+        ),
+        pytest.param(ExplainFormat.PGJSON, False, False, '"Plan"', id="pgjson format"),
+        pytest.param(
+            ExplainFormat.GRAPHVIZ, False, False, "digraph", id="graphviz format"
+        ),
     ],
 )
 def test_explain_with_format(capsys, fmt, verbose, analyze, expected_substring):
-    from datafusion import ExplainFormat
-
     ctx = SessionContext()
     df = ctx.from_pydict({"a": [1]})
-    explain_fmt = ExplainFormat[fmt] if fmt is not None else None
-    df.explain(verbose=verbose, analyze=analyze, format=explain_fmt)
+    df.explain(verbose=verbose, analyze=analyze, format=fmt)
     captured = capsys.readouterr()
     assert "plan_type" in captured.out
     if expected_substring is not None: