Skip to content

Commit d7d3a7a

Browse files
timsaucer authored and claude committed
Add missing DataFrame methods for set operations and query
Expose upstream DataFusion DataFrame methods that were not yet available in the Python API. Closes #1455. Set operations: - except_distinct: set difference with deduplication - intersect_distinct: set intersection with deduplication - union_by_name: union matching columns by name instead of position - union_by_name_distinct: union by name with deduplication Query: - distinct_on: deduplicate rows based on specific columns - sort_by: sort by expressions with ascending order and nulls last Note: show_limit is already covered by the existing show(num) method. explain_with_options and with_param_values are deferred as they require exposing additional types (ExplainOption, ParamValues). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 2499409 commit d7d3a7a

File tree

3 files changed

+231
-0
lines changed

3 files changed

+231
-0
lines changed

crates/core/src/dataframe.rs

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -922,6 +922,71 @@ impl PyDataFrame {
922922
Ok(Self::new(new_df))
923923
}
924924

925+
/// Calculate the set difference with deduplication
926+
fn except_distinct(&self, py_df: PyDataFrame) -> PyDataFusionResult<Self> {
927+
let new_df = self
928+
.df
929+
.as_ref()
930+
.clone()
931+
.except_distinct(py_df.df.as_ref().clone())?;
932+
Ok(Self::new(new_df))
933+
}
934+
935+
/// Calculate the intersection with deduplication
936+
fn intersect_distinct(&self, py_df: PyDataFrame) -> PyDataFusionResult<Self> {
937+
let new_df = self
938+
.df
939+
.as_ref()
940+
.clone()
941+
.intersect_distinct(py_df.df.as_ref().clone())?;
942+
Ok(Self::new(new_df))
943+
}
944+
945+
/// Union two DataFrames matching columns by name
946+
fn union_by_name(&self, py_df: PyDataFrame) -> PyDataFusionResult<Self> {
947+
let new_df = self
948+
.df
949+
.as_ref()
950+
.clone()
951+
.union_by_name(py_df.df.as_ref().clone())?;
952+
Ok(Self::new(new_df))
953+
}
954+
955+
/// Union two DataFrames by name with deduplication
956+
fn union_by_name_distinct(&self, py_df: PyDataFrame) -> PyDataFusionResult<Self> {
957+
let new_df = self
958+
.df
959+
.as_ref()
960+
.clone()
961+
.union_by_name_distinct(py_df.df.as_ref().clone())?;
962+
Ok(Self::new(new_df))
963+
}
964+
965+
/// Deduplicate rows based on specific columns, keeping the first row per group
966+
fn distinct_on(
967+
&self,
968+
on_expr: Vec<PyExpr>,
969+
select_expr: Vec<PyExpr>,
970+
sort_expr: Option<Vec<PySortExpr>>,
971+
) -> PyDataFusionResult<Self> {
972+
let on_expr = on_expr.into_iter().map(|e| e.into()).collect();
973+
let select_expr = select_expr.into_iter().map(|e| e.into()).collect();
974+
let sort_expr = sort_expr.map(to_sort_expressions);
975+
let df = self
976+
.df
977+
.as_ref()
978+
.clone()
979+
.distinct_on(on_expr, select_expr, sort_expr)?;
980+
Ok(Self::new(df))
981+
}
982+
983+
/// Sort by column expressions with ascending order and nulls last
984+
fn sort_by(&self, exprs: Vec<PyExpr>) -> PyDataFusionResult<Self> {
985+
let exprs = exprs.into_iter().map(|e| e.into()).collect();
986+
let df = self.df.as_ref().clone().sort_by(exprs)?;
987+
Ok(Self::new(df))
988+
}
989+
925990
/// Write a `DataFrame` to a CSV file.
926991
fn write_csv(
927992
&self,

python/datafusion/dataframe.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1036,6 +1036,109 @@ def except_all(self, other: DataFrame) -> DataFrame:
10361036
"""
10371037
return DataFrame(self.df.except_all(other.df))
10381038

1039+
def except_distinct(self, other: DataFrame) -> DataFrame:
    """Calculate the set difference with deduplication.

    Rows present in this DataFrame but absent from ``other`` are returned,
    with duplicates removed. Use :py:meth:`except_all` when duplicate rows
    should be preserved instead.

    The two :py:class:`DataFrame` must have exactly the same schema.

    Args:
        other: DataFrame to calculate exception with.

    Returns:
        DataFrame after set difference with deduplication.
    """
    result = self.df.except_distinct(other.df)
    return DataFrame(result)
1055+
1056+
def intersect_distinct(self, other: DataFrame) -> DataFrame:
    """Calculate the intersection with deduplication.

    Only rows that appear in both DataFrames are returned, each at most
    once. Use :py:meth:`intersect` when duplicate rows should be preserved
    instead.

    The two :py:class:`DataFrame` must have exactly the same schema.

    Args:
        other: DataFrame to intersect with.

    Returns:
        DataFrame after intersection with deduplication.
    """
    result = self.df.intersect_distinct(other.df)
    return DataFrame(result)
1071+
1072+
def union_by_name(self, other: DataFrame) -> DataFrame:
    """Union two :py:class:`DataFrame` matching columns by name.

    Whereas :py:meth:`union` pairs columns positionally, this method pairs
    them by their names, so DataFrames whose columns appear in different
    orders can still be combined.

    Args:
        other: DataFrame to union with.

    Returns:
        DataFrame after union by name.
    """
    result = self.df.union_by_name(other.df)
    return DataFrame(result)
1086+
1087+
def union_by_name_distinct(self, other: DataFrame) -> DataFrame:
    """Union two :py:class:`DataFrame` by name with deduplication.

    Behaves like :py:meth:`union_by_name` and additionally removes
    duplicate rows from the result.

    Args:
        other: DataFrame to union with.

    Returns:
        DataFrame after union by name with deduplication.
    """
    result = self.df.union_by_name_distinct(other.df)
    return DataFrame(result)
1099+
1100+
def distinct_on(
    self,
    on_expr: list[Expr],
    select_expr: list[SortKey] if False else list[Expr],  # noqa: placeholder
    sort_expr: list[SortKey] | None = None,
) -> DataFrame:
    """Deduplicate rows based on specific columns.

    Produces one row per unique combination of the ``on_expr`` values.
    When ``sort_expr`` is given, it determines which row within each group
    is kept (the first row under that ordering).

    Args:
        on_expr: Expressions that determine uniqueness.
        select_expr: Expressions to include in the output.
        sort_expr: Optional sort expressions to determine which row to keep.

    Returns:
        DataFrame after deduplication.
    """
    raw_on = expr_list_to_raw_expr_list(on_expr)
    raw_select = expr_list_to_raw_expr_list(select_expr)
    # An empty or missing sort list is forwarded as ``None``.
    raw_sort = sort_list_to_raw_sort_list(sort_expr) if sort_expr else None
    return DataFrame(self.df.distinct_on(raw_on, raw_select, raw_sort))
1124+
1125+
def sort_by(self, *exprs: Expr | str) -> DataFrame:
    """Sort the DataFrame by column expressions in ascending order.

    Every expression is sorted ascending with nulls last. Reach for
    :py:meth:`sort` instead when per-column direction or null placement
    must be controlled.

    Args:
        exprs: Expressions or column names to sort by.

    Returns:
        DataFrame after sorting.
    """
    # Bare strings are parsed as SQL expressions; Expr values pass through.
    parsed = [
        self.parse_sql_expr(item) if isinstance(item, str) else item
        for item in exprs
    ]
    return DataFrame(self.df.sort_by(expr_list_to_raw_expr_list(parsed)))
1141+
10391142
def write_csv(
10401143
self,
10411144
path: str | pathlib.Path,

python/tests/test_dataframe.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3569,3 +3569,66 @@ def test_read_parquet_file_sort_order(tmp_path, file_sort_order):
35693569
pa.parquet.write_table(table, path)
35703570
df = ctx.read_parquet(path, file_sort_order=file_sort_order)
35713571
assert df.collect()[0].column(0).to_pylist() == [1, 2]
3572+
3573+
3574+
def test_except_distinct():
    """except_distinct drops rows found in the other frame and dedups."""
    ctx = SessionContext()
    left = ctx.from_pydict({"a": [1, 2, 3, 1], "b": [10, 20, 30, 10]})
    right = ctx.from_pydict({"a": [1, 2], "b": [10, 20]})
    batch = (
        left.except_distinct(right)
        .sort(column("a").sort(ascending=True))
        .collect()[0]
    )
    assert batch.column(0).to_pylist() == [3]
    assert batch.column(1).to_pylist() == [30]
3583+
3584+
3585+
def test_intersect_distinct():
    """intersect_distinct keeps only common rows, each at most once."""
    ctx = SessionContext()
    left = ctx.from_pydict({"a": [1, 2, 3, 1], "b": [10, 20, 30, 10]})
    right = ctx.from_pydict({"a": [1, 4], "b": [10, 40]})
    batch = left.intersect_distinct(right).collect()[0]
    assert batch.column(0).to_pylist() == [1]
    assert batch.column(1).to_pylist() == [10]
3592+
3593+
3594+
def test_union_by_name():
    """union_by_name pairs columns by name despite differing order."""
    ctx = SessionContext()
    first = ctx.from_pydict({"a": [1], "b": [10]})
    # Column order deliberately differs from ``first``.
    second = ctx.from_pydict({"b": [20], "a": [2]})
    batches = (
        first.union_by_name(second).sort(column("a").sort(ascending=True)).collect()
    )
    values = pa.concat_arrays([batch.column(0) for batch in batches]).to_pylist()
    assert values == [1, 2]
3602+
3603+
3604+
def test_union_by_name_distinct():
    """union_by_name_distinct collapses identical rows across both inputs."""
    ctx = SessionContext()
    left = ctx.from_pydict({"a": [1, 1], "b": [10, 10]})
    right = ctx.from_pydict({"b": [10], "a": [1]})
    batches = left.union_by_name_distinct(right).collect()
    # Every input row is identical, so exactly one should survive.
    assert sum(batch.num_rows for batch in batches) == 1
3611+
3612+
3613+
def test_distinct_on():
    """distinct_on keeps one row per key, chosen by the sort expressions."""
    ctx = SessionContext()
    frame = ctx.from_pydict({"a": [1, 1, 2, 2], "b": [10, 20, 30, 40]})
    deduped = frame.distinct_on(
        [column("a")],
        [column("a"), column("b")],
        [column("a").sort(ascending=True), column("b").sort(ascending=True)],
    )
    batch = deduped.sort(column("a").sort(ascending=True)).collect()[0]
    # The sort keys make the smallest ``b`` the surviving row per ``a``.
    assert batch.column(0).to_pylist() == [1, 2]
    assert batch.column(1).to_pylist() == [10, 30]
3628+
3629+
3630+
def test_sort_by():
    """sort_by orders rows ascending by the given expression."""
    ctx = SessionContext()
    frame = ctx.from_pydict({"a": [3, 1, 2]})
    batch = frame.sort_by(column("a")).collect()[0]
    assert batch.column(0).to_pylist() == [1, 2, 3]

0 commit comments

Comments
 (0)