Consolidate except_all/except_distinct and intersect/intersect_distinct into single methods with distinct flag

timsaucer · claude · timsaucer · commit 846d23e73743 · 2026-04-07T13:58:11.000-04:00
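This follows the same pattern as union(distinct=) and union_by_name(distinct=):
intersect() and except_all() gain a `distinct` flag, and the separate
intersect_distinct() and except_distinct() methods are removed.
It also deprecates union_distinct() in favor of union(distinct=True).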

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/crates/core/src/dataframe.rs b/crates/core/src/dataframe.rs
@@ -890,17 +890,6 @@ impl PyDataFrame {
         Ok(Self::new(new_df))
     }
 
-    /// Calculate the distinct union of two `DataFrame`s.  The
-    /// two `DataFrame`s must have exactly the same schema
-    fn union_distinct(&self, py_df: PyDataFrame) -> PyDataFusionResult<Self> {
-        let new_df = self
-            .df
-            .as_ref()
-            .clone()
-            .union_distinct(py_df.df.as_ref().clone())?;
-        Ok(Self::new(new_df))
-    }
-
     #[pyo3(signature = (column, preserve_nulls=true, recursions=None))]
     fn unnest_column(
         &self,
@@ -935,38 +924,28 @@ impl PyDataFrame {
     }
 
     /// Calculate the intersection of two `DataFrame`s.  The two `DataFrame`s must have exactly the same schema
-    fn intersect(&self, py_df: PyDataFrame) -> PyDataFusionResult<Self> {
-        let new_df = self
-            .df
-            .as_ref()
-            .clone()
-            .intersect(py_df.df.as_ref().clone())?;
+    #[pyo3(signature = (py_df, distinct=false))]
+    fn intersect(&self, py_df: PyDataFrame, distinct: bool) -> PyDataFusionResult<Self> {
+        let base = self.df.as_ref().clone();
+        let other = py_df.df.as_ref().clone();
+        let new_df = if distinct {
+            base.intersect_distinct(other)?
+        } else {
+            base.intersect(other)?
+        };
         Ok(Self::new(new_df))
     }
 
     /// Calculate the exception of two `DataFrame`s.  The two `DataFrame`s must have exactly the same schema
-    fn except_all(&self, py_df: PyDataFrame) -> PyDataFusionResult<Self> {
-        let new_df = self.df.as_ref().clone().except(py_df.df.as_ref().clone())?;
-        Ok(Self::new(new_df))
-    }
-
-    /// Calculate the set difference with deduplication
-    fn except_distinct(&self, py_df: PyDataFrame) -> PyDataFusionResult<Self> {
-        let new_df = self
-            .df
-            .as_ref()
-            .clone()
-            .except_distinct(py_df.df.as_ref().clone())?;
-        Ok(Self::new(new_df))
-    }
-
-    /// Calculate the intersection with deduplication
-    fn intersect_distinct(&self, py_df: PyDataFrame) -> PyDataFusionResult<Self> {
-        let new_df = self
-            .df
-            .as_ref()
-            .clone()
-            .intersect_distinct(py_df.df.as_ref().clone())?;
+    #[pyo3(signature = (py_df, distinct=false))]
+    fn except_all(&self, py_df: PyDataFrame, distinct: bool) -> PyDataFusionResult<Self> {
+        let base = self.df.as_ref().clone();
+        let other = py_df.df.as_ref().clone();
+        let new_df = if distinct {
+            base.except_distinct(other)?
+        } else {
+            base.except(other)?
+        };
         Ok(Self::new(new_df))
     }
 
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -1179,96 +1179,76 @@ def union(self, other: DataFrame, distinct: bool = False) -> DataFrame:
         """
         return DataFrame(self.df.union(other.df, distinct))
 
+    @deprecated(
+        "union_distinct() is deprecated. Use union(other, distinct=True) instead."
+    )
     def union_distinct(self, other: DataFrame) -> DataFrame:
         """Calculate the distinct union of two :py:class:`DataFrame`.
 
-        The two :py:class:`DataFrame` must have exactly the same schema.
-        Any duplicate rows are discarded.
-
-        Args:
-            other: DataFrame to union with.
-
-        Returns:
-            DataFrame after union.
+        See Also:
+            :py:meth:`union`
         """
-        return DataFrame(self.df.union_distinct(other.df))
+        return self.union(other, distinct=True)
 
-    def intersect(self, other: DataFrame) -> DataFrame:
+    def intersect(self, other: DataFrame, distinct: bool = False) -> DataFrame:
         """Calculate the intersection of two :py:class:`DataFrame`.
 
         The two :py:class:`DataFrame` must have exactly the same schema.
 
         Args:
-            other:  DataFrame to intersect with.
+            other: DataFrame to intersect with.
+            distinct: If ``True``, duplicate rows are removed from the result.
 
         Returns:
             DataFrame after intersection.
-        """
-        return DataFrame(self.df.intersect(other.df))
 
-    def except_all(self, other: DataFrame) -> DataFrame:
-        """Calculate the exception of two :py:class:`DataFrame`.
+        Examples:
+            Find rows common to both DataFrames:
 
-        The two :py:class:`DataFrame` must have exactly the same schema.
+            >>> ctx = dfn.SessionContext()
+            >>> df1 = ctx.from_pydict({"a": [1, 2, 3], "b": [10, 20, 30]})
+            >>> df2 = ctx.from_pydict({"a": [1, 4], "b": [10, 40]})
+            >>> df1.intersect(df2).to_pydict()
+            {'a': [1], 'b': [10]}
 
-        Args:
-            other: DataFrame to calculate exception with.
+            Intersect with deduplication:
 
-        Returns:
-            DataFrame after exception.
+            >>> df1 = ctx.from_pydict({"a": [1, 1, 2], "b": [10, 10, 20]})
+            >>> df2 = ctx.from_pydict({"a": [1, 1], "b": [10, 10]})
+            >>> df1.intersect(df2, distinct=True).to_pydict()
+            {'a': [1], 'b': [10]}
         """
-        return DataFrame(self.df.except_all(other.df))
+        return DataFrame(self.df.intersect(other.df, distinct))
 
-    def except_distinct(self, other: DataFrame) -> DataFrame:
-        """Calculate the set difference with deduplication.
+    def except_all(self, other: DataFrame, distinct: bool = False) -> DataFrame:
+        """Calculate the set difference of two :py:class:`DataFrame`.
 
-        Returns rows that are in this DataFrame but not in ``other``,
-        removing any duplicates. In contrast, :py:meth:`except_all` preserves
-        duplicate rows.
+        Returns rows that are in this DataFrame but not in ``other``.
 
         The two :py:class:`DataFrame` must have exactly the same schema.
 
         Args:
             other: DataFrame to calculate exception with.
+            distinct: If ``True``, duplicate rows are removed from the result.
 
         Returns:
-            DataFrame after set difference with deduplication.
+            DataFrame after set difference.
 
         Examples:
-            Remove rows present in ``df2`` and deduplicate:
+            Remove rows present in ``df2``:
 
             >>> ctx = dfn.SessionContext()
-            >>> df1 = ctx.from_pydict({"a": [1, 2, 3, 1], "b": [10, 20, 30, 10]})
+            >>> df1 = ctx.from_pydict({"a": [1, 2, 3], "b": [10, 20, 30]})
             >>> df2 = ctx.from_pydict({"a": [1, 2], "b": [10, 20]})
-            >>> df1.except_distinct(df2).sort("a").to_pydict()
+            >>> df1.except_all(df2).sort("a").to_pydict()
             {'a': [3], 'b': [30]}
-        """
-        return DataFrame(self.df.except_distinct(other.df))
-
-    def intersect_distinct(self, other: DataFrame) -> DataFrame:
-        """Calculate the intersection with deduplication.
-
-        Returns distinct rows that appear in both DataFrames. In contrast,
-        :py:meth:`intersect` preserves duplicate rows.
-
-        The two :py:class:`DataFrame` must have exactly the same schema.
-
-        Args:
-            other: DataFrame to intersect with.
-
-        Returns:
-            DataFrame after intersection with deduplication.
 
-        Examples:
-            Find rows common to both DataFrames:
+            Remove rows present in ``df2`` and deduplicate:
 
-            >>> ctx = dfn.SessionContext()
-            >>> df1 = ctx.from_pydict({"a": [1, 2, 3], "b": [10, 20, 30]})
-            >>> df2 = ctx.from_pydict({"a": [1, 4], "b": [10, 40]})
-            >>> df1.intersect_distinct(df2).to_pydict()
-            {'a': [1], 'b': [10]}
+            >>> df1.except_all(df2, distinct=True).sort("a").to_pydict()
+            {'a': [3], 'b': [30]}
         """
-        return DataFrame(self.df.intersect_distinct(other.df))
+        return DataFrame(self.df.except_all(other.df, distinct))
 
     def union_by_name(self, other: DataFrame, distinct: bool = False) -> DataFrame:
         """Union two :py:class:`DataFrame` matching columns by name.
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
@@ -3573,40 +3573,47 @@ def test_read_parquet_file_sort_order(tmp_path, file_sort_order):
 
 
 @pytest.mark.parametrize(
-    ("df1_data", "df2_data", "method", "expected_a", "expected_b"),
+    ("df1_data", "df2_data", "method", "kwargs", "expected_a", "expected_b"),
     [
         pytest.param(
             {"a": [1, 2, 3, 1], "b": [10, 20, 30, 10]},
             {"a": [1, 2], "b": [10, 20]},
-            "except_distinct",
+            "except_all",
+            {"distinct": True},
             [3],
             [30],
-            id="except_distinct: removes matching rows and deduplicates",
+            id="except_all(distinct=True): removes matching rows and deduplicates",
         ),
         pytest.param(
             {"a": [1, 2, 3, 1], "b": [10, 20, 30, 10]},
             {"a": [1, 4], "b": [10, 40]},
-            "intersect_distinct",
+            "intersect",
+            {"distinct": True},
             [1],
             [10],
-            id="intersect_distinct: keeps common rows and deduplicates",
+            id="intersect(distinct=True): keeps common rows and deduplicates",
         ),
         pytest.param(
             {"a": [1], "b": [10]},
             {"b": [20], "a": [2]},  # reversed column order tests matching by name
             "union_by_name",
+            {},
             [1, 2],
             [10, 20],
             id="union_by_name: matches columns by name not position",
         ),
     ],
 )
-def test_set_operations_distinct(df1_data, df2_data, method, expected_a, expected_b):
+def test_set_operations_distinct(
+    df1_data, df2_data, method, kwargs, expected_a, expected_b
+):
     ctx = SessionContext()
     df1 = ctx.from_pydict(df1_data)
     df2 = ctx.from_pydict(df2_data)
     result = (
-        getattr(df1, method)(df2).sort(column("a").sort(ascending=True)).collect()[0]
+        getattr(df1, method)(df2, **kwargs)
+        .sort(column("a").sort(ascending=True))
+        .collect()[0]
     )
     assert result.column(0).to_pylist() == expected_a
     assert result.column(1).to_pylist() == expected_b