Fix drop_duplicates issue due to mismatching row positions in dfs based on same data source (#3766)

sfc-gh-helmeleegy · web-flow · commit b59743a02e6f · 2025-09-11T13:45:01.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -48,6 +48,8 @@
 
 #### Bug Fixes
 
+- Fixed an issue with `drop_duplicates` where the same data source could be read multiple times in the same query, in a different order each time, resulting in missing rows in the final result. The fix ensures that the data source is read only once.
+
 ### Snowpark Local Testing Updates
 
 #### New Features
diff --git a/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py
@@ -1514,6 +1514,17 @@ def drop_duplicates(
     """
     if keep not in ("first", "last", False):
         raise ValueError('keep must be either "first", "last" or False')
+
+    # Make sure CTE optimization is enabled.
+    # This is required for drop_duplicates because two dataframes need to be joined
+    # on their row position (one is used as a filter for the other) and, while not identical,
+    # they both originate from the same source.
+    # Since read_snowflake can assign row positions differently each time it is run,
+    # computing the two dataframes independently may yield mismatching row positions.
+    # With the CTE optimization, the input source is guaranteed to be read only once
+    # in the final generated query, so no mismatch in row positions can take place.
+    pd.session.cte_optimization_enabled = True
+
     inplace = validate_bool_kwarg(inplace, "inplace")
     ignore_index = kwargs.get("ignore_index", False)
     subset = kwargs.get("subset", None)
diff --git a/tests/integ/modin/frame/test_drop_duplicates.py b/tests/integ/modin/frame/test_drop_duplicates.py
@@ -6,9 +6,11 @@
 import pandas as native_pd
 import pytest
 
+from snowflake.snowpark._internal.utils import TempObjectType
 import snowflake.snowpark.modin.plugin  # noqa: F401
 from tests.integ.modin.utils import assert_frame_equal
 from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker
+from tests.utils import Utils
 
 
 @pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"], []])
@@ -79,6 +81,29 @@ def test_drop_duplicates(subset, keep, ignore_index):
         )
 
 
+def test_drop_duplicates_after_read_snowflake(session):
+    pandas_df = native_pd.DataFrame(
+        {"A": [0, 1, 1, 2, 0], "B": ["a", "b", "c", "b", "a"]}
+    )
+    table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
+    session.create_dataframe(pandas_df).write.save_as_table(
+        table_name, table_type="temp"
+    )
+    # Order by random() so each evaluation of the source query may yield a different row order.
+    snow_df = pd.read_snowflake(
+        f"select A, B from (select random() as r, A, B from {table_name}) order by r"
+    )
+    query_count = 1
+    join_count = 2
+    with SqlCounter(query_count=query_count, join_count=join_count):
+        assert_frame_equal(
+            snow_df.drop_duplicates(keep="first"),
+            pandas_df.drop_duplicates(keep="first"),
+            check_dtype=False,
+            check_index_type=False,
+        )
+
+
 @pytest.mark.parametrize("subset", ["a", ["a"], ["b"], ["a", "b"]])
 @pytest.mark.parametrize("keep", ["first", "last", False])
 @sql_count_checker(query_count=1, join_count=2)