|
6 | 6 | import pandas as native_pd |
7 | 7 | import pytest |
8 | 8 |
|
| 9 | +from snowflake.snowpark._internal.utils import TempObjectType |
9 | 10 | import snowflake.snowpark.modin.plugin # noqa: F401 |
10 | 11 | from tests.integ.modin.utils import assert_frame_equal |
11 | 12 | from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker |
| 13 | +from tests.utils import Utils |
12 | 14 |
|
13 | 15 |
|
14 | 16 | @pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"], []]) |
@@ -79,6 +81,29 @@ def test_drop_duplicates(subset, keep, ignore_index): |
79 | 81 | ) |
80 | 82 |
|
81 | 83 |
|
def test_drop_duplicates_after_read_snowflake(session):
    """Check ``drop_duplicates(keep="first")`` on a frame loaded via ``read_snowflake``.

    A small native pandas frame is saved to a temp table and read back through
    a query ordered by ``random()``. The de-duplicated result is then compared
    with native pandas, and the comparison is expected to issue exactly one
    query containing two joins.
    """
    native_frame = native_pd.DataFrame(
        {"A": [0, 1, 1, 2, 0], "B": ["a", "b", "c", "b", "a"]}
    )
    temp_table = Utils.random_name_for_temp_object(TempObjectType.TABLE)
    writer = session.create_dataframe(native_frame).write
    writer.save_as_table(temp_table, table_type="temp")

    # Simulate random order each time read_snowflake is run.
    shuffled_query = (
        f"select A, B from (select random() as r, A, B from {temp_table}) order by r"
    )
    loaded_frame = pd.read_snowflake(shuffled_query)

    # The native pandas expectation issues no SQL, so it is built before the
    # counter block; only the Snowpark side is evaluated while counting.
    expected_frame = native_frame.drop_duplicates(keep="first")
    with SqlCounter(query_count=1, join_count=2):
        assert_frame_equal(
            loaded_frame.drop_duplicates(keep="first"),
            expected_frame,
            check_dtype=False,
            check_index_type=False,
        )
| 105 | + |
| 106 | + |
82 | 107 | @pytest.mark.parametrize("subset", ["a", ["a"], ["b"], ["a", "b"]]) |
83 | 108 | @pytest.mark.parametrize("keep", ["first", "last", False]) |
84 | 109 | @sql_count_checker(query_count=1, join_count=2) |
|
0 commit comments