diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3779c44d8b..a3dd8b02b7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,12 +4,14 @@
 
 ### Snowpark Python API Updates
 
-#### Bug Fixes
-
 #### New Features
 
 - Added `get_wif_token` to `snowflake.snowpark.secrets` for workload identity federation tokens on the Snowflake server (not available in SPCS file-based secret environments).
 
+#### Bug Fixes
+
+- Fixed a bug where calling `DataFrame.alias()` twice on the same DataFrame (e.g. for a self-join) caused both aliases to share the same internal column-mapping dictionary. This made `col("R", "col")` resolve to the same column as `col("L", "col")`, producing incorrect join conditions and filter expressions.
+
 ## 1.51.1 (2026-05-28)
 
 #### Documentation
diff --git a/src/snowflake/snowpark/_internal/analyzer/select_statement.py b/src/snowflake/snowpark/_internal/analyzer/select_statement.py
index f4ea001917..6e43c88ac5 100644
--- a/src/snowflake/snowpark/_internal/analyzer/select_statement.py
+++ b/src/snowflake/snowpark/_internal/analyzer/select_statement.py
@@ -963,7 +963,7 @@ def __copy__(self):
         new._snowflake_plan = None
         new.flatten_disabled = False  # by default a SelectStatement can be flattened.
         new._api_calls = self._api_calls.copy() if self._api_calls is not None else None
-        new.df_aliased_col_name_to_real_col_name = (
+        new.df_aliased_col_name_to_real_col_name = deepcopy(
             self.df_aliased_col_name_to_real_col_name
         )
         new._merge_projection_complexity_with_subquery = (
diff --git a/tests/integ/test_dataframe.py b/tests/integ/test_dataframe.py
index cf3eed77ae..de27f81c4a 100644
--- a/tests/integ/test_dataframe.py
+++ b/tests/integ/test_dataframe.py
@@ -6838,6 +6838,36 @@ def test_dataframe_alias(session):
         .select(df1["*"], df3["*"], df2["col1"]),
     )
 
+    # Regression: aliasing the same DataFrame twice must produce independent
+    # df_aliased_col_name_to_real_col_name dicts so that col("R","col") resolves
+    # to the right-side column, not the left-side column.
+    # Before the fix, SelectStatement.__copy__ assigned the dict by reference,
+    # causing alias("L") and alias("R") on the same df to share the same dict.
+    df_self = session.create_dataframe(
+        [[1, 10], [2, 20], [3, 30]], schema=["id", "val"]
+    )
+
+    # Self-join ON condition using col() alias references: each row should match
+    # only itself (equi-join on unique key). With the shared-dict bug the ON
+    # condition degenerates to "id" = "id" (always true), producing a cross-join.
+    Utils.check_answer(
+        df_self.alias("L")
+        .join(df_self.alias("R"), col("L", "id") == col("R", "id"))
+        .select(col("L", "id"), col("L", "val"), col("R", "val")),
+        [(1, 10, 10), (2, 20, 20), (3, 30, 30)],
+    )
+
+    # Post-join filter using col() alias references: col("R","val") must resolve
+    # to the right-side column. With the shared-dict bug it resolved to the
+    # left-side column, making the filter semantically wrong.
+    Utils.check_answer(
+        df_self.alias("L")
+        .join(df_self.alias("R"), col("L", "id") == col("R", "id"))
+        .filter(col("R", "val") == 20)
+        .select(col("L", "id")),
+        [(2,)],
+    )
+
 
 @pytest.mark.skipif(
     "config.getoption('local_testing_mode', default=False)",
diff --git a/tests/unit/test_deepcopy.py b/tests/unit/test_deepcopy.py
index 713ed1819c..acc74ac327 100644
--- a/tests/unit/test_deepcopy.py
+++ b/tests/unit/test_deepcopy.py
@@ -301,6 +301,41 @@ def _create_select_statement(mock_session, mock_analyzer):
     return SelectStatement(from_=from_, analyzer=mock_analyzer)
 
 
+def test_select_statement_copy_aliases_isolated(mock_session, mock_analyzer):
+    """copy.copy(SelectStatement) must produce an independent df_aliased_col_name_to_real_col_name.
+
+    Before the fix, __copy__ assigned the dict by reference. Calling alias("L") then
+    alias("R") on the same DataFrame both wrote to the *same* dict, causing col("R","col")
+    to resolve to the left-side column after a self-join.
+    """
+    from_ = SelectableEntity(
+        SnowflakeTable("TEST_TABLE", session=mock_session), analyzer=mock_analyzer
+    )
+    original = SelectStatement(from_=from_, analyzer=mock_analyzer)
+    original.df_aliased_col_name_to_real_col_name["A"] = {"col": "col"}
+
+    copied = copy.copy(original)
+
+    # The copy must be a distinct object.
+    assert (
+        copied.df_aliased_col_name_to_real_col_name
+        is not original.df_aliased_col_name_to_real_col_name
+    )
+
+    # Mutations to the copy must not affect the original.
+    copied.df_aliased_col_name_to_real_col_name["B"] = {"col": "col"}
+    assert "B" not in original.df_aliased_col_name_to_real_col_name
+
+    # Mutations to the original must not affect the copy.
+    original.df_aliased_col_name_to_real_col_name["C"] = {"col": "col"}
+    assert "C" not in copied.df_aliased_col_name_to_real_col_name
+
+    # The nested per-alias mappings must be independent as well: a shallow
+    # dict copy would pass the checks above yet still share the inner dicts.
+    copied.df_aliased_col_name_to_real_col_name["A"]["other"] = "other"
+    assert "other" not in original.df_aliased_col_name_to_real_col_name["A"]
+
+
 @pytest.mark.parametrize(
     "selectable_factory,copy_func,reduce_describe_enabled,cte_enabled",
     [