From 104249631102aa4c2056f2f7703234c2db44c765 Mon Sep 17 00:00:00 2001 From: Adam Ling Date: Fri, 29 May 2026 22:38:39 +0000 Subject: [PATCH] fix df alias --- CHANGELOG.md | 4 +++ .../_internal/analyzer/select_statement.py | 2 +- tests/integ/test_dataframe.py | 30 +++++++++++++++++++ tests/unit/test_deepcopy.py | 30 +++++++++++++++++++ 4 files changed, 65 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6dd3c2fe06..5a9469eede 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ - Added `get_wif_token` to `snowflake.snowpark.secrets` for workload identity federation tokens on the Snowflake server (not available in SPCS file-based secret environments). +#### Bug Fixes + +- Fixed a bug where calling `DataFrame.alias()` twice on the same DataFrame (e.g. for a self-join) caused both aliases to share the same internal column-mapping dictionary. This made `col("R", "col")` resolve to the same column as `col("L", "col")`, producing incorrect join conditions and filter expressions. + #### Documentation - Clarified that the JDBC driver JAR referenced via `udtf_configs.imports` in `DataFrameReader.jdbc()` must be downloaded from the database vendor and uploaded to a Snowflake stage. diff --git a/src/snowflake/snowpark/_internal/analyzer/select_statement.py b/src/snowflake/snowpark/_internal/analyzer/select_statement.py index f4ea001917..6e43c88ac5 100644 --- a/src/snowflake/snowpark/_internal/analyzer/select_statement.py +++ b/src/snowflake/snowpark/_internal/analyzer/select_statement.py @@ -963,7 +963,7 @@ def __copy__(self): new._snowflake_plan = None new.flatten_disabled = False # by default a SelectStatement can be flattened. new._api_calls = self._api_calls.copy() if self._api_calls is not None else None - new.df_aliased_col_name_to_real_col_name = ( + new.df_aliased_col_name_to_real_col_name = deepcopy( self.df_aliased_col_name_to_real_col_name ) new._merge_projection_complexity_with_subquery = ( diff --git a/tests/integ/test_dataframe.py b/tests/integ/test_dataframe.py index cf3eed77ae..de27f81c4a 100644 --- a/tests/integ/test_dataframe.py +++ b/tests/integ/test_dataframe.py @@ -6838,6 +6838,36 @@ def test_dataframe_alias(session): .select(df1["*"], df3["*"], df2["col1"]), ) + # Regression: aliasing the same DataFrame twice must produce independent + # df_aliased_col_name_to_real_col_name dicts so that col("R","col") resolves + # to the right-side column, not the left-side column. + # Before the fix, SelectStatement.__copy__ assigned the dict by reference, + # causing alias("L") and alias("R") on the same df to share the same dict. + df_self = session.create_dataframe( + [[1, 10], [2, 20], [3, 30]], schema=["id", "val"] + ) + + # Self-join ON condition using col() alias references: each row should match + # only itself (equi-join on unique key). With the shared-dict bug the ON + # condition degenerates to "id" = "id" (always true), producing a cross-join. + Utils.check_answer( + df_self.alias("L") + .join(df_self.alias("R"), col("L", "id") == col("R", "id")) + .select(col("L", "id"), col("L", "val"), col("R", "val")), + [(1, 10, 10), (2, 20, 20), (3, 30, 30)], + ) + + # Post-join filter using col() alias references: col("R","val") must resolve + # to the right-side column. With the shared-dict bug it resolved to the + # left-side column, making the filter semantically wrong. + Utils.check_answer( + df_self.alias("L") + .join(df_self.alias("R"), col("L", "id") == col("R", "id")) + .filter(col("R", "val") == 20) + .select(col("L", "id")), + [(2,)], + ) + @pytest.mark.skipif( "config.getoption('local_testing_mode', default=False)", diff --git a/tests/unit/test_deepcopy.py b/tests/unit/test_deepcopy.py index 713ed1819c..acc74ac327 100644 --- a/tests/unit/test_deepcopy.py +++ b/tests/unit/test_deepcopy.py @@ -301,6 +301,36 @@ def _create_select_statement(mock_session, mock_analyzer): return SelectStatement(from_=from_, analyzer=mock_analyzer) +def test_select_statement_copy_aliases_isolated(mock_session, mock_analyzer): + """copy.copy(SelectStatement) must produce an independent df_aliased_col_name_to_real_col_name. + + Before the fix, __copy__ assigned the dict by reference. Calling alias("L") then + alias("R") on the same DataFrame both wrote to the *same* dict, causing col("R","col") + to resolve to the left-side column after a self-join. + """ + from_ = SelectableEntity( + SnowflakeTable("TEST_TABLE", session=mock_session), analyzer=mock_analyzer + ) + original = SelectStatement(from_=from_, analyzer=mock_analyzer) + original.df_aliased_col_name_to_real_col_name["A"] = {"col": "col"} + + copied = copy.copy(original) + + # The copy must be a distinct object. + assert ( + copied.df_aliased_col_name_to_real_col_name + is not original.df_aliased_col_name_to_real_col_name + ) + + # Mutations to the copy must not affect the original. + copied.df_aliased_col_name_to_real_col_name["B"] = {"col": "col"} + assert "B" not in original.df_aliased_col_name_to_real_col_name + + # Mutations to the original must not affect the copy. + original.df_aliased_col_name_to_real_col_name["C"] = {"col": "col"} + assert "C" not in copied.df_aliased_col_name_to_real_col_name + + @pytest.mark.parametrize( "selectable_factory,copy_func,reduce_describe_enabled,cte_enabled", [