Skip to content

Commit 16dbb7e

Browse files
committed
merge
2 parents a726afe + 2ec6d7b commit 16dbb7e

4 files changed

Lines changed: 65 additions & 3 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,14 @@
44

55
### Snowpark Python API Updates
66

7-
#### Bug Fixes
8-
97
#### New Features
108

119
- Added `get_wif_token` to `snowflake.snowpark.secrets` for workload identity federation tokens on the Snowflake server (not available in SPCS file-based secret environments).
1210

11+
#### Bug Fixes
12+
13+
- Fixed a bug where calling `DataFrame.alias()` twice on the same DataFrame (e.g. for a self-join) caused both aliases to share the same internal column-mapping dictionary. This made `col("R", "col")` resolve to the same column as `col("L", "col")`, producing incorrect join conditions and filter expressions.
14+
1315
#### Improvements
1416

1517
- Improved CTE optimization to deduplicate identical subtrees in self-joins, which were previously emitted as repeated subqueries.

src/snowflake/snowpark/_internal/analyzer/select_statement.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -963,7 +963,7 @@ def __copy__(self):
963963
new._snowflake_plan = None
964964
new.flatten_disabled = False # by default a SelectStatement can be flattened.
965965
new._api_calls = self._api_calls.copy() if self._api_calls is not None else None
966-
new.df_aliased_col_name_to_real_col_name = (
966+
new.df_aliased_col_name_to_real_col_name = deepcopy(
967967
self.df_aliased_col_name_to_real_col_name
968968
)
969969
new._merge_projection_complexity_with_subquery = (

tests/integ/test_dataframe.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6838,6 +6838,36 @@ def test_dataframe_alias(session):
68386838
.select(df1["*"], df3["*"], df2["col1"]),
68396839
)
68406840

6841+
# Regression: aliasing the same DataFrame twice must produce independent
6842+
# df_aliased_col_name_to_real_col_name dicts so that col("R","col") resolves
6843+
# to the right-side column, not the left-side column.
6844+
# Before the fix, SelectStatement.__copy__ assigned the dict by reference,
6845+
# causing alias("L") and alias("R") on the same df to share the same dict.
6846+
df_self = session.create_dataframe(
6847+
[[1, 10], [2, 20], [3, 30]], schema=["id", "val"]
6848+
)
6849+
6850+
# Self-join ON condition using col() alias references: each row should match
6851+
# only itself (equi-join on unique key). With the shared-dict bug the ON
6852+
# condition degenerates to "id" = "id" (always true), producing a cross-join.
6853+
Utils.check_answer(
6854+
df_self.alias("L")
6855+
.join(df_self.alias("R"), col("L", "id") == col("R", "id"))
6856+
.select(col("L", "id"), col("L", "val"), col("R", "val")),
6857+
[(1, 10, 10), (2, 20, 20), (3, 30, 30)],
6858+
)
6859+
6860+
# Post-join filter using col() alias references: col("R","val") must resolve
6861+
# to the right-side column. With the shared-dict bug it resolved to the
6862+
# left-side column, making the filter semantically wrong.
6863+
Utils.check_answer(
6864+
df_self.alias("L")
6865+
.join(df_self.alias("R"), col("L", "id") == col("R", "id"))
6866+
.filter(col("R", "val") == 20)
6867+
.select(col("L", "id")),
6868+
[(2,)],
6869+
)
6870+
68416871

68426872
@pytest.mark.skipif(
68436873
"config.getoption('local_testing_mode', default=False)",

tests/unit/test_deepcopy.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,36 @@ def _create_select_statement(mock_session, mock_analyzer):
301301
return SelectStatement(from_=from_, analyzer=mock_analyzer)
302302

303303

304+
def test_select_statement_copy_aliases_isolated(mock_session, mock_analyzer):
    """copy.copy(SelectStatement) must produce an independent df_aliased_col_name_to_real_col_name.

    Before the fix, __copy__ assigned the dict by reference. Calling alias("L") then
    alias("R") on the same DataFrame both wrote to the *same* dict, causing col("R","col")
    to resolve to the left-side column after a self-join.

    The fix deep-copies the mapping, so this test checks isolation at both levels:
    the outer alias -> mapping dict and the nested per-alias column dicts. Checking
    only the outer dict would also pass with a shallow copy.
    """
    from_ = SelectableEntity(
        SnowflakeTable("TEST_TABLE", session=mock_session), analyzer=mock_analyzer
    )
    original = SelectStatement(from_=from_, analyzer=mock_analyzer)
    original.df_aliased_col_name_to_real_col_name["A"] = {"col": "col"}

    copied = copy.copy(original)

    # The copy must be a distinct object.
    assert (
        copied.df_aliased_col_name_to_real_col_name
        is not original.df_aliased_col_name_to_real_col_name
    )

    # Entries that existed before the copy must be carried over by value.
    assert copied.df_aliased_col_name_to_real_col_name["A"] == {"col": "col"}

    # Mutations to the copy must not affect the original.
    copied.df_aliased_col_name_to_real_col_name["B"] = {"col": "col"}
    assert "B" not in original.df_aliased_col_name_to_real_col_name

    # Mutations to the original must not affect the copy.
    original.df_aliased_col_name_to_real_col_name["C"] = {"col": "col"}
    assert "C" not in copied.df_aliased_col_name_to_real_col_name

    # The nested per-alias dicts must be independent too (deep copy, not shallow):
    # writing a column under an alias that already existed at copy time must stay
    # local to the object it was written on.
    assert (
        copied.df_aliased_col_name_to_real_col_name["A"]
        is not original.df_aliased_col_name_to_real_col_name["A"]
    )
    copied.df_aliased_col_name_to_real_col_name["A"]["copy_only"] = "copy_only"
    assert "copy_only" not in original.df_aliased_col_name_to_real_col_name["A"]

    original.df_aliased_col_name_to_real_col_name["A"]["orig_only"] = "orig_only"
    assert "orig_only" not in copied.df_aliased_col_name_to_real_col_name["A"]
332+
333+
304334
@pytest.mark.parametrize(
305335
"selectable_factory,copy_func,reduce_describe_enabled,cte_enabled",
306336
[

0 commit comments

Comments
 (0)