From e88f6956784b097c7328df62f2d81f5b13909650 Mon Sep 17 00:00:00 2001
From: Adam Ling <adam.ling@snowflake.com>
Date: Thu, 28 May 2026 20:56:09 +0000
Subject: [PATCH] poc

---
 .../snowpark/_internal/compiler/cte_utils.py  | 46 +++++++++++--------
 tests/unit/test_cte.py                        | 29 +++++++-----
 2 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/src/snowflake/snowpark/_internal/compiler/cte_utils.py b/src/snowflake/snowpark/_internal/compiler/cte_utils.py
index b67200c6d5..ed9c408e67 100644
--- a/src/snowflake/snowpark/_internal/compiler/cte_utils.py
+++ b/src/snowflake/snowpark/_internal/compiler/cte_utils.py
@@ -267,8 +267,7 @@ def get_duplicated_node_complexity_distribution(
 
 def encode_query_id(node: "TreeNode") -> Optional[str]:
     """
-    Encode the query, its query parameter, expr_to_alias and df_aliased_col_name_to_real_col_name
-    into an id using sha256.
+    Encode the query and its query parameters into an id using sha256.
 
     Returns:
         If encode succeed, return the first 10 encoded value.
@@ -295,24 +294,17 @@ def encode_query_id(node: "TreeNode") -> Optional[str]:
         # to avoid being detected as a common subquery.
         return None
 
-    def stringify(d):
-        if isinstance(d, dict):
-            key_value_pairs = list(d.items())
-            key_value_pairs.sort(key=lambda x: x[0])
-            return str(key_value_pairs)
-        else:
-            return str(d)
-
     string = query
     if query_params:
         string = f"{string}#{query_params}"
-    if hasattr(node, "expr_to_alias") and node.expr_to_alias:
-        string = f"{string}#{stringify(node.expr_to_alias)}"
-    if (
-        hasattr(node, "df_aliased_col_name_to_real_col_name")
-        and node.df_aliased_col_name_to_real_col_name
-    ):
-        string = f"{string}#{stringify(node.df_aliased_col_name_to_real_col_name)}"
+    # expr_to_alias is intentionally excluded from the hash.
+    # The SQL text already incorporates all column aliasing, so expr_to_alias is
+    # redundant for hash purposes.  Including it causes missed deduplication in
+    # self-joins: the SnowflakePlan compiled for the left branch accumulates a
+    # different number of expr_to_alias entries (via incremental plan construction)
+    # than the SelectStatement for the right branch, even though both nodes produce
+    # identical SQL.  This mirrors the reasoning for excluding
+    # df_aliased_col_name_to_real_col_name.
 
     try:
         return hashlib.sha256(string.encode()).hexdigest()[:HASH_LENGTH]
@@ -331,7 +323,25 @@ def encode_node_id_with_query(node: "TreeNode") -> str:
     """
     query_id = encode_query_id(node)
     if query_id is not None:
-        node_type_name = type(node).__name__
+        from snowflake.snowpark._internal.analyzer.select_statement import (
+            SelectSnowflakePlan,
+            SelectStatement,
+        )
+        from snowflake.snowpark._internal.analyzer.snowflake_plan import SnowflakePlan
+
+        # Use a canonical type name for all plan node types that compile to SQL.
+        # SnowflakePlan, SelectSnowflakePlan, and SelectStatement all represent
+        # a compiled SQL query — they differ only in how they are constructed and
+        # how column state is tracked, not in what SQL they produce.  In a
+        # self-join the left branch is represented as a SnowflakePlan while the
+        # right branch (after _disambiguate_snowpark_columns) ends up as a
+        # SelectStatement or SelectSnowflakePlan.  Using the same type suffix for
+        # all three ensures that nodes producing identical SQL are deduplicated
+        # into a single CTE entry.
+        if isinstance(node, (SnowflakePlan, SelectSnowflakePlan, SelectStatement)):
+            node_type_name = "SnowflakePlan"
+        else:
+            node_type_name = type(node).__name__
         return f"{query_id}_{node_type_name}"
     else:
         return str(id(node))
diff --git a/tests/unit/test_cte.py b/tests/unit/test_cte.py
index 7f2f13531b..22554dd50d 100644
--- a/tests/unit/test_cte.py
+++ b/tests/unit/test_cte.py
@@ -113,33 +113,40 @@ def test_encode_node_id_with_query_select_sql(mock_session, mock_analyzer):
     select_statement_node._sql_query = sql_text
     assert (
         encode_node_id_with_query(select_statement_node)
-        == f"{expected_hash}_SelectStatement"
+        == f"{expected_hash}_SnowflakePlan"
     )
 
 
-def test_encode_node_id_with_query_includes_aliases():
+def test_encode_node_id_with_query_excludes_bookkeeping_fields():
+    # Both expr_to_alias and df_aliased_col_name_to_real_col_name are intentionally
+    # excluded from the hash: they are Python-side bookkeeping that do not change the
+    # SQL content.  Two nodes with the same SQL but different bookkeeping maps must
+    # produce the same encoded id so the CTE optimizer can deduplicate them.
     node = SimpleNamespace(
         sql_query="select col1 from t",
         query_params=(("p1", 1), ("p2", "x")),
         expr_to_alias={"uuid1": "ALIAS1"},
         df_aliased_col_name_to_real_col_name={"ALIAS1": "col1"},
     )
+    node_different_alias = SimpleNamespace(
+        sql_query="select col1 from t",
+        query_params=(("p1", 1), ("p2", "x")),
+        expr_to_alias={"uuid2": "ALIAS2"},
+        df_aliased_col_name_to_real_col_name={"ALIAS2": "col1"},
+    )
 
-    def stringify_dict(d: dict) -> str:
-        key_value_pairs = list(d.items())
-        key_value_pairs.sort(key=lambda x: x[0])
-        return str(key_value_pairs)
-
+    # Hash is based on SQL + query_params only; expr_to_alias and
+    # df_aliased_col_name_to_real_col_name are NOT included.
     expected_string = node.sql_query
     if node.query_params:
         expected_string = f"{expected_string}#{node.query_params}"
-    if node.expr_to_alias:
-        expected_string = f"{expected_string}#{stringify_dict(node.expr_to_alias)}"
-    if node.df_aliased_col_name_to_real_col_name:
-        expected_string = f"{expected_string}#{stringify_dict(node.df_aliased_col_name_to_real_col_name)}"
 
     expected_hash = hashlib.sha256(expected_string.encode()).hexdigest()[:10]
     assert encode_node_id_with_query(node) == f"{expected_hash}_SimpleNamespace"
+    # Both nodes produce the same id despite different expr_to_alias and df alias maps
+    assert encode_node_id_with_query(node) == encode_node_id_with_query(
+        node_different_alias
+    )
 
 
 def test_select_statement_contains_data_generation(mock_session, mock_analyzer):