From e88f6956784b097c7328df62f2d81f5b13909650 Mon Sep 17 00:00:00 2001 From: Adam Ling Date: Thu, 28 May 2026 20:56:09 +0000 Subject: [PATCH] poc --- .../snowpark/_internal/compiler/cte_utils.py | 46 +++++++++++-------- tests/unit/test_cte.py | 29 +++++++----- 2 files changed, 46 insertions(+), 29 deletions(-) diff --git a/src/snowflake/snowpark/_internal/compiler/cte_utils.py b/src/snowflake/snowpark/_internal/compiler/cte_utils.py index b67200c6d5..ed9c408e67 100644 --- a/src/snowflake/snowpark/_internal/compiler/cte_utils.py +++ b/src/snowflake/snowpark/_internal/compiler/cte_utils.py @@ -267,8 +267,7 @@ def get_duplicated_node_complexity_distribution( def encode_query_id(node: "TreeNode") -> Optional[str]: """ - Encode the query, its query parameter, expr_to_alias and df_aliased_col_name_to_real_col_name - into an id using sha256. + Encode the query and its query parameters into an id using sha256. Returns: If encode succeed, return the first 10 encoded value. @@ -295,24 +294,17 @@ def encode_query_id(node: "TreeNode") -> Optional[str]: # to avoid being detected as a common subquery. return None - def stringify(d): - if isinstance(d, dict): - key_value_pairs = list(d.items()) - key_value_pairs.sort(key=lambda x: x[0]) - return str(key_value_pairs) - else: - return str(d) - string = query if query_params: string = f"{string}#{query_params}" - if hasattr(node, "expr_to_alias") and node.expr_to_alias: - string = f"{string}#{stringify(node.expr_to_alias)}" - if ( - hasattr(node, "df_aliased_col_name_to_real_col_name") - and node.df_aliased_col_name_to_real_col_name - ): - string = f"{string}#{stringify(node.df_aliased_col_name_to_real_col_name)}" + # expr_to_alias is intentionally excluded from the hash. + # The SQL text already incorporates all column aliasing, so expr_to_alias is + # redundant for hash purposes. Including it causes missed deduplication in + # self-joins: the SnowflakePlan compiled for the left branch accumulates a + # different number of expr_to_alias entries (via incremental plan construction) + # than the SelectStatement for the right branch, even though both nodes produce + # identical SQL. This mirrors the reasoning for excluding + # df_aliased_col_name_to_real_col_name. try: return hashlib.sha256(string.encode()).hexdigest()[:HASH_LENGTH] @@ -331,7 +323,25 @@ def encode_node_id_with_query(node: "TreeNode") -> str: """ query_id = encode_query_id(node) if query_id is not None: - node_type_name = type(node).__name__ + from snowflake.snowpark._internal.analyzer.select_statement import ( + SelectSnowflakePlan, + SelectStatement, + ) + from snowflake.snowpark._internal.analyzer.snowflake_plan import SnowflakePlan + + # Use a canonical type name for all plan node types that compile to SQL. + # SnowflakePlan, SelectSnowflakePlan, and SelectStatement all represent + # a compiled SQL query — they differ only in how they are constructed and + # how column state is tracked, not in what SQL they produce. In a + # self-join the left branch is represented as a SnowflakePlan while the + # right branch (after _disambiguate_snowpark_columns) ends up as a + # SelectStatement or SelectSnowflakePlan. Using the same type suffix for + # all three ensures that nodes producing identical SQL are deduplicated + # into a single CTE entry. + if isinstance(node, (SnowflakePlan, SelectSnowflakePlan, SelectStatement)): + node_type_name = "SnowflakePlan" + else: + node_type_name = type(node).__name__ return f"{query_id}_{node_type_name}" else: return str(id(node)) diff --git a/tests/unit/test_cte.py b/tests/unit/test_cte.py index 7f2f13531b..22554dd50d 100644 --- a/tests/unit/test_cte.py +++ b/tests/unit/test_cte.py @@ -113,33 +113,40 @@ def test_encode_node_id_with_query_select_sql(mock_session, mock_analyzer): select_statement_node._sql_query = sql_text assert ( encode_node_id_with_query(select_statement_node) - == f"{expected_hash}_SelectStatement" + == f"{expected_hash}_SnowflakePlan" ) -def test_encode_node_id_with_query_includes_aliases(): +def test_encode_node_id_with_query_excludes_bookkeeping_fields(): + # Both expr_to_alias and df_aliased_col_name_to_real_col_name are intentionally + # excluded from the hash: they are Python-side bookkeeping that do not change the + # SQL content. Two nodes with the same SQL but different bookkeeping maps must + # produce the same encoded id so the CTE optimizer can deduplicate them. node = SimpleNamespace( sql_query="select col1 from t", query_params=(("p1", 1), ("p2", "x")), expr_to_alias={"uuid1": "ALIAS1"}, df_aliased_col_name_to_real_col_name={"ALIAS1": "col1"}, ) + node_different_alias = SimpleNamespace( + sql_query="select col1 from t", + query_params=(("p1", 1), ("p2", "x")), + expr_to_alias={"uuid2": "ALIAS2"}, + df_aliased_col_name_to_real_col_name={"ALIAS2": "col1"}, + ) - def stringify_dict(d: dict) -> str: - key_value_pairs = list(d.items()) - key_value_pairs.sort(key=lambda x: x[0]) - return str(key_value_pairs) - + # Hash is based on SQL + query_params only; expr_to_alias and + # df_aliased_col_name_to_real_col_name are NOT included. expected_string = node.sql_query if node.query_params: expected_string = f"{expected_string}#{node.query_params}" - if node.expr_to_alias: - expected_string = f"{expected_string}#{stringify_dict(node.expr_to_alias)}" - if node.df_aliased_col_name_to_real_col_name: - expected_string = f"{expected_string}#{stringify_dict(node.df_aliased_col_name_to_real_col_name)}" expected_hash = hashlib.sha256(expected_string.encode()).hexdigest()[:10] assert encode_node_id_with_query(node) == f"{expected_hash}_SimpleNamespace" + # Both nodes produce the same id despite different expr_to_alias and df alias maps + assert encode_node_id_with_query(node) == encode_node_id_with_query( + node_different_alias + ) def test_select_statement_contains_data_generation(mock_session, mock_analyzer):