Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 28 additions & 18 deletions src/snowflake/snowpark/_internal/compiler/cte_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,7 @@ def get_duplicated_node_complexity_distribution(

def encode_query_id(node: "TreeNode") -> Optional[str]:
"""
Encode the query, its query parameter, expr_to_alias and df_aliased_col_name_to_real_col_name
into an id using sha256.
Encode the query and its query parameters into an id using sha256.

Returns:
If encode succeed, return the first 10 encoded value.
Expand All @@ -295,24 +294,17 @@ def encode_query_id(node: "TreeNode") -> Optional[str]:
# to avoid being detected as a common subquery.
return None

def stringify(d):
if isinstance(d, dict):
key_value_pairs = list(d.items())
key_value_pairs.sort(key=lambda x: x[0])
return str(key_value_pairs)
else:
return str(d)

string = query
if query_params:
string = f"{string}#{query_params}"
if hasattr(node, "expr_to_alias") and node.expr_to_alias:
string = f"{string}#{stringify(node.expr_to_alias)}"
if (
hasattr(node, "df_aliased_col_name_to_real_col_name")
and node.df_aliased_col_name_to_real_col_name
):
string = f"{string}#{stringify(node.df_aliased_col_name_to_real_col_name)}"
# expr_to_alias is intentionally excluded from the hash.
# The SQL text already incorporates all column aliasing, so expr_to_alias is
# redundant for hash purposes. Including it causes missed deduplication in
# self-joins: the SnowflakePlan compiled for the left branch accumulates a
# different number of expr_to_alias entries (via incremental plan construction)
# than the SelectStatement for the right branch, even though both nodes produce
# identical SQL. This mirrors the reasoning for excluding
# df_aliased_col_name_to_real_col_name.

try:
return hashlib.sha256(string.encode()).hexdigest()[:HASH_LENGTH]
Expand All @@ -331,7 +323,25 @@ def encode_node_id_with_query(node: "TreeNode") -> str:
"""
query_id = encode_query_id(node)
if query_id is not None:
node_type_name = type(node).__name__
from snowflake.snowpark._internal.analyzer.select_statement import (
SelectSnowflakePlan,
SelectStatement,
)
from snowflake.snowpark._internal.analyzer.snowflake_plan import SnowflakePlan

# Use a canonical type name for all plan node types that compile to SQL.
# SnowflakePlan, SelectSnowflakePlan, and SelectStatement all represent
# a compiled SQL query — they differ only in how they are constructed and
# how column state is tracked, not in what SQL they produce. In a
# self-join the left branch is represented as a SnowflakePlan while the
# right branch (after _disambiguate_snowpark_columns) ends up as a
# SelectStatement or SelectSnowflakePlan. Using the same type suffix for
# all three ensures that nodes producing identical SQL are deduplicated
# into a single CTE entry.
if isinstance(node, (SnowflakePlan, SelectSnowflakePlan, SelectStatement)):
node_type_name = "SnowflakePlan"
else:
node_type_name = type(node).__name__
return f"{query_id}_{node_type_name}"
else:
return str(id(node))
Expand Down
29 changes: 18 additions & 11 deletions tests/unit/test_cte.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,33 +113,40 @@ def test_encode_node_id_with_query_select_sql(mock_session, mock_analyzer):
select_statement_node._sql_query = sql_text
assert (
encode_node_id_with_query(select_statement_node)
== f"{expected_hash}_SelectStatement"
== f"{expected_hash}_SnowflakePlan"
)


def test_encode_node_id_with_query_includes_aliases():
def test_encode_node_id_with_query_excludes_bookkeeping_fields():
# Both expr_to_alias and df_aliased_col_name_to_real_col_name are intentionally
# excluded from the hash: they are Python-side bookkeeping that do not change the
# SQL content. Two nodes with the same SQL but different bookkeeping maps must
# produce the same encoded id so the CTE optimizer can deduplicate them.
node = SimpleNamespace(
sql_query="select col1 from t",
query_params=(("p1", 1), ("p2", "x")),
expr_to_alias={"uuid1": "ALIAS1"},
df_aliased_col_name_to_real_col_name={"ALIAS1": "col1"},
)
node_different_alias = SimpleNamespace(
sql_query="select col1 from t",
query_params=(("p1", 1), ("p2", "x")),
expr_to_alias={"uuid2": "ALIAS2"},
df_aliased_col_name_to_real_col_name={"ALIAS2": "col1"},
)

def stringify_dict(d: dict) -> str:
key_value_pairs = list(d.items())
key_value_pairs.sort(key=lambda x: x[0])
return str(key_value_pairs)

# Hash is based on SQL + query_params only; expr_to_alias and
# df_aliased_col_name_to_real_col_name are NOT included.
expected_string = node.sql_query
if node.query_params:
expected_string = f"{expected_string}#{node.query_params}"
if node.expr_to_alias:
expected_string = f"{expected_string}#{stringify_dict(node.expr_to_alias)}"
if node.df_aliased_col_name_to_real_col_name:
expected_string = f"{expected_string}#{stringify_dict(node.df_aliased_col_name_to_real_col_name)}"

expected_hash = hashlib.sha256(expected_string.encode()).hexdigest()[:10]
assert encode_node_id_with_query(node) == f"{expected_hash}_SimpleNamespace"
# Both nodes produce the same id despite different expr_to_alias and df alias maps
assert encode_node_id_with_query(node) == encode_node_id_with_query(
node_different_alias
)


def test_select_statement_contains_data_generation(mock_session, mock_analyzer):
Expand Down
Loading