Skip to content

Commit 5573dc2

Browse files
SNOW-2325421: Add session parameter dummy_row_pos_optimization_enabled (#3779)
1 parent 862656d commit 5573dc2

4 files changed

Lines changed: 90 additions & 8 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
- The following operations are currently supported and can benefit from the optimization: `read_snowflake`, `repr`, `loc`, `reset_index`, `merge`, and binary operations.
5757
- If a lazy object (e.g., DataFrame or Series) depends on a mix of supported and unsupported operations, the optimization will not be used.
5858
- Updated the error message for when Snowpark pandas is referenced within apply.
59+
- Added a session parameter `dummy_row_pos_optimization_enabled` to enable/disable dummy row position optimization in faster pandas.
5960

6061
#### Dependency Updates
6162

src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import itertools
1111
import json
1212
import logging
13-
import os
1413
import re
1514
from collections import Counter, defaultdict
1615
import typing
@@ -1455,15 +1454,9 @@ def from_snowflake(
14551454
See detailed docstring and examples in ``read_snowflake`` in frontend layer:
14561455
src/snowflake/snowpark/modin/plugin/pd_extensions.py
14571456
"""
1458-
dummy_row_pos_optimization_enabled = (
1459-
os.environ.get(
1460-
"SNOWPARK_PANDAS_DUMMY_ROW_POS_OPTIMIZATION_ENABLED", "true"
1461-
).lower()
1462-
== "true"
1463-
)
14641457
relaxed_query_compiler = None
14651458
if (
1466-
dummy_row_pos_optimization_enabled
1459+
pd.session.dummy_row_pos_optimization_enabled
14671460
and not enforce_ordering
14681461
and not dummy_row_pos_mode
14691462
):

src/snowflake/snowpark/session.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,9 @@
301301
"PYTHON_SNOWPARK_GENERATE_MULTILINE_QUERIES"
302302
)
303303
_PYTHON_SNOWPARK_INTERNAL_TELEMETRY_ENABLED = "ENABLE_SNOWPARK_FIRST_PARTY_TELEMETRY"
304+
_SNOWPARK_PANDAS_DUMMY_ROW_POS_OPTIMIZATION_ENABLED = (
305+
"SNOWPARK_PANDAS_DUMMY_ROW_POS_OPTIMIZATION_ENABLED"
306+
)
304307

305308
# AST encoding.
306309
_PYTHON_SNOWPARK_USE_AST = "PYTHON_SNOWPARK_USE_AST"
@@ -745,6 +748,12 @@ def __init__(
745748
_PYTHON_SNOWPARK_DATAFRAME_JOIN_ALIAS_FIX_VERSION
746749
)
747750

751+
self._dummy_row_pos_optimization_enabled: bool = (
752+
self._conn._get_client_side_session_parameter(
753+
_SNOWPARK_PANDAS_DUMMY_ROW_POS_OPTIMIZATION_ENABLED, True
754+
)
755+
)
756+
748757
self._thread_store = create_thread_local(
749758
self._conn._thread_safe_session_enabled
750759
)
@@ -1009,6 +1018,13 @@ def reduce_describe_query_enabled(self) -> bool:
10091018
"""
10101019
return self._reduce_describe_query_enabled
10111020

1021+
@property
1022+
def dummy_row_pos_optimization_enabled(self) -> bool:
1023+
"""Set to ``True`` to enable the dummy row position optimization (defaults to ``True``).
1024+
The generated SQLs from pandas transformations would potentially have fewer expensive window functions to compute the row position column.
1025+
"""
1026+
return self._dummy_row_pos_optimization_enabled
1027+
10121028
@property
10131029
def custom_package_usage_config(self) -> Dict:
10141030
"""Get or set configuration parameters related to usage of custom Python packages in Snowflake.
@@ -1174,6 +1190,16 @@ def reduce_describe_query_enabled(self, value: bool) -> None:
11741190
"value for reduce_describe_query_enabled must be True or False!"
11751191
)
11761192

1193+
@dummy_row_pos_optimization_enabled.setter
1194+
def dummy_row_pos_optimization_enabled(self, value: bool) -> None:
1195+
"""Set the value for dummy_row_pos_optimization_enabled"""
1196+
if value in [True, False]:
1197+
self._dummy_row_pos_optimization_enabled = value
1198+
else:
1199+
raise ValueError(
1200+
"value for dummy_row_pos_optimization_enabled must be True or False!"
1201+
)
1202+
11771203
@custom_package_usage_config.setter
11781204
@experimental_parameter(version="1.6.0")
11791205
def custom_package_usage_config(self, config: Dict) -> None:

tests/integ/modin/test_faster_pandas.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,16 @@
22
# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
33
#
44

5+
import copy
56
import modin.pandas as pd
67
import pandas as native_pd
78

89
from snowflake.snowpark._internal.utils import TempObjectType
910
import snowflake.snowpark.modin.plugin # noqa: F401
11+
from snowflake.snowpark.session import (
12+
_SNOWPARK_PANDAS_DUMMY_ROW_POS_OPTIMIZATION_ENABLED,
13+
Session,
14+
)
1015
from tests.integ.modin.utils import assert_frame_equal, assert_index_equal
1116
from tests.integ.utils.sql_counter import sql_count_checker
1217
from tests.utils import Utils
@@ -129,3 +134,60 @@ def test_read_filter_groupby_agg(session):
129134

130135
# compare results
131136
assert_frame_equal(snow_result, native_result)
137+
138+
139+
@sql_count_checker(query_count=5, join_count=1)
140+
def test_read_filter_join_flag_disabled(session):
141+
# test a chain of operations that are fully supported in faster pandas
142+
# but with the dummy_row_pos_optimization_enabled flag turned off
143+
session.dummy_row_pos_optimization_enabled = False
144+
145+
# create tables
146+
table_name1 = Utils.random_name_for_temp_object(TempObjectType.TABLE)
147+
session.create_dataframe(
148+
native_pd.DataFrame([[1, 11], [2, 12], [3, 13]], columns=["A", "B"])
149+
).write.save_as_table(table_name1, table_type="temp")
150+
table_name2 = Utils.random_name_for_temp_object(TempObjectType.TABLE)
151+
session.create_dataframe(
152+
native_pd.DataFrame([[1, 21], [2, 22], [3, 23]], columns=["C", "D"])
153+
).write.save_as_table(table_name2, table_type="temp")
154+
155+
# create snow dataframes
156+
df1 = pd.read_snowflake(table_name1)
157+
df2 = pd.read_snowflake(table_name2)
158+
snow_result = df1[df1["B"] > 11].merge(
159+
df2[df2["D"] == 22], left_on="A", right_on="C"
160+
)
161+
162+
# verify that the input dataframes have an empty relaxed query compiler
163+
assert df1._query_compiler._relaxed_query_compiler is None
164+
assert df2._query_compiler._relaxed_query_compiler is None
165+
# verify that the output dataframe also has an empty relaxed query compiler
166+
assert snow_result._query_compiler._relaxed_query_compiler is None
167+
168+
# create pandas dataframes
169+
native_df1 = df1.to_pandas()
170+
native_df2 = df2.to_pandas()
171+
native_result = native_df1[native_df1["B"] > 11].merge(
172+
native_df2[native_df2["D"] == 22], left_on="A", right_on="C"
173+
)
174+
175+
# compare results
176+
assert_frame_equal(snow_result, native_result)
177+
178+
179+
@sql_count_checker(query_count=0)
180+
def test_dummy_row_pos_optimization_enabled_on_session(db_parameters):
181+
with Session.builder.configs(db_parameters).create() as new_session:
182+
default_value = new_session.dummy_row_pos_optimization_enabled
183+
new_session.dummy_row_pos_optimization_enabled = not default_value
184+
assert new_session.dummy_row_pos_optimization_enabled is not default_value
185+
new_session.dummy_row_pos_optimization_enabled = default_value
186+
assert new_session.dummy_row_pos_optimization_enabled is default_value
187+
188+
parameters = copy.deepcopy(db_parameters)
189+
parameters["session_parameters"] = {
190+
_SNOWPARK_PANDAS_DUMMY_ROW_POS_OPTIMIZATION_ENABLED: not default_value
191+
}
192+
with Session.builder.configs(parameters).create() as new_session2:
193+
assert new_session2.dummy_row_pos_optimization_enabled is not default_value

0 commit comments

Comments
 (0)