Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
#### New Features

#### Improvements

- Hybrid execution mode is now enabled by default. Certain operations on smaller data now automatically execute in memory using native pandas. Use `from modin.config import AutoSwitchBackend; AutoSwitchBackend.disable()` to turn this off and force all execution to occur in Snowflake.
- Added a session parameter `pandas_hybrid_execution_enabled` to enable/disable hybrid execution as an alternative to using `AutoSwitchBackend`.
- Removed an unnecessary `SHOW OBJECTS` query issued from `read_snowflake` under certain conditions.

## 1.39.0 (YYYY-MM-DD)
Expand Down
48 changes: 48 additions & 0 deletions src/snowflake/snowpark/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,7 @@
_SNOWPARK_PANDAS_DUMMY_ROW_POS_OPTIMIZATION_ENABLED = (
"SNOWPARK_PANDAS_DUMMY_ROW_POS_OPTIMIZATION_ENABLED"
)
# Name of the client-side session parameter that controls pandas hybrid execution.
# It is looked up during session initialization (falling back to the current
# modin ``AutoSwitchBackend`` value) and the result is written back to
# ``AutoSwitchBackend``.
_SNOWPARK_PANDAS_HYBRID_EXECUTION_ENABLED = "SNOWPARK_PANDAS_HYBRID_EXECUTION_ENABLED"

# AST encoding.
_PYTHON_SNOWPARK_USE_AST = "PYTHON_SNOWPARK_USE_AST"
Expand Down Expand Up @@ -754,6 +755,21 @@ def __init__(
)
)

if importlib.util.find_spec("modin"):
try:
from modin.config import AutoSwitchBackend

pandas_hybrid_execution_enabled: bool = (
self._conn._get_client_side_session_parameter(
_SNOWPARK_PANDAS_HYBRID_EXECUTION_ENABLED,
AutoSwitchBackend().get(),
)
)
AutoSwitchBackend.put(pandas_hybrid_execution_enabled)
except Exception:
# Continue session initialization even if Modin configuration fails
pass
Comment thread
sfc-gh-helmeleegy marked this conversation as resolved.

self._thread_store = create_thread_local(
self._conn._thread_safe_session_enabled
)
Expand Down Expand Up @@ -1025,6 +1041,21 @@ def dummy_row_pos_optimization_enabled(self) -> bool:
"""
return self._dummy_row_pos_optimization_enabled

@property
def pandas_hybrid_execution_enabled(self) -> bool:
    """Report whether hybrid execution mode is currently enabled.

    The value is read straight from modin's ``AutoSwitchBackend`` configuration,
    so it has the same default as ``AutoSwitchBackend``. When hybrid execution is
    enabled, certain operations on smaller data will automatically execute in
    native pandas in-memory, which can significantly improve performance for
    operations that are more efficient in pandas than in Snowflake.

    Raises:
        ImportError: If the ``modin`` package cannot be found.
    """
    modin_spec = importlib.util.find_spec("modin")
    if modin_spec is None:
        raise ImportError(
            "The 'modin' package is required to enable this feature. Please install it first."
        )

    # Deferred import: only attempted once the spec lookup above has found modin.
    from modin.config import AutoSwitchBackend

    hybrid_switch = AutoSwitchBackend()
    return hybrid_switch.get()
Comment thread
sfc-gh-helmeleegy marked this conversation as resolved.

@property
def custom_package_usage_config(self) -> Dict:
"""Get or set configuration parameters related to usage of custom Python packages in Snowflake.
Expand Down Expand Up @@ -1200,6 +1231,23 @@ def dummy_row_pos_optimization_enabled(self, value: bool) -> None:
"value for dummy_row_pos_optimization_enabled must be True or False!"
)

@pandas_hybrid_execution_enabled.setter
def pandas_hybrid_execution_enabled(self, value: bool) -> None:
    """Turn hybrid execution on or off by updating modin's ``AutoSwitchBackend``.

    Args:
        value: ``True`` to enable hybrid execution, ``False`` to disable it.

    Raises:
        ImportError: If the ``modin`` package cannot be found.
        ValueError: If ``value`` is neither ``True`` nor ``False``.
    """
    if importlib.util.find_spec("modin") is None:
        raise ImportError(
            "The 'modin' package is required to enable this feature. Please install it first."
        )

    # Deferred import: only attempted once the spec lookup above has found modin.
    from modin.config import AutoSwitchBackend

    # Reject anything that is not a boolean before touching the modin config.
    if value not in (True, False):
        raise ValueError(
            "value for pandas_hybrid_execution_enabled must be True or False!"
        )
    AutoSwitchBackend.put(value)

@custom_package_usage_config.setter
@experimental_parameter(version="1.6.0")
def custom_package_usage_config(self, config: Dict) -> None:
Expand Down
27 changes: 25 additions & 2 deletions tests/integ/modin/hybrid/test_switch_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,8 +395,9 @@ def test_tqdm_usage_during_snowflake_to_pandas_switch():
("Series", "transform", (lambda x: x * 2,)), # declared in series_overrides
],
)
@pytest.mark.parametrize("use_session_param", [True, False])
@sql_count_checker(query_count=1)
def test_unimplemented_autoswitches(class_name, method_name, f_args):
def test_unimplemented_autoswitches(class_name, method_name, f_args, use_session_param):
# Unimplemented methods declared via register_*_not_implemented should automatically
# default to local pandas execution.
# This test needs to be modified if any of the APIs in question are ever natively implemented
Expand All @@ -405,6 +406,13 @@ def test_unimplemented_autoswitches(class_name, method_name, f_args):
method = getattr(getattr(pd, class_name)(data).move_to("Snowflake"), method_name)
# Attempting to call the method without switching should raise.
with config_context(AutoSwitchBackend=False):
if use_session_param:
from modin.config import AutoSwitchBackend

AutoSwitchBackend.enable()
pd.session.pandas_hybrid_execution_enabled = False
assert pd.session.pandas_hybrid_execution_enabled is False
assert AutoSwitchBackend.get() is False
with pytest.raises(
NotImplementedError, match="Snowpark pandas does not yet support the method"
):
Expand Down Expand Up @@ -433,14 +441,15 @@ def test_to_datetime():
assert isinstance(result, DatetimeIndex)


@pytest.mark.parametrize("use_session_param", [True, False])
@sql_count_checker(
query_count=11,
join_count=6,
udtf_count=2,
high_count_expected=True,
high_count_reason="tests queries across different execution modes",
)
def test_query_count_no_switch(init_transaction_tables):
def test_query_count_no_switch(init_transaction_tables, use_session_param):
"""
Tests that when there is no switching behavior the query count is the
same under hybrid mode and non-hybrid mode.
Expand All @@ -458,11 +467,25 @@ def inner_test(df_in):
hybrid_len = None
with pd.session.query_history() as query_history_orig:
with config_context(AutoSwitchBackend=False, NativePandasMaxRows=10):
if use_session_param:
from modin.config import AutoSwitchBackend

AutoSwitchBackend.enable()
pd.session.pandas_hybrid_execution_enabled = False
assert pd.session.pandas_hybrid_execution_enabled is False
assert AutoSwitchBackend.get() is False
df_result = inner_test(df_transactions)
orig_len = len(df_result)

with pd.session.query_history() as query_history_hybrid:
with config_context(AutoSwitchBackend=True, NativePandasMaxRows=10):
if use_session_param:
from modin.config import AutoSwitchBackend

AutoSwitchBackend.disable()
pd.session.pandas_hybrid_execution_enabled = True
assert pd.session.pandas_hybrid_execution_enabled is True
assert AutoSwitchBackend.get() is True
df_result = inner_test(df_transactions)
hybrid_len = len(df_result)

Expand Down
Loading