diff --git a/CHANGELOG.md b/CHANGELOG.md index eeb4140527..636e5f25f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,9 @@ #### New Features #### Improvements + - Hybrid execution mode is now enabled by default. Certain operations on smaller data will now automatically execute in native pandas in-memory. Use `from modin.config import AutoSwitchBackend; AutoSwitchBackend.disable()` to turn this off and force all execution to occur in Snowflake. +- Added a session parameter `pandas_hybrid_execution_enabled` to enable/disable hybrid execution as an alternative to using `AutoSwitchBackend`. - Removed an unnecessary `SHOW OBJECTS` query issued from `read_snowflake` under certain conditions. ## 1.39.0 (YYYY-MM-DD) diff --git a/src/snowflake/snowpark/session.py b/src/snowflake/snowpark/session.py index 10f3874dd5..e07b8e3737 100644 --- a/src/snowflake/snowpark/session.py +++ b/src/snowflake/snowpark/session.py @@ -304,6 +304,7 @@ _SNOWPARK_PANDAS_DUMMY_ROW_POS_OPTIMIZATION_ENABLED = ( "SNOWPARK_PANDAS_DUMMY_ROW_POS_OPTIMIZATION_ENABLED" ) +_SNOWPARK_PANDAS_HYBRID_EXECUTION_ENABLED = "SNOWPARK_PANDAS_HYBRID_EXECUTION_ENABLED" # AST encoding. 
@property
def pandas_hybrid_execution_enabled(self) -> bool:
    """Whether hybrid execution mode is enabled (same default as AutoSwitchBackend).

    When enabled, certain operations on smaller data will automatically
    execute in native pandas in-memory. This can significantly improve
    performance for operations that are more efficient in pandas than in
    Snowflake.

    The value is not stored on the session: it is read straight from modin's
    ``AutoSwitchBackend`` configuration, so changes made through
    ``AutoSwitchBackend`` are reflected here.

    Returns:
        The current value of ``modin.config.AutoSwitchBackend``.

    Raises:
        ImportError: If the ``modin`` package cannot be found.
    """
    if not importlib.util.find_spec("modin"):
        raise ImportError(
            "The 'modin' package is required to enable this feature. Please install it first."
        )

    # Imported lazily so that modin is only needed when the property is used.
    from modin.config import AutoSwitchBackend

    # Class-level accessor, matching the ``AutoSwitchBackend.put`` call in the
    # setter; no config instance needs to be created just to read the value.
    return AutoSwitchBackend.get()


@pandas_hybrid_execution_enabled.setter
def pandas_hybrid_execution_enabled(self, value: bool) -> None:
    """Enable or disable hybrid execution by updating ``AutoSwitchBackend``.

    Args:
        value: ``True`` to enable hybrid execution, ``False`` to disable it.

    Raises:
        ImportError: If the ``modin`` package cannot be found.
        ValueError: If ``value`` is neither ``True`` nor ``False``.
    """
    if not importlib.util.find_spec("modin"):
        raise ImportError(
            "The 'modin' package is required to enable this feature. Please install it first."
        )

    # Imported lazily so that modin is only needed when the property is used.
    from modin.config import AutoSwitchBackend

    # Reject anything that is not a boolean before touching modin's config.
    if value not in (True, False):
        raise ValueError(
            "value for pandas_hybrid_execution_enabled must be True or False!"
        )

    AutoSwitchBackend.put(value)
with config_context(AutoSwitchBackend=False): + if use_session_param: + from modin.config import AutoSwitchBackend + + AutoSwitchBackend.enable() + pd.session.pandas_hybrid_execution_enabled = False + assert pd.session.pandas_hybrid_execution_enabled is False + assert AutoSwitchBackend.get() is False with pytest.raises( NotImplementedError, match="Snowpark pandas does not yet support the method" ): @@ -433,6 +441,7 @@ def test_to_datetime(): assert isinstance(result, DatetimeIndex) +@pytest.mark.parametrize("use_session_param", [True, False]) @sql_count_checker( query_count=11, join_count=6, @@ -440,7 +449,7 @@ def test_to_datetime(): high_count_expected=True, high_count_reason="tests queries across different execution modes", ) -def test_query_count_no_switch(init_transaction_tables): +def test_query_count_no_switch(init_transaction_tables, use_session_param): """ Tests that when there is no switching behavior the query count is the same under hybrid mode and non-hybrid mode. @@ -458,11 +467,25 @@ def inner_test(df_in): hybrid_len = None with pd.session.query_history() as query_history_orig: with config_context(AutoSwitchBackend=False, NativePandasMaxRows=10): + if use_session_param: + from modin.config import AutoSwitchBackend + + AutoSwitchBackend.enable() + pd.session.pandas_hybrid_execution_enabled = False + assert pd.session.pandas_hybrid_execution_enabled is False + assert AutoSwitchBackend.get() is False df_result = inner_test(df_transactions) orig_len = len(df_result) with pd.session.query_history() as query_history_hybrid: with config_context(AutoSwitchBackend=True, NativePandasMaxRows=10): + if use_session_param: + from modin.config import AutoSwitchBackend + + AutoSwitchBackend.disable() + pd.session.pandas_hybrid_execution_enabled = True + assert pd.session.pandas_hybrid_execution_enabled is True + assert AutoSwitchBackend.get() is True df_result = inner_test(df_transactions) hybrid_len = len(df_result)