diff --git a/CHANGELOG.md b/CHANGELOG.md index 4031c3a6dd..eeb4140527 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,14 @@ #### New Features +### Snowpark pandas API Updates + +#### New Features + +#### Improvements +- Hybrid execution mode is now enabled by default. Certain operations on smaller data will now automatically execute in native pandas in-memory. Use `from modin.config import AutoSwitchBackend; AutoSwitchBackend.disable()` to turn this off and force all execution to occur in Snowflake. +- Removed an unnecessary `SHOW OBJECTS` query issued from `read_snowflake` under certain conditions. + ## 1.39.0 (YYYY-MM-DD) ### Snowpark Python API Updates @@ -73,7 +81,6 @@ #### Improvements -- Hybrid execution mode is now enabled by default. Certain operations on smaller data will now automatically execute in native pandas in-memory. Use `from modin.config import AutoSwitchBackend; AutoSwitchBackend.disable()` to turn this off and force all execution to occur in Snowflake. - Downgraded to level `logging.DEBUG - 1` the log message saying that the Snowpark `DataFrame` reference of an internal `DataFrameReference` object has changed. diff --git a/src/snowflake/snowpark/modin/plugin/_internal/utils.py b/src/snowflake/snowpark/modin/plugin/_internal/utils.py index e0c2a5067b..d1a52c428a 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/utils.py @@ -322,7 +322,9 @@ def _create_read_only_table( def create_initial_ordered_dataframe( table_name_or_query: Union[str, Iterable[str]], enforce_ordering: bool, + *, dummy_row_pos_mode: bool = False, + row_count_hint: Optional[int] = None, ) -> tuple[OrderedDataFrame, str]: """ create read only temp table on top of the existing table or Snowflake query if required, and create a OrderedDataFrame @@ -334,6 +336,11 @@ def create_initial_ordered_dataframe( enforce_ordering: If True, create a read only temp table on top of the existing table or Snowflake query, and create the OrderedDataFrame using the read only temp table created. Otherwise, directly using the existing table. + dummy_row_pos_mode: If True, uses "dummy" row position columns to avoid a potentially + expensive ROW_NUMBER() query. + row_count_hint: An optional hint for the exact row count of the frame. This is used in scenarios + where we have already performed a query for the size of the underlying data, and can re-use + the value. Returns: OrderedDataFrame with row position column. @@ -502,8 +509,10 @@ def create_initial_ordered_dataframe( ordered_dataframe.row_position_snowflake_quoted_identifier ) - materialized_row_count = None - if not is_query: + if row_count_hint is not None: + ordered_dataframe.row_count = row_count_hint + ordered_dataframe.row_count_upper_bound = row_count_hint + elif not is_query: materialized_row_count = get_object_metadata_row_count(table_name_or_query) ordered_dataframe.row_count = materialized_row_count ordered_dataframe.row_count_upper_bound = materialized_row_count diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 612055329c..94ac31ea34 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -1479,6 +1479,11 @@ def from_snowflake( table_name_or_query=name_or_query, enforce_ordering=enforce_ordering, dummy_row_pos_mode=dummy_row_pos_mode, + row_count_hint=( + relaxed_query_compiler._modin_frame.ordered_dataframe.row_count + if relaxed_query_compiler is not None + else None + ), ) pandas_labels_to_snowflake_quoted_identifiers_map = { # pandas labels of resulting Snowpark pandas dataframe will be snowflake identifier