Skip to content

Commit ba2d66f

Browse files
SNOW-2331792: Support DataFrame.eval for single-level index. (#3782)
Support all the parameters of `DataFrame.eval` via the Modin implementation. Only support single-level row index for now because the MultiIndex is currently a pandas object, so we have to get resolvers for its levels differently. Signed-off-by: sfc-gh-mvashishtha <mahesh.vashishtha@snowflake.com>
1 parent cfd5d16 commit ba2d66f

7 files changed

Lines changed: 590 additions & 4 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@
7676
- Added a new option `cacheResult` to `DataFrameReader.xml` that allows users to cache the result of the XML reader to a temporary table after calling `xml`. It helps improve performance when subsequent operations are performed on the same DataFrame.
7777

7878
### Snowpark pandas API Updates
79+
- Added support for `DataFrame.eval()` for dataframes with single-level indexes.
7980

8081
#### New Features
8182

docs/source/modin/supported/dataframe_supported.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ Methods
165165
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
166166
| ``equals`` | Y | | |
167167
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
168-
| ``eval`` | N | | |
168+
| ``eval`` | P | | No support for dataframes with a row MultiIndex. |
169169
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
170170
| ``ewm`` | N | | |
171171
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+

src/snowflake/snowpark/modin/plugin/_internal/utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,11 @@
119119

120120
_logger = logging.getLogger(__name__)
121121

122+
# Flag guarding certain features available only in newer modin versions.
123+
# Snowpark pandas supports the newest two released versions of modin; update this flag and remove legacy
124+
# code as needed when we bump dependency versions.
125+
MODIN_IS_AT_LEAST_0_36_0 = version.parse(pd.__version__) >= version.parse("0.36.0")
126+
122127

123128
# This is the default statement parameters for queries from Snowpark pandas API. It provides the fine grain metric for
124129
# the server to track all pandas API usage.

src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from modin.pandas.base import BasePandasDataset
3737
from modin.pandas.io import from_pandas
3838
from modin.pandas.utils import is_scalar
39+
from modin.core.computation.eval import eval as _eval
3940
from modin.core.storage_formats.pandas.query_compiler_caster import (
4041
register_function_for_pre_op_switch,
4142
)
@@ -226,9 +227,63 @@ def dot(self, other): # noqa: PR01, RT01, D200
226227
pass # pragma: no cover
227228

228229

229-
@register_dataframe_not_implemented()
230+
# Override eval because
231+
# 1) we treat the "engine" parameter differently
232+
# 2) We have to update the level parameter to reflect this method's place in
233+
# the function stack. Modin can't do that for us since it doesn't know our
234+
# place in the stack.
235+
@register_dataframe_accessor("eval")
230236
def eval(self, expr, inplace=False, **kwargs): # noqa: PR01, RT01, D200
231-
pass # pragma: no cover
237+
"""
238+
Evaluate a string describing operations on ``DataFrame`` columns.
239+
"""
240+
if self._query_compiler.nlevels() > 1:
241+
# If the rows of this dataframe have a multi-index, we store the index
242+
# as a native_pd.MultiIndex, and the usual method of getting index
243+
# resolvers with _get_index_resolvers() does not work.
244+
ErrorMessage.not_implemented("eval() does not support a multi-level index.")
245+
246+
inplace = validate_bool_kwarg(inplace, "inplace")
247+
248+
# numexpr engine is useful for chained operations on numpy-backed
249+
# arrays. It doesn't support all the syntax that the python engine
250+
# does, and the Snowpark backend doesn't store data in numpy, so the
251+
# numexpr performance optimizations are not useful. Ignore the "engine"
252+
# requirement, and warn the user that if they explicitly select
253+
# engine="numexpr", we will not honor their preference.
254+
if kwargs.get("engine", None) == "numexpr":
255+
WarningMessage.ignored_argument(
256+
operation="eval",
257+
argument="engine",
258+
message="Snowpark pandas always uses the python engine in "
259+
+ "favor of the numexpr engine, even if the numexpr engine is "
260+
+ "available",
261+
)
262+
kwargs["engine"] = "python"
263+
264+
# eval() lets the user reference variables according to dynamic scope
265+
# at `level` stack frames below the stack frame that called eval(). The
266+
# eval() implementation is 4 stack frames above the frame where we execute
267+
# the _eval() implementation:
268+
# 1) query_compiler_caster wrapper dispatches to snowflake implementation
269+
# 2) telemetry wrapper 1
270+
# 3) telemetry wrapper 2 calls this implementation.
271+
# 4) This method implementation calls the _eval() implementation
272+
# so we add 4 to the `level` param.
273+
kwargs["level"] = kwargs.get("level", 0) + 4
274+
275+
index_resolvers = self._get_index_resolvers()
276+
column_resolvers = self._get_cleaned_column_resolvers()
277+
kwargs["resolvers"] = (
278+
*kwargs.get("resolvers", ()),
279+
index_resolvers,
280+
column_resolvers,
281+
)
282+
283+
if "target" not in kwargs:
284+
kwargs["target"] = self
285+
286+
return _eval(expr, inplace=inplace, **kwargs)
232287

233288

234289
@register_dataframe_not_implemented()

tests/integ/modin/conftest.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from snowflake.snowpark.modin.plugin._internal.apply_utils import (
2121
clear_session_udf_and_udtf_caches,
2222
)
23+
from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage
2324
from tests.integ.modin.pandas_api_coverage import PandasAPICoverageGenerator
2425
from tests.integ.utils.sql_counter import (
2526
SqlCounter,
@@ -857,3 +858,12 @@ def testing_dfs_from_read_snowflake(
857858
auto_create_table=True,
858859
)
859860
return pd.read_snowflake(test_table_name), pandas_df
861+
862+
863+
@pytest.fixture(autouse=True)
864+
def clear_printed_warnings() -> Generator[None, None, None]:
865+
# Preserve a copy of the warnings printed before the test.
866+
warnings = set(WarningMessage.printed_warnings)
867+
WarningMessage.printed_warnings.clear()
868+
yield
869+
WarningMessage.printed_warnings = warnings

0 commit comments

Comments
 (0)