diff --git a/CHANGELOG.md b/CHANGELOG.md index d0faf8d22d..c32372fe83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -60,6 +60,7 @@ ### Snowpark pandas API Updates #### New Features +- Added support for `DataFrame.query` for dataframes with single-level indexes. #### Improvements diff --git a/docs/source/modin/supported/dataframe_supported.rst b/docs/source/modin/supported/dataframe_supported.rst index 0fe93a904e..15c28cbdf1 100644 --- a/docs/source/modin/supported/dataframe_supported.rst +++ b/docs/source/modin/supported/dataframe_supported.rst @@ -332,7 +332,7 @@ Methods | | | | ``"linear"`` or ``"nearest"``, and ``method`` is | | | | | ``"single"``. | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``query`` | N | | | +| ``query`` | P | | No support for dataframes with a row MultiIndex. | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``radd`` | P | ``level`` | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py index 7d1c962cfd..a08137c6ab 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py @@ -3294,7 +3294,134 @@ def quantile(): def query(): """ - Query the columns of a ``DataFrame`` with a boolean expression. + Query the columns of a DataFrame with a boolean expression. + + Parameters + ---------- + expr : str + The query string to evaluate. + + You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. 
+ + You can refer to column names that are not valid Python variable names + by surrounding them in backticks. Thus, column names containing spaces + or punctuations (besides underscores) or starting with digits must be + surrounded by backticks. (For example, a column named "Area (cm^2)" would + be referenced as ```Area (cm^2)```). Column names which are Python keywords + (like "list", "for", "import", etc) cannot be used. + + For example, if one of your columns is called ``a a`` and you want + to sum it with ``b``, your query should be ```a a` + b``. + + inplace : bool + Whether to modify the DataFrame rather than creating a new one. + + **kwargs + See the documentation for :func:`eval` for complete details + on the keyword arguments accepted by :meth:`DataFrame.query`. + + Returns + ------- + DataFrame or None + DataFrame resulting from the provided query expression or + None if ``inplace=True``. + + See Also + -------- + eval : Evaluate a string describing operations on + DataFrame columns. + DataFrame.eval : Evaluate a string describing operations on + DataFrame columns. + + Notes + ----- + The result of the evaluation of this expression is first passed to + :attr:`DataFrame.loc` and if that fails because of a + multidimensional key (e.g., a DataFrame) then the result will be passed + to :meth:`DataFrame.__getitem__`. + + This method uses the top-level :func:`eval` function to + evaluate the passed query. + + The :meth:`query` method uses a slightly + modified Python syntax by default. For example, the ``&`` and ``|`` + (bitwise) operators have the precedence of their boolean cousins, + :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python, + however the semantics are different. + + You can change the semantics of the expression by passing the keyword + argument ``parser='python'``. This enforces the same semantics as + evaluation in Python space. 
+ + The :attr:`DataFrame.index` and + :attr:`DataFrame.columns` attributes of the + :class:`DataFrame` instance are placed in the query namespace + by default, which allows you to treat both the index and columns of the + frame as a column in the frame. + The identifier ``index`` is used for the frame index; you can also + use the name of the index to identify it in a query. Please note that + Python keywords may not be used as identifiers. + + *Backtick quoted variables* + + Backtick quoted variables are parsed as literal Python code and + are converted internally to a Python valid identifier. + This can lead to the following problems. + + During parsing a number of disallowed characters inside the backtick + quoted string are replaced by strings that are allowed as a Python identifier. + These characters include all operators in Python, the space character, the + question mark, the exclamation mark, the dollar sign, and the euro sign. + For other characters that fall outside the ASCII range (U+0001..U+007F) + and those that are not further specified in PEP 3131, + the query parser will raise an error. + This excludes whitespace different than the space character, + but also the hashtag (as it is used for comments) and the backtick + itself (backtick can also not be escaped). + + In a special case, quotes that make a pair around a backtick can + confuse the parser. + For example, ```it's` > `that's``` will raise an error, + as it forms a quoted string (``'s > `that'``) with a backtick inside. + + See also the Python documentation about lexical analysis + (https://docs.python.org/3/reference/lexical_analysis.html). + + Examples + -------- + >>> df = pd.DataFrame({'A': range(1, 6), + ... 'B': range(10, 0, -2), + ... 
'C C': range(10, 5, -1)}) + >>> df + A B C C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 + >>> df.query('A > B') + A B C C + 4 5 2 6 + + The previous expression is equivalent to + + >>> df[df.A > df.B] + A B C C + 4 5 2 6 + + For columns with spaces in their name, you can use backtick quoting. + + >>> df.query('B == `C C`') + A B C C + 0 1 10 10 + + The previous expression is equivalent to + + >>> df[df.B == df['C C']] + A B C C + 0 1 10 10 """ def rename(): diff --git a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py index 0f5d0cc12a..fac213d3a8 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py @@ -101,6 +101,7 @@ replace_external_data_keys_with_empty_pandas_series, replace_external_data_keys_with_query_compiler, try_convert_index_to_native, + update_eval_and_query_engine_kwarg_and_maybe_warn, ) from snowflake.snowpark.modin.plugin.utils.error_message import ( ErrorMessage, @@ -128,6 +129,19 @@ ) +# eval() and query() let the user reference variables according to dynamic +# scope at `level` stack frames below the stack frame that called them. If +# a snowflake override function here wraps an eval/query implementation +# that's in another function, the implementation function frame is 4 frames +# above the override function frame: +# 1) query_compiler_caster wrapper dispatches to snowflake implementation +# 2) telemetry wrapper 1 +# 3) telemetry wrapper 2 calls the snowflake implementation. +# 4) The snowflake implementation calls the implementation function. +# so we add 4 to the `level` param. 
+EVAL_QUERY_EXTRA_STACK_LEVELS = 4 + + register_dataframe_accessor = functools.partial( _register_dataframe_accessor, backend="Snowflake" ) @@ -245,32 +259,9 @@ def eval(self, expr, inplace=False, **kwargs): # noqa: PR01, RT01, D200 inplace = validate_bool_kwarg(inplace, "inplace") - # numexpr engine is useful for chained operations on numpy-backed - # arrays. It doesn't support all the syntax that the python engine - # does, and the Snowpark backend doesn't store data in numpy, so the - # numexpr performance optimizations are not useful. Ignore the "engine" - # requirement, and warn the user that if they explicitly select - # engine="numexpr", we will not honor their preference. - if kwargs.get("engine", None) == "numexpr": - WarningMessage.ignored_argument( - operation="eval", - argument="engine", - message="Snowpark pandas always uses the python engine in " - + "favor of the numexpr engine, even if the numexpr engine is " - + "available", - ) - kwargs["engine"] = "python" - - # eval() lets the user reference variables according to dynamic scope - # at `level` stack frames below the stack frame that called eval(). The - # eval() implementation is 4 stack frames above the frame where we execute - # the _eval() implementation: - # 1) query_compiler_caster wrapper dispatches to snowflake implementation - # 2) telemetry wrapper 1 - # 3) telemetry wrapper 2 calls this implementation. - # 4) This method implementation calls the _eval() implementation - # so we add 4 to the `level` param. 
- kwargs["level"] = kwargs.get("level", 0) + 4 + update_eval_and_query_engine_kwarg_and_maybe_warn(kwargs) + + kwargs["level"] = kwargs.get("level", 0) + EVAL_QUERY_EXTRA_STACK_LEVELS index_resolvers = self._get_index_resolvers() column_resolvers = self._get_cleaned_column_resolvers() @@ -327,9 +318,37 @@ def prod( register_dataframe_accessor("product")(prod) -@register_dataframe_not_implemented() -def query(self, expr, inplace=False, **kwargs): # noqa: PR01, RT01, D200 - pass # pragma: no cover +@register_dataframe_accessor("query") +def query(self, expr, inplace=False, **kwargs): + if self._query_compiler.nlevels() > 1: + # If the rows of this dataframe have a multi-index, we store the index + # as a native_pd.MultiIndex, and the usual method of getting index + # resolvers with _get_index_resolvers() does not work. + ErrorMessage.not_implemented("query() does not support a multi-level index.") + + inplace = validate_bool_kwarg(inplace, "inplace") + + update_eval_and_query_engine_kwarg_and_maybe_warn(kwargs) + + # Match pandas.DataFrame.query: always evaluate with target=None, even when + # inplace=True or the caller passed a target. query() never assigns, and a + # None target gives the correct error for assignment or multi-line queries. + kwargs["target"] = None + + key = self.eval( + expr, + inplace=False, + **(kwargs | {"level": kwargs.get("level", 0) + EVAL_QUERY_EXTRA_STACK_LEVELS}), + ) + + try: + result = self.loc[key] + except ValueError: + # when key is multi-dimensional, loc raises an error, but that is + # sometimes a valid query. 
+ result = self[key] + + return self._create_or_update_from_compiler(result._query_compiler, inplace=inplace) @register_dataframe_not_implemented() diff --git a/src/snowflake/snowpark/modin/plugin/extensions/utils.py b/src/snowflake/snowpark/modin/plugin/extensions/utils.py index 252eff468d..a1ff066f7c 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/utils.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/utils.py @@ -13,6 +13,7 @@ import modin.pandas as pd import numpy as np +from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage import pandas from modin.core.storage_formats import BaseQueryCompiler # pragma: no cover from pandas._libs import lib @@ -659,3 +660,27 @@ def register_non_snowflake_accessors( backend=backend, method=method, object_type=object_type ) ) + + +def update_eval_and_query_engine_kwarg_and_maybe_warn(kwargs: dict[str, Any]) -> None: + """ + Update the engine kwarg and warn if the user tries to use numexpr. + + Args: + kwargs: The keyword arguments to eval() or query(). + """ + # numexpr engine is useful for chained operations on numpy-backed + # arrays. It doesn't support all the syntax that the python engine + # does, and the Snowpark backend doesn't store data in numpy, so the + # numexpr performance optimizations are not useful. Ignore the "engine" + # requirement, and warn the user that if they explicitly select + # engine="numexpr", we will not honor their preference. 
+ if kwargs.get("engine", None) == "numexpr": + WarningMessage.ignored_argument( + operation="eval", + argument="engine", + message="Snowpark pandas always uses the python engine in " + + "favor of the numexpr engine, even if the numexpr engine is " + + "available", + ) + kwargs["engine"] = "python" diff --git a/tests/integ/modin/frame/test_eval_and_query/__init__.py b/tests/integ/modin/frame/test_eval_and_query/__init__.py new file mode 100644 index 0000000000..c1a753cccc --- /dev/null +++ b/tests/integ/modin/frame/test_eval_and_query/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved. +# diff --git a/tests/integ/modin/frame/test_eval_and_query/conftest.py b/tests/integ/modin/frame/test_eval_and_query/conftest.py new file mode 100644 index 0000000000..1fd3ce931f --- /dev/null +++ b/tests/integ/modin/frame/test_eval_and_query/conftest.py @@ -0,0 +1,92 @@ +# +# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved. +# + +import pytest +import pandas as native_pd +from tests.integ.modin.utils import ( + create_test_dfs, +) + + +@pytest.fixture +def test_dfs(): + return create_test_dfs( + native_pd.DataFrame( + { + "CUSTOMER_KEY": [-10, -10, -5, 30, 0, 1, 25], + "ACCOUNT_BALANCE": [-101, -101, -51, 30, 0, 53, 105], + "MARKET_SEGMENT": [ + "AUTOMOBILE", + "AUTOMOBILE", + "FURNITURE", + "AUTOMOBILE", + "FURNITURE", + "MACHINERY", + "HOUSEHOLD", + ], + "PURCHASE COUNT": [1, 0, 5, 7, 9, 10, 0], + }, + index=["i0", "i1", "i2", "i3", "i4", "i5", "i6"], + ) + ) + + +@pytest.fixture +def test_dfs_with_named_index(): + return create_test_dfs( + native_pd.DataFrame( + { + "CUSTOMER_KEY": [-10, -10, -5, 30, 0, 1, 25], + "ACCOUNT_BALANCE": [-101, -101, -51, 30, 0, 53, 105], + "MARKET_SEGMENT": [ + "AUTOMOBILE", + "AUTOMOBILE", + "FURNITURE", + "AUTOMOBILE", + "FURNITURE", + "MACHINERY", + "HOUSEHOLD", + ], + }, + index=native_pd.Index( + ["i0", "i1", "i2", "i3", "i4", "i5", "i6"], name="index_name" + ), + ) + ) + 
+ +@pytest.fixture +def test_dfs_multiindex(): + return create_test_dfs( + native_pd.DataFrame( + { + "CUSTOMER_KEY": [-10, -10, -5, 30, 0, 1, 25], + "ACCOUNT_BALANCE": [-101, -101, -51, 30, 0, 53, 105], + "MARKET_SEGMENT": [ + "AUTOMOBILE", + "AUTOMOBILE", + "FURNITURE", + "AUTOMOBILE", + "FURNITURE", + "MACHINERY", + "HOUSEHOLD", + ], + }, + index=native_pd.MultiIndex.from_tuples( + [ + ("i00", "i01"), + ("i10", "i11"), + ("i20", "i21"), + ( + "i30", + "i31", + ), + ("i40", "i41"), + ("i50", "i51"), + ("i60", "i61"), + ], + names=["level_0_name", "level_1_name"], + ), + ) + ) diff --git a/tests/integ/modin/frame/test_eval.py b/tests/integ/modin/frame/test_eval_and_query/test_eval.py similarity index 81% rename from tests/integ/modin/frame/test_eval.py rename to tests/integ/modin/frame/test_eval_and_query/test_eval.py index 0642532045..439c8472e1 100644 --- a/tests/integ/modin/frame/test_eval.py +++ b/tests/integ/modin/frame/test_eval_and_query/test_eval.py @@ -7,7 +7,6 @@ import pytest from tests.integ.modin.utils import ( assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, - create_test_dfs, eval_snowpark_pandas_result, ) import pandas as native_pd @@ -15,6 +14,10 @@ import logging from snowflake.snowpark.modin.plugin._internal.utils import MODIN_IS_AT_LEAST_0_36_0 from pytest import param +from tests.integ.modin.frame.test_eval_and_query.utils import ( + ENGINE_IGNORED_MESSAGE, + engine_parameters, +) import modin.pandas as pd pytestmark = pytest.mark.skipif( @@ -23,23 +26,6 @@ ) -ENGINE_IGNORED_MESSAGE = ( - "The argument `engine` of `eval` has been ignored by Snowpark pandas " - + "API:\nSnowpark pandas always uses the python engine in favor of " - + "the numexpr engine, even if the numexpr engine is available." 
-) - - -engine_parameters = pytest.mark.parametrize( - "engine_kwargs", - [ - param({"engine": "python"}, id="engine_python"), - param({"engine": "numexpr"}, id="engine_numexpr"), - param({}, id="default_engine"), - ], -) - - def python_eval(df, expr, *, inplace=False, **kwargs): """ Implement DataFrame.eval(), but always use engine='python' for pandas dataframes. @@ -58,88 +44,6 @@ def python_eval(df, expr, *, inplace=False, **kwargs): return df.eval(expr, inplace=inplace, **(kwargs | {"engine": "python"})) -@pytest.fixture -def test_dfs(): - return create_test_dfs( - native_pd.DataFrame( - { - "CUSTOMER_KEY": [-10, -10, -5, 30, 0, 1, 25], - "ACCOUNT_BALANCE": [-101, -101, -51, 30, 0, 53, 105], - "MARKET_SEGMENT": [ - "AUTOMOBILE", - "AUTOMOBILE", - "FURNITURE", - "AUTOMOBILE", - "FURNITURE", - "MACHINERY", - "HOUSEHOLD", - ], - }, - index=["i0", "i1", "i2", "i3", "i4", "i5", "i6"], - ) - ) - - -@pytest.fixture -def test_dfs_with_named_index(): - return create_test_dfs( - native_pd.DataFrame( - { - "CUSTOMER_KEY": [-10, -10, -5, 30, 0, 1, 25], - "ACCOUNT_BALANCE": [-101, -101, -51, 30, 0, 53, 105], - "MARKET_SEGMENT": [ - "AUTOMOBILE", - "AUTOMOBILE", - "FURNITURE", - "AUTOMOBILE", - "FURNITURE", - "MACHINERY", - "HOUSEHOLD", - ], - }, - index=native_pd.Index( - ["i0", "i1", "i2", "i3", "i4", "i5", "i6"], name="index_name" - ), - ) - ) - - -@pytest.fixture -def test_dfs_multiindex(): - return create_test_dfs( - native_pd.DataFrame( - { - "CUSTOMER_KEY": [-10, -10, -5, 30, 0, 1, 25], - "ACCOUNT_BALANCE": [-101, -101, -51, 30, 0, 53, 105], - "MARKET_SEGMENT": [ - "AUTOMOBILE", - "AUTOMOBILE", - "FURNITURE", - "AUTOMOBILE", - "FURNITURE", - "MACHINERY", - "HOUSEHOLD", - ], - }, - index=native_pd.MultiIndex.from_tuples( - [ - ("i00", "i01"), - ("i10", "i11"), - ("i20", "i21"), - ( - "i30", - "i31", - ), - ("i40", "i41"), - ("i50", "i51"), - ("i60", "i61"), - ], - names=["level_0_name", "level_1_name"], - ), - ) - ) - - global_int = 10 diff --git 
a/tests/integ/modin/frame/test_eval_and_query/test_query.py b/tests/integ/modin/frame/test_eval_and_query/test_query.py new file mode 100644 index 0000000000..263bbae9bc --- /dev/null +++ b/tests/integ/modin/frame/test_eval_and_query/test_query.py @@ -0,0 +1,385 @@ +# +# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved. +# + +import pytest +from pytest import param +import pandas as native_pd +import modin.pandas as pd +import re + +from tests.integ.utils.sql_counter import sql_count_checker +from snowflake.snowpark.modin.plugin._internal.utils import MODIN_IS_AT_LEAST_0_36_0 +import logging +from tests.integ.modin.frame.test_eval_and_query.utils import ( + engine_parameters, + ENGINE_IGNORED_MESSAGE, +) +from tests.integ.modin.utils import ( + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, + eval_snowpark_pandas_result, +) + +pytestmark = pytest.mark.skipif( + not MODIN_IS_AT_LEAST_0_36_0, + reason="Modin 0.36 had an important performant fix for query().", +) + +global_int = 10 + + +def python_query(df, expr, *, inplace=False, **kwargs): + """ + Implement DataFrame.query(), but always use engine='python' for pandas dataframes. + + pandas query() by default will use numexpr engine if numexpr is available, + but numexpr lacks some features that are available with the python engine. + Snowpark pandas ignores the engine argument, so we test against + engine='python' in pandas. + """ + # have to add an extra stack level since we are wrapping query in another + # function call. 
+ kwargs["level"] = kwargs.get("level", 0) + 1 + if isinstance(df, pd.DataFrame): + return df.query(expr, inplace=inplace, **kwargs) + assert isinstance(df, native_pd.DataFrame) + return df.query(expr, inplace=inplace, **(kwargs | {"engine": "python"})) + + +@sql_count_checker(query_count=1) +@pytest.mark.parametrize( + "expr", + [ + "CUSTOMER_KEY < @global_int", + "~(CUSTOMER_KEY < @global_int)", + "MARKET_SEGMENT == 'MACHINERY' or ACCOUNT_BALANCE > @local_int", + "MARKET_SEGMENT.isin(('AUTOMOBILE', 'MACHINERY'))", + "`PURCHASE COUNT` != 0", + "@df.isin([0])", + ], +) +@engine_parameters +def test_default_parameters( + test_dfs, + expr, + engine_kwargs, +): + def do_query(df, expr): + local_int = 0 # noqa: F841 + local_list = ["MACHINERY", "FURNITURE"] # noqa: F841 + return python_query(df, expr, **engine_kwargs) + + eval_snowpark_pandas_result( + *test_dfs, + lambda df: do_query(df, expr), + ) + + +@sql_count_checker(query_count=1) +@engine_parameters +def test_local_dict(test_dfs, engine_kwargs): + def do_query(df): + local_int = 0 # noqa: F841 + local_list = ["MACHINERY", "FURNITURE"] # noqa: F841 + return python_query( + df, + """ + ACCOUNT_BALANCE > @local_int or MARKET_SEGMENT.isin(@local_list) and not @local_str == MARKET_SEGMENT + """, + local_dict={ + "local_int": 100, + "local_list": ["MACHINERY"], + "local_str": "MACHINERY", + }, + **engine_kwargs, + ) + + eval_snowpark_pandas_result(*test_dfs, do_query) + + +@sql_count_checker(query_count=1) +@engine_parameters +def test_global_dict(test_dfs, engine_kwargs): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: python_query( + df, + "ACCOUNT_BALANCE > @global_int or ACCOUNT_BALANCE < @dict_int", + global_dict={"global_int": global_int + 1000, "dict_int": 3}, + **engine_kwargs, + ), + ) + + +@engine_parameters +class TestParser: + @sql_count_checker(query_count=1) + def test_pandas_supports_list_comparison( + self, + test_dfs, + engine_kwargs, + ): + """Test that pandas parser supports comparing 
series to list""" + + def do_query(df): + return python_query( + df, + "MARKET_SEGMENT == ['AUTOMOBILE', 'MACHINERY']", + parser="pandas", + **engine_kwargs, + ) + + eval_snowpark_pandas_result( + *test_dfs, + do_query, + ) + + @sql_count_checker(query_count=0) + def test_python_fails_list_comparison(self, test_dfs, engine_kwargs): + """Test that python parser fails with comparing series to list""" + + def do_query(df): + return python_query( + df, + "MARKET_SEGMENT == ['AUTOMOBILE', 'MACHINERY']", + parser="python", + **engine_kwargs, + ) + + eval_snowpark_pandas_result( + *test_dfs, + do_query, + expect_exception=True, + expect_exception_type=NotImplementedError, + expect_exception_match=re.escape("'In' nodes are not implemented"), + ) + + @sql_count_checker(query_count=1) + def test_pandas_supports_boolean_operators(self, test_dfs, engine_kwargs): + """Test that pandas parser supports 'and'/'or' boolean operators""" + + def do_query(df): + return python_query( + df, + "CUSTOMER_KEY > 0 and ACCOUNT_BALANCE > 50", + parser="pandas", + **engine_kwargs, + ) + + eval_snowpark_pandas_result(*test_dfs, do_query) + + @sql_count_checker(query_count=0) + def test_python_fails_boolean_operators(self, test_dfs, engine_kwargs): + """Test that python parser fails with 'and'/'or' boolean operators""" + + def do_query(df): + return python_query( + df, + "CUSTOMER_KEY > 0 and ACCOUNT_BALANCE > 50", + parser="python", + **engine_kwargs, + ) + + eval_snowpark_pandas_result( + *test_dfs, + do_query, + expect_exception=True, + expect_exception_type=NotImplementedError, + expect_exception_match=re.escape("'BoolOp' nodes are not implemented"), + ) + + @sql_count_checker(query_count=1) + @pytest.mark.parametrize("parser", ["pandas", "python"]) + def test_works_with_bitwise_operators( + self, + test_dfs, + parser, + engine_kwargs, + ): + """Test that both parsers work with bitwise operators (&/|)""" + + def do_query(df): + return python_query( + df, + "(CUSTOMER_KEY > 0) & 
(ACCOUNT_BALANCE > 50)", + parser=parser, + **engine_kwargs, + ) + + eval_snowpark_pandas_result(*test_dfs, do_query) + + +@sql_count_checker(query_count=1) +def test_warning_for_explicit_numexpr_engine(test_dfs, caplog): # noqa: F811 + with caplog.at_level(logging.WARNING): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: python_query(df, "CUSTOMER_KEY > 0", engine="numexpr"), + ) + + assert any(record.message == ENGINE_IGNORED_MESSAGE for record in caplog.records) + + +@sql_count_checker(query_count=1) +def test_no_warning_for_default_numexpr_engine(test_dfs, caplog): # noqa: F811 + with caplog.at_level(logging.WARNING): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: python_query(df, "CUSTOMER_KEY > 0"), + ) + + assert not any( + record.message == ENGINE_IGNORED_MESSAGE for record in caplog.records + ) + + +@sql_count_checker(query_count=1) +@engine_parameters +def test_resolvers(test_dfs, engine_kwargs): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: python_query( + df, + "ACCOUNT_BALANCE > key1 or ACCOUNT_BALANCE == key2", + resolvers=({"key1": 0}, {"key1": 1000, "key2": 53}), + **engine_kwargs, + ), + ) + + +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("level", [0, 1, 2]) +@engine_parameters +def test_level( + test_dfs, + level, + engine_kwargs, +): + def level_0(df): + local_var = "i0" # noqa: F841 + return python_query(df, "index == @local_var", level=level, **engine_kwargs) + + def level_1(df): + local_var = "i1" # noqa: F841 + return level_0(df) + + def level_2(df): + local_var = "i2" # noqa: F841 + return level_1(df) + + eval_snowpark_pandas_result( + *test_dfs, + level_2, + ) + + +@engine_parameters +class TestInplace: + @pytest.mark.parametrize( + "inplace_kwargs", + [ + param({}, id="inplace_default"), + param({"inplace": False}, id="inplace_False"), + ], + ) + @sql_count_checker(query_count=2) + def test_inplace_false_does_not_mutate_df( + self, test_dfs, engine_kwargs, inplace_kwargs + ): + 
snowpark_input, pandas_input = test_dfs + pandas_original = pandas_input.copy() + eval_snowpark_pandas_result( + snowpark_input, + pandas_input, + lambda df: python_query( + df, "ACCOUNT_BALANCE < 0", **engine_kwargs, **inplace_kwargs + ), + ) + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( + snowpark_input, pandas_original + ) + + @sql_count_checker(query_count=1) + def test_inplace_true_mutates_df( + self, + test_dfs, + engine_kwargs, + ): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: python_query( + df, "ACCOUNT_BALANCE < 0", **engine_kwargs, inplace=True + ), + inplace=True, + ) + + +@pytest.mark.parametrize( + "do_query", + ( + param( + lambda df: python_query(df, "ACCOUNT_BALANCE == @undefined_variable"), + id="undefined_variable", + ), + param( + lambda df: python_query(df, "x = ACCOUNT_BALANCE * 2"), + id="assignment", + ), + param( + lambda df: python_query(df, "ACCOUNT_BALANCE < 0\nACCOUNT_BALANCE > 0"), + id="multiple_expression_lines", + ), + param( + lambda df: python_query(df, "ACCOUNT_BALANCE ==", inplace=True), + id="invalid_syntax", + ), + ), +) +@sql_count_checker(query_count=0) +def test_user_error(test_dfs, do_query): + eval_snowpark_pandas_result( + *test_dfs, + do_query, + expect_exception=True, + ) + + +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("expr", ("ilevel_0 == 'i0'", "index == 'i0'")) +def test_refer_to_unnamed_single_level_index(test_dfs, expr): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: python_query(df, expr=expr), + ) + + +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("expr", ("index_name == 'i0'", "index == 'i0'")) +def test_refer_to_named_single_level_index(test_dfs_with_named_index, expr): + eval_snowpark_pandas_result( + *test_dfs_with_named_index, + lambda df: python_query(df, expr=expr), + ) + + +@pytest.mark.xfail( + strict=True, raises=NotImplementedError, reason="No support for multiindex" +) +def 
test_multiindex(test_dfs_multiindex): + eval_snowpark_pandas_result( + *test_dfs_multiindex, + lambda df: python_query(df, expr="level_1_name == 'i01'"), + ) + + +@sql_count_checker(query_count=2) +def test_target(test_dfs): + snow_df, pandas_df = test_dfs + snow_df_copy = snow_df.copy() + pandas_df_copy = pandas_df.copy() + snow_df.query("ACCOUNT_BALANCE < 0", target=snow_df_copy, inplace=True) + pandas_df.query("ACCOUNT_BALANCE < 0", target=pandas_df_copy, inplace=True) + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_df, pandas_df) + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( + snow_df_copy, pandas_df_copy + ) diff --git a/tests/integ/modin/frame/test_eval_and_query/utils.py b/tests/integ/modin/frame/test_eval_and_query/utils.py new file mode 100644 index 0000000000..cc04f1f2a4 --- /dev/null +++ b/tests/integ/modin/frame/test_eval_and_query/utils.py @@ -0,0 +1,22 @@ +# +# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved. +# + +import pytest +from pytest import param + +ENGINE_IGNORED_MESSAGE = ( + "The argument `engine` of `eval` has been ignored by Snowpark pandas " + + "API:\nSnowpark pandas always uses the python engine in favor of " + + "the numexpr engine, even if the numexpr engine is available." +) + + +engine_parameters = pytest.mark.parametrize( + "engine_kwargs", + [ + param({"engine": "python"}, id="engine_python"), + param({"engine": "numexpr"}, id="engine_numexpr"), + param({}, id="default_engine"), + ], +) diff --git a/tests/integ/modin/window/__init__.py b/tests/integ/modin/window/__init__.py new file mode 100644 index 0000000000..c1a753cccc --- /dev/null +++ b/tests/integ/modin/window/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved. 
+# diff --git a/tests/unit/modin/test_unsupported.py b/tests/unit/modin/test_unsupported.py index 036296250f..d50768c6d5 100644 --- a/tests/unit/modin/test_unsupported.py +++ b/tests/unit/modin/test_unsupported.py @@ -93,7 +93,6 @@ def test_unsupported_general(general_method, kwargs): ["pipe", {"func": ""}], ["prod", {}], ["product", {}], - ["query", {"expr": ""}], ["reindex_like", {"other": ""}], ["reorder_levels", {"order": ""}], ["sem", {}],