snowflakedb · sfc-gh-mvashishtha · Sep 22, 2025 · Sep 19, 2025 · Sep 19, 2025 · Sep 19, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -60,6 +60,7 @@
 ### Snowpark pandas API Updates
 
 #### New Features
+- Added support for `DataFrame.query` for dataframes with single-level indexes.
 
 #### Improvements
 

@@ -332,7 +332,7 @@ Methods
 |                             |                                 |                                  | ``"linear"`` or ``"nearest"``, and ``method`` is   |
 |                             |                                 |                                  | ``"single"``.                                      |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``query``                   | N                               |                                  |                                                    |
+| ``query``                   | P                               |                                  | No support for dataframes with a row MultiIndex.   |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``radd``                    | P                               | ``level``                        |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+

@@ -3294,7 +3294,134 @@ def quantile():
 
     def query():
         """
-        Query the columns of a ``DataFrame`` with a boolean expression.
+        Query the columns of a DataFrame with a boolean expression.
+
+        Parameters
+        ----------
+        expr : str
+            The query string to evaluate.
+
+            You can refer to variables
+            in the environment by prefixing them with an '@' character like
+            ``@a + b``.
+
+            You can refer to column names that are not valid Python variable names
+            by surrounding them in backticks. Thus, column names containing spaces
+            or punctuation (besides underscores) or starting with digits must be
+            surrounded by backticks. (For example, a column named "Area (cm^2)" would
+            be referenced as ```Area (cm^2)```). Column names which are Python keywords
+            (like "if", "for", "import", etc.) cannot be used.
+
+            For example, if one of your columns is called ``a a`` and you want
+            to sum it with ``b``, your query should be ```a a` + b``.
+
+        inplace : bool
+            Whether to modify the DataFrame rather than creating a new one.
+
+        **kwargs
+            See the documentation for :func:`eval` for complete details
+            on the keyword arguments accepted by :meth:`DataFrame.query`.
+
+        Returns
+        -------
+        DataFrame or None
+            DataFrame resulting from the provided query expression or
+            None if ``inplace=True``.
+
+        See Also
+        --------
+        eval : Evaluate a string describing operations on
+            DataFrame columns.
+        DataFrame.eval : Evaluate a string describing operations on
+            DataFrame columns.
+
+        Notes
+        -----
+        The result of the evaluation of this expression is first passed to
+        :attr:`DataFrame.loc` and if that fails because of a
+        multidimensional key (e.g., a DataFrame) then the result will be passed
+        to :meth:`DataFrame.__getitem__`.
+
+        This method uses the top-level :func:`eval` function to
+        evaluate the passed query.
+
+        The :meth:`query` method uses a slightly
+        modified Python syntax by default. For example, the ``&`` and ``|``
+        (bitwise) operators have the precedence of their boolean cousins,
+        :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python;
+        however, the semantics are different.
+
+        You can change the semantics of the expression by passing the keyword
+        argument ``parser='python'``. This enforces the same semantics as
+        evaluation in Python space.
+
+        The :attr:`DataFrame.index` and
+        :attr:`DataFrame.columns` attributes of the
+        :class:`DataFrame` instance are placed in the query namespace
+        by default, which allows you to treat both the index and columns of the
+        frame as a column in the frame.
+        The identifier ``index`` is used for the frame index; you can also
+        use the name of the index to identify it in a query. Please note that
+        Python keywords may not be used as identifiers.
+
+        *Backtick quoted variables*
+
+        Backtick quoted variables are parsed as literal Python code and
+        are converted internally to a Python valid identifier.
+        This can lead to the following problems.
+
+        During parsing a number of disallowed characters inside the backtick
+        quoted string are replaced by strings that are allowed as a Python identifier.
+        These characters include all operators in Python, the space character, the
+        question mark, the exclamation mark, the dollar sign, and the euro sign.
+        For other characters that fall outside the ASCII range (U+0001..U+007F)
+        and those that are not further specified in PEP 3131,
+        the query parser will raise an error.
+        Also disallowed are whitespace characters other than the space character,
+        the hash character (as it is used for comments), and the backtick
+        itself (a backtick cannot be escaped).
+
+        In a special case, quotes that make a pair around a backtick can
+        confuse the parser.
+        For example, ```it's` > `that's``` will raise an error,
+        as it forms a quoted string (``'s > `that'``) with a backtick inside.
+
+        See also the Python documentation about lexical analysis
+        (https://docs.python.org/3/reference/lexical_analysis.html).
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'A': range(1, 6),
+        ...                    'B': range(10, 0, -2),
+        ...                    'C C': range(10, 5, -1)})
+        >>> df
+           A   B  C C
+        0  1  10   10
+        1  2   8    9
+        2  3   6    8
+        3  4   4    7
+        4  5   2    6
+        >>> df.query('A > B')
+           A  B  C C
+        4  5  2    6
+
+        The previous expression is equivalent to
+
+        >>> df[df.A > df.B]
+           A  B  C C
+        4  5  2    6
+
+        For columns with spaces in their name, you can use backtick quoting.
+
+        >>> df.query('B == `C C`')
+           A   B  C C
+        0  1  10   10
+
+        The previous expression is equivalent to
+
+        >>> df[df.B == df['C C']]
+           A   B  C C
+        0  1  10   10
         """
 
     def rename():

@@ -101,6 +101,7 @@
     replace_external_data_keys_with_empty_pandas_series,
     replace_external_data_keys_with_query_compiler,
     try_convert_index_to_native,
+    update_eval_and_query_engine_kwarg_and_maybe_warn,
 )
 from snowflake.snowpark.modin.plugin.utils.error_message import (
     ErrorMessage,
@@ -128,6 +129,19 @@
 )
 
 
+# eval() and query() let the user reference variables according to dynamic
+# scope at `level` stack frames below the stack frame that called them. If
+# a snowflake override function here wraps an eval/query implementation
+# that's in another function, the implementation function frame is 4 frames
+# above the override function frame:
+# 1) query_compiler_caster wrapper dispatches to snowflake implementation
+# 2) telemetry wrapper 1
+# 3) telemetry wrapper 2 calls the snowflake implementation.
+# 4) The snowflake implementation calls the implementation function.
+# Therefore, we add 4 to the `level` param.
+EVAL_QUERY_EXTRA_STACK_LEVELS = 4
+
+
 register_dataframe_accessor = functools.partial(
     _register_dataframe_accessor, backend="Snowflake"
 )
@@ -245,32 +259,9 @@ def eval(self, expr, inplace=False, **kwargs):  # noqa: PR01, RT01, D200
 
     inplace = validate_bool_kwarg(inplace, "inplace")
 
-    # numexpr engine is useful for chained operations on numpy-backed
-    # arrays. It doesn't support all the syntax that the python engine
-    # does, and the Snowpark backend doesn't store data in numpy, so the
-    # numexpr performance optimizations are not useful. Ignore the "engine"
-    # requirement, and warn the user that if they explicitly select
-    # engine="numexpr", we will not honor their preference.
-    if kwargs.get("engine", None) == "numexpr":
-        WarningMessage.ignored_argument(
-            operation="eval",
-            argument="engine",
-            message="Snowpark pandas always uses the python engine in "
-            + "favor of the numexpr engine, even if the numexpr engine is "
-            + "available",
-        )
-    kwargs["engine"] = "python"
-
-    # eval() lets the user reference variables according to dynamic scope
-    # at `level` stack frames below the stack frame that called eval(). The
-    # eval() implementation is 4 stack frames above the frame where we execute
-    # the _eval() implementation:
-    # 1) query_compiler_caster wrapper dispatches to snowflake implementation
-    # 2) telemetry wrapper 1
-    # 3) telemetry wrapper 2 calls this implementation.
-    # 4) This method implementation calls the _eval() implementation
-    # so we add 4 to the `level` param.
-    kwargs["level"] = kwargs.get("level", 0) + 4
+    update_eval_and_query_engine_kwarg_and_maybe_warn(kwargs)
+
+    kwargs["level"] = kwargs.get("level", 0) + EVAL_QUERY_EXTRA_STACK_LEVELS
 
     index_resolvers = self._get_index_resolvers()
     column_resolvers = self._get_cleaned_column_resolvers()
@@ -327,9 +318,39 @@ def prod(
 register_dataframe_accessor("product")(prod)
 
 
-@register_dataframe_not_implemented()
-def query(self, expr, inplace=False, **kwargs):  # noqa: PR01, RT01, D200
-    pass  # pragma: no cover
+@register_dataframe_accessor("query")
+def query(self, expr, inplace=False, **kwargs):
+    # Snowflake implementation of DataFrame.query(): evaluate `expr` with
+    # DataFrame.eval() and use the result as a key to select from this frame.
+    if self._query_compiler.nlevels() > 1:
+        # If the rows of this dataframe have a multi-index, we store the index
+        # as a native_pd.MultiIndex, and the usual method of getting index
+        # resolvers with _get_index_resolvers() does not work.
+        ErrorMessage.not_implemented("query() does not support a multi-level index.")
+
+    inplace = validate_bool_kwarg(inplace, "inplace")
+
+    update_eval_and_query_engine_kwarg_and_maybe_warn(kwargs)
+
+    # Always evaluate without an assignment target, regardless of `inplace`
+    # and of any `target` the caller passed; this matches what pandas does in
+    # DataFrame.query(). We have to explicitly set target=None to get the
+    # correct error for a multi-line query, and a query expression is not
+    # supposed to assign into the frame. `inplace` is applied only after the
+    # selection below.
+    kwargs["target"] = None
+
+    # eval() is called here on the user's behalf, so shift `level` by
+    # EVAL_QUERY_EXTRA_STACK_LEVELS to keep variable lookup anchored at the
+    # frame that called query(). Keep this call directly in this function:
+    # moving it into a helper would add a stack frame and break the offset.
+    key = self.eval(
+        expr,
+        inplace=False,
+        **(kwargs | {"level": kwargs.get("level", 0) + EVAL_QUERY_EXTRA_STACK_LEVELS}),
+    )
+
+    try:
+        result = self.loc[key]
+    except ValueError:
+        # when key is multi-dimensional, loc raises an error, but that is
+        # sometimes a valid query.
+        result = self[key]
+
+    return self._create_or_update_from_compiler(result._query_compiler, inplace=inplace)
 
 
 @register_dataframe_not_implemented()

@@ -13,6 +13,7 @@
 
 import modin.pandas as pd
 import numpy as np
+from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage
 import pandas
 from modin.core.storage_formats import BaseQueryCompiler  # pragma: no cover
 from pandas._libs import lib
@@ -659,3 +660,27 @@ def register_non_snowflake_accessors(
                     backend=backend, method=method, object_type=object_type
                 )
             )
+
+
+def update_eval_and_query_engine_kwarg_and_maybe_warn(kwargs: dict[str, Any]) -> None:
+    """
+    Force the ``engine`` kwarg to ``"python"``, warning if numexpr was requested.
+
+    The dictionary is modified in place.
+
+    Args:
+        kwargs: The keyword arguments to eval() or query().
+    """
+    # The numexpr engine targets chained operations on numpy-backed arrays.
+    # It supports less syntax than the python engine, and the Snowpark
+    # backend doesn't store data in numpy, so its performance optimizations
+    # are not useful here. The "engine" requirement is therefore ignored; a
+    # user who explicitly selected engine="numexpr" is told that the
+    # preference will not be honored.
+    requested_engine = kwargs.get("engine")
+    if requested_engine == "numexpr":
+        ignored_reason = (
+            "Snowpark pandas always uses the python engine in "
+            "favor of the numexpr engine, even if the numexpr engine is "
+            "available"
+        )
+        WarningMessage.ignored_argument(
+            operation="eval",
+            argument="engine",
+            message=ignored_reason,
+        )
+    kwargs["engine"] = "python"
@@ -0,0 +1,3 @@
+#
+# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
+#
@@ -0,0 +1,92 @@
+#
+# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
+#
+
+import pytest
+import pandas as native_pd
+from tests.integ.modin.utils import (
+    create_test_dfs,
+)
+
+
+@pytest.fixture
+def test_dfs():
+    """Pass a frame with a string row index and a spaced column name to ``create_test_dfs``."""
+    market_segments = [
+        "AUTOMOBILE",
+        "AUTOMOBILE",
+        "FURNITURE",
+        "AUTOMOBILE",
+        "FURNITURE",
+        "MACHINERY",
+        "HOUSEHOLD",
+    ]
+    # "PURCHASE COUNT" contains a space so that queries must backtick-quote it.
+    native_df = native_pd.DataFrame(
+        {
+            "CUSTOMER_KEY": [-10, -10, -5, 30, 0, 1, 25],
+            "ACCOUNT_BALANCE": [-101, -101, -51, 30, 0, 53, 105],
+            "MARKET_SEGMENT": market_segments,
+            "PURCHASE COUNT": [1, 0, 5, 7, 9, 10, 0],
+        },
+        index=[f"i{row}" for row in range(7)],
+    )
+    return create_test_dfs(native_df)
+
+
+@pytest.fixture
+def test_dfs_with_named_index():
+    """Pass a frame whose row index is named ``index_name`` to ``create_test_dfs``."""
+    market_segments = [
+        "AUTOMOBILE",
+        "AUTOMOBILE",
+        "FURNITURE",
+        "AUTOMOBILE",
+        "FURNITURE",
+        "MACHINERY",
+        "HOUSEHOLD",
+    ]
+    named_index = native_pd.Index(
+        [f"i{row}" for row in range(7)], name="index_name"
+    )
+    native_df = native_pd.DataFrame(
+        {
+            "CUSTOMER_KEY": [-10, -10, -5, 30, 0, 1, 25],
+            "ACCOUNT_BALANCE": [-101, -101, -51, 30, 0, 53, 105],
+            "MARKET_SEGMENT": market_segments,
+        },
+        index=named_index,
+    )
+    return create_test_dfs(native_df)
+
+
+@pytest.fixture
+def test_dfs_multiindex():
+    """Pass a frame with a two-level named row MultiIndex to ``create_test_dfs``."""
+    market_segments = [
+        "AUTOMOBILE",
+        "AUTOMOBILE",
+        "FURNITURE",
+        "AUTOMOBILE",
+        "FURNITURE",
+        "MACHINERY",
+        "HOUSEHOLD",
+    ]
+    # Row labels are ("i00", "i01"), ("i10", "i11"), ..., ("i60", "i61").
+    row_index = native_pd.MultiIndex.from_tuples(
+        [(f"i{row}0", f"i{row}1") for row in range(7)],
+        names=["level_0_name", "level_1_name"],
+    )
+    native_df = native_pd.DataFrame(
+        {
+            "CUSTOMER_KEY": [-10, -10, -5, 30, 0, 1, 25],
+            "ACCOUNT_BALANCE": [-101, -101, -51, 30, 0, 53, 105],
+            "MARKET_SEGMENT": market_segments,
+        },
+        index=row_index,
+    )
+    return create_test_dfs(native_df)