Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@

#### New Features
- Added support for `DataFrame.query` for dataframes with single-level indexes.
- Added support for `DataFrameGroupBy.__len__` and `SeriesGroupBy.__len__`.

#### Improvements

Expand Down
2 changes: 2 additions & 0 deletions docs/source/modin/supported/groupby_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ Computations/descriptive stats
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``last`` | P | Does not support ``min_count`` parameter |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``__len__`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``max`` | Y | See ``count`` |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``mean`` | Y | See ``count`` |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1230,10 +1230,9 @@ def __getitem__(self, key):
)


@register_df_groupby_override("__len__")
@register_dataframe_groupby_accessor("__len__")
def __len__(self):
# TODO: SNOW-1063349: Modin upgrade - modin.pandas.groupby.DataFrameGroupBy functions
ErrorMessage.method_not_implemented_error(name="__len__", class_="GroupBy")
return self.ngroups


# expanding and rolling are unique cases and need to likely be handled
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,46 +6,53 @@
import numpy as np
import pandas as native_pd
import pytest
from pytest import param

import snowflake.snowpark.modin.plugin # noqa: F401
from tests.integ.modin.utils import eval_snowpark_pandas_result
from tests.integ.utils.sql_counter import sql_count_checker


def assert_ngroups_equal(snow_res, pd_res):
assert snow_res.ngroups == pd_res.ngroups
@pytest.fixture(
params=(
param(lambda groupby: groupby.ngroups, id="ngroups"),
param(lambda groupby: len(groupby), id="len"),
)
)
def count_function(request):
return request.param


@pytest.mark.parametrize("by", ["a", "b", ["a", "b"]])
@sql_count_checker(query_count=1)
def test_groupby_sort_multiindex_series(native_series_multi_numeric, by):
def test_series_with_multiindex(native_series_multi_numeric, by, count_function):

snow_ser = pd.Series(native_series_multi_numeric)
native_ser = native_series_multi_numeric
eval_snowpark_pandas_result(
snow_ser,
native_ser,
lambda ser: ser.groupby(by=by),
comparator=assert_ngroups_equal,
lambda ser: count_function(ser.groupby(by=by)),
comparator=int.__eq__,
)


@sql_count_checker(query_count=1)
def test_groupby_ngroups_series_nan():
def test_series_with_nan(count_function):
index = native_pd.Index(["a", "b", "b", "a"])
index.names = ["grp_col"]
native_ser = native_pd.Series([390.0, 350.0, np.nan, 20.0], index=index)
snow_ser = pd.Series(native_ser)
eval_snowpark_pandas_result(
snow_ser,
native_ser,
lambda ser: ser.groupby(by="grp_col"),
comparator=assert_ngroups_equal,
lambda ser: count_function(ser.groupby(by="grp_col")),
comparator=int.__eq__,
)


@sql_count_checker(query_count=1)
def test_groupby_ngroups_series_nan_all():
def test_all_nan_series(count_function):
index = native_pd.Index(["a", "b", "b", "a"])
index.names = ["grp_col"]

Expand All @@ -56,13 +63,13 @@ def test_groupby_ngroups_series_nan_all():
eval_snowpark_pandas_result(
snow_ser,
native_ser,
lambda ser: ser.groupby(by="grp_col"),
comparator=assert_ngroups_equal,
lambda ser: count_function(ser.groupby(by="grp_col")),
comparator=int.__eq__,
)


@sql_count_checker(query_count=1)
def test_groupby_ngroups_series():
def test_series_with_single_level_index(count_function):
index = native_pd.Index(["a", "b", "b", "a"])
index.names = ["grp_col"]

Expand All @@ -75,28 +82,28 @@ def test_groupby_ngroups_series():
eval_snowpark_pandas_result(
snow_ser,
native_ser,
lambda ser: ser.groupby(by="grp_col"),
comparator=assert_ngroups_equal,
lambda ser: count_function(ser.groupby(by="grp_col")),
comparator=int.__eq__,
)


@pytest.mark.parametrize("by", ["A", ["A", "B"]])
@sql_count_checker(query_count=1)
def test_groupby_ngroups(by):
def test_df(by, count_function):
native_df = native_pd.DataFrame({"A": list("aabbcccd"), "B": list("xxxxabcx")})
snow_df = pd.DataFrame(native_df)

eval_snowpark_pandas_result(
snow_df,
native_df,
lambda df: df.groupby(by=by),
comparator=assert_ngroups_equal,
lambda df: count_function(df.groupby(by=by)),
comparator=int.__eq__,
)


@pytest.mark.parametrize("by", ["c1", ["c1", "c2"], ["c1", "c2", "c1"]])
@sql_count_checker(query_count=1)
def test_groupby_ngroups_nan(by):
def test_df_with_nan(by, count_function):
native_df = native_pd.DataFrame(
{
"c1": [np.nan, 3, 4, 4, "b"],
Expand All @@ -109,22 +116,22 @@ def test_groupby_ngroups_nan(by):
eval_snowpark_pandas_result(
snow_df,
native_df,
lambda df: df.groupby(by=by),
comparator=assert_ngroups_equal,
lambda df: count_function(df.groupby(by=by)),
comparator=int.__eq__,
)


@pytest.mark.parametrize("by", ["A", ["A", "B"]])
@sql_count_checker(query_count=1)
def test_groupby_ngroups_empty_cols(by):
def test_df_with_0_rows(by, count_function):
native_df = native_pd.DataFrame({"A": [], "B": []})
snow_df = pd.DataFrame({"A": [], "B": []})

eval_snowpark_pandas_result(
snow_df,
native_df,
lambda df: df.groupby(by=by),
comparator=assert_ngroups_equal,
lambda df: count_function(df.groupby(by=by)),
comparator=int.__eq__,
)


Expand All @@ -133,10 +140,10 @@ def test_groupby_ngroups_empty_cols(by):
"level", [0, "B", [1, 1], [1, 0], ["A", "B"], [0, "A"], [-1, 0]]
)
@sql_count_checker(query_count=2)
def test_groupby_ngroups_multiindex(df_multi, level):
def test_df_with_multiindex(df_multi, level, count_function):
eval_snowpark_pandas_result(
df_multi,
df_multi.to_pandas(),
lambda df: df.groupby(level=level),
comparator=assert_ngroups_equal,
lambda df: count_function(df.groupby(level=level)),
comparator=int.__eq__,
)
2 changes: 0 additions & 2 deletions tests/unit/modin/test_groupby_unsupported.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
@pytest.mark.parametrize(
"func, func_name",
[
(lambda se: se.groupby("A").__len__(), "__len__"),
(lambda se: se.groupby("A").__bytes__(), "__bytes__"),
(lambda se: se.groupby("A").corrwith, "corrwith"),
(lambda se: se.groupby("A").dtypes, "dtypes"),
Expand Down Expand Up @@ -49,7 +48,6 @@ def test_series_groupby_unsupported_methods_raises(
@pytest.mark.parametrize(
"func, func_name",
[
(lambda df: df.groupby("A").__len__(), "__len__"),
(lambda df: df.groupby("A").__bytes__(), "__bytes__"),
(lambda df: df.groupby("A").corrwith, "corrwith"),
(lambda df: df.groupby("A").dtypes, "dtypes"),
Expand Down