From ab8e05c8270331bf7c191051ad5d1c1601adfd0c Mon Sep 17 00:00:00 2001 From: sfc-gh-mvashishtha Date: Thu, 25 Sep 2025 03:46:13 +0800 Subject: [PATCH 1/3] SNOW-2361885: Implement DataFrameGroupby.__len__() Signed-off-by: sfc-gh-mvashishtha --- CHANGELOG.md | 1 + .../modin/supported/groupby_supported.rst | 2 + .../extensions/dataframe_groupby_overrides.py | 4 +- ...ups.py => test_groupby_ngroups___len__.py} | 59 +++++++++++-------- 4 files changed, 37 insertions(+), 29 deletions(-) rename tests/integ/modin/groupby/{test_groupby_ngroups.py => test_groupby_ngroups___len__.py} (67%) diff --git a/CHANGELOG.md b/CHANGELOG.md index b0740db7fd..3eb6af0a3b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -61,6 +61,7 @@ #### New Features - Added support for `DataFrame.query` for dataframes with single-level indexes. +- Added support for `DataFrameGroupby.__len__` and `SeriesGroupBy.__len__`. #### Improvements diff --git a/docs/source/modin/supported/groupby_supported.rst b/docs/source/modin/supported/groupby_supported.rst index 12352ac984..fdcb311157 100644 --- a/docs/source/modin/supported/groupby_supported.rst +++ b/docs/source/modin/supported/groupby_supported.rst @@ -121,6 +121,8 @@ Computations/descriptive stats +-----------------------------+---------------------------------+----------------------------------------------------+ | ``last`` | P | Does not support ``min_count`` parameter | +-----------------------------+---------------------------------+----------------------------------------------------+ +| ``__len__`` | Y | | ++-----------------------------+---------------------------------+----------------------------------------------------+ | ``max`` | Y | See ``count`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``mean`` | Y | See ``count`` | diff --git a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_groupby_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_groupby_overrides.py index 5e43baa65b..de31269a1c 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_groupby_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_groupby_overrides.py @@ -1230,10 +1230,8 @@ def __getitem__(self, key): ) -@register_df_groupby_override("__len__") def __len__(self): - # TODO: SNOW-1063349: Modin upgrade - modin.pandas.groupby.DataFrameGroupBy functions - ErrorMessage.method_not_implemented_error(name="__len__", class_="GroupBy") + return self.ngroups # expanding and rolling are unique cases and need to likely be handled diff --git a/tests/integ/modin/groupby/test_groupby_ngroups.py b/tests/integ/modin/groupby/test_groupby_ngroups___len__.py similarity index 67% rename from tests/integ/modin/groupby/test_groupby_ngroups.py rename to tests/integ/modin/groupby/test_groupby_ngroups___len__.py index e1684bd851..a9d710081c 100644 --- a/tests/integ/modin/groupby/test_groupby_ngroups.py +++ b/tests/integ/modin/groupby/test_groupby_ngroups___len__.py @@ -6,32 +6,39 @@ import numpy as np import pandas as native_pd import pytest +from pytest import param import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.modin.utils import eval_snowpark_pandas_result from tests.integ.utils.sql_counter import sql_count_checker -def assert_ngroups_equal(snow_res, pd_res): - assert snow_res.ngroups == pd_res.ngroups +@pytest.fixture( + params=( + param(lambda groupby: groupby.ngroups, id="ngroups"), + param(lambda groupby: len(groupby), id="len"), + ) +) +def count_function(request): + return request.param @pytest.mark.parametrize("by", ["a", "b", ["a", "b"]]) @sql_count_checker(query_count=1) -def test_groupby_sort_multiindex_series(native_series_multi_numeric, by): +def test_series_with_multiindex(native_series_multi_numeric, by, count_function): snow_ser = pd.Series(native_series_multi_numeric) native_ser = native_series_multi_numeric eval_snowpark_pandas_result( snow_ser, native_ser, - lambda ser: ser.groupby(by=by), - comparator=assert_ngroups_equal, + lambda ser: count_function(ser.groupby(by=by)), + comparator=int.__eq__, ) @sql_count_checker(query_count=1) -def test_groupby_ngroups_series_nan(): +def test_series_with_nan(count_function): index = native_pd.Index(["a", "b", "b", "a"]) index.names = ["grp_col"] native_ser = native_pd.Series([390.0, 350.0, np.nan, 20.0], index=index) @@ -39,13 +46,13 @@ def test_groupby_ngroups_series_nan(): eval_snowpark_pandas_result( snow_ser, native_ser, - lambda ser: ser.groupby(by="grp_col"), - comparator=assert_ngroups_equal, + lambda ser: count_function(ser.groupby(by="grp_col")), + comparator=int.__eq__, ) @sql_count_checker(query_count=1) -def test_groupby_ngroups_series_nan_all(): +def test_all_nan_series(count_function): index = native_pd.Index(["a", "b", "b", "a"]) index.names = ["grp_col"] @@ -56,13 +63,13 @@ def test_groupby_ngroups_series_nan_all(): eval_snowpark_pandas_result( snow_ser, native_ser, - lambda ser: ser.groupby(by="grp_col"), - comparator=assert_ngroups_equal, + lambda ser: count_function(ser.groupby(by="grp_col")), + comparator=int.__eq__, ) @sql_count_checker(query_count=1) -def test_groupby_ngroups_series(): +def test_series_with_single_level_index(count_function): index = native_pd.Index(["a", "b", "b", "a"]) index.names = ["grp_col"] @@ -75,28 +82,28 @@ def test_groupby_ngroups_series(): eval_snowpark_pandas_result( snow_ser, native_ser, - lambda ser: ser.groupby(by="grp_col"), - comparator=assert_ngroups_equal, + lambda ser: count_function(ser.groupby(by="grp_col")), + comparator=int.__eq__, ) @pytest.mark.parametrize("by", ["A", ["A", "B"]]) @sql_count_checker(query_count=1) -def test_groupby_ngroups(by): +def test_df(by, count_function): native_df = native_pd.DataFrame({"A": list("aabbcccd"), "B": list("xxxxabcx")}) snow_df = pd.DataFrame(native_df) eval_snowpark_pandas_result( snow_df, native_df, - lambda df: df.groupby(by=by), - comparator=assert_ngroups_equal, + lambda df: count_function(df.groupby(by=by)), + comparator=int.__eq__, ) @pytest.mark.parametrize("by", ["c1", ["c1", "c2"], ["c1", "c2", "c1"]]) @sql_count_checker(query_count=1) -def test_groupby_ngroups_nan(by): +def test_df_with_nan(by, count_function): native_df = native_pd.DataFrame( { "c1": [np.nan, 3, 4, 4, "b"], @@ -109,22 +116,22 @@ def test_groupby_ngroups_nan(by): eval_snowpark_pandas_result( snow_df, native_df, - lambda df: df.groupby(by=by), - comparator=assert_ngroups_equal, + lambda df: count_function(df.groupby(by=by)), + comparator=int.__eq__, ) @pytest.mark.parametrize("by", ["A", ["A", "B"]]) @sql_count_checker(query_count=1) -def test_groupby_ngroups_empty_cols(by): +def test_df_with_0_rows(by, count_function): native_df = native_pd.DataFrame({"A": [], "B": []}) snow_df = pd.DataFrame({"A": [], "B": []}) eval_snowpark_pandas_result( snow_df, native_df, - lambda df: df.groupby(by=by), - comparator=assert_ngroups_equal, + lambda df: count_function(df.groupby(by=by)), + comparator=int.__eq__, ) @@ -133,10 +140,10 @@ def test_groupby_ngroups_empty_cols(by): "level", [0, "B", [1, 1], [1, 0], ["A", "B"], [0, "A"], [-1, 0]] ) @sql_count_checker(query_count=2) -def test_groupby_ngroups_multiindex(df_multi, level): +def test_df_with_multiindex(df_multi, level, count_function): eval_snowpark_pandas_result( df_multi, df_multi.to_pandas(), - lambda df: df.groupby(level=level), - comparator=assert_ngroups_equal, + lambda df: count_function(df.groupby(level=level)), + comparator=int.__eq__, ) From cb58e79b430de07cd38d420d6ba229900df895e1 Mon Sep 17 00:00:00 2001 From: sfc-gh-mvashishtha Date: Thu, 25 Sep 2025 05:22:08 +0800 Subject: [PATCH 2/3] SNOW-2361885: Implement DataFrameGroupby.__len__() Signed-off-by: sfc-gh-mvashishtha --- tests/unit/modin/test_groupby_unsupported.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit/modin/test_groupby_unsupported.py b/tests/unit/modin/test_groupby_unsupported.py index 42c70e1a28..f03f9163f0 100644 --- a/tests/unit/modin/test_groupby_unsupported.py +++ b/tests/unit/modin/test_groupby_unsupported.py @@ -8,7 +8,6 @@ @pytest.mark.parametrize( "func, func_name", [ - (lambda se: se.groupby("A").__len__(), "__len__"), (lambda se: se.groupby("A").__bytes__(), "__bytes__"), (lambda se: se.groupby("A").corrwith, "corrwith"), (lambda se: se.groupby("A").dtypes, "dtypes"), @@ -49,7 +48,6 @@ def test_series_groupby_unsupported_methods_raises( @pytest.mark.parametrize( "func, func_name", [ - (lambda df: df.groupby("A").__len__(), "__len__"), (lambda df: df.groupby("A").__bytes__(), "__bytes__"), (lambda df: df.groupby("A").corrwith, "corrwith"), (lambda df: df.groupby("A").dtypes, "dtypes"), From 0f3bc1347bfb503b4de2a50911859ed3a6e01df5 Mon Sep 17 00:00:00 2001 From: sfc-gh-mvashishtha Date: Thu, 25 Sep 2025 09:32:47 +0800 Subject: [PATCH 3/3] Really do the override so we don't rely on groupby.indices Signed-off-by: sfc-gh-mvashishtha --- .../modin/plugin/extensions/dataframe_groupby_overrides.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_groupby_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_groupby_overrides.py index de31269a1c..a97d391332 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_groupby_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_groupby_overrides.py @@ -1230,6 +1230,7 @@ def __getitem__(self, key): ) +@register_dataframe_groupby_accessor("__len__") def __len__(self): return self.ngroups