feat: add idxmin, idxmax to series, dataframe (#74)

TrevorBergeron · gcf-owl-bot[bot] · web-flow · commit 781307ec22d3 · 2023-10-03T16:12:52.000-07:00
* feat: add idxmin, idxmax to series, dataframe * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py
@@ -17,6 +17,7 @@
 
 import pandas as pd
 
+import bigframes.constants as constants
 import bigframes.core as core
 import bigframes.core.blocks as blocks
 import bigframes.core.ordering as ordering
@@ -576,3 +577,53 @@ def align_columns(
     left_final = left_block.select_columns(left_column_ids)
     right_final = right_block.select_columns(right_column_ids)
     return left_final, right_final
+
+
+def idxmin(block: blocks.Block) -> blocks.Block:
+    """Delegate to ``_idx_extrema`` in "min" mode for the given block."""
+    return _idx_extrema(block, min_or_max="min")
+
+
+def idxmax(block: blocks.Block) -> blocks.Block:
+    """Delegate to ``_idx_extrema`` in "max" mode for the given block."""
+    return _idx_extrema(block, min_or_max="max")
+
+
+def _idx_extrema(
+    block: blocks.Block, min_or_max: typing.Literal["min", "max"]
+) -> blocks.Block:
+    """Locate the index value of the extreme entry of every value column.
+
+    For each value column, a window op takes the first index value after
+    ordering by that column (ascending for "min", descending for "max") and
+    then by the index column. The per-column results are relabeled with the
+    original column labels and stacked into a single column that is labeled
+    with the original index name.
+
+    Args:
+        block: Block to search; it must have exactly one index column.
+        min_or_max: "min" to locate minima, "max" to locate maxima.
+
+    Returns:
+        The stacked single-column block produced by ``aggregate_all_and_stack``.
+
+    Raises:
+        NotImplementedError: If ``block`` does not have exactly one index
+            column.
+    """
+    if len(block.index_columns) != 1:
+        # TODO: Need support for tuple dtype
+        raise NotImplementedError(
+            f"idx{min_or_max} not supported for multi-index. {constants.FEEDBACK_LINK}"
+        )
+
+    original_block = block
+    # The guard above guarantees there is exactly one index column.
+    idx_col = original_block.index_columns[0]
+    # Loop-invariant: ascending order surfaces the min, descending the max.
+    direction = (
+        ordering.OrderingDirection.ASC
+        if min_or_max == "min"
+        else ordering.OrderingDirection.DESC
+    )
+    result_cols = []
+    for value_col in original_block.value_columns:
+        # Order by the value first, then by the index column to break ties.
+        order_refs = [
+            ordering.OrderingColumnReference(value_col, direction),
+            ordering.OrderingColumnReference(idx_col),
+        ]
+        window_spec = core.WindowSpec(ordering=order_refs)
+        block, result_col = block.apply_window_op(
+            idx_col, agg_ops.first_op, window_spec
+        )
+        result_cols.append(result_col)
+
+    block = block.select_columns(result_cols).with_column_labels(
+        original_block.column_labels
+    )
+    # Stack the entire column axis to produce single-column result
+    # Assumption: uniform dtype for stackability
+    return block.aggregate_all_and_stack(
+        agg_ops.AnyValueOp(), dtype=block.dtypes[0]
+    ).with_column_labels([original_block.index.name])
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -1642,6 +1642,12 @@ def agg(
 
     aggregate = agg
 
+    def idxmin(self) -> bigframes.series.Series:
+        # The block-level helper does the work; wrap its result as a Series.
+        result_block = block_ops.idxmin(self._block)
+        return bigframes.series.Series(result_block)
+
+    def idxmax(self) -> bigframes.series.Series:
+        # The block-level helper does the work; wrap its result as a Series.
+        result_block = block_ops.idxmax(self._block)
+        return bigframes.series.Series(result_block)
+
     def describe(self) -> DataFrame:
         df_numeric = self._drop_non_numeric(keep_bool=False)
         if len(df_numeric.columns) == 0:
diff --git a/bigframes/series.py b/bigframes/series.py
@@ -887,6 +887,34 @@ def argmin(self) -> int:
             scalars.Scalar, Series(block.select_column(row_nums)).iloc[0]
         )
 
+    def idxmax(self) -> blocks.Label:
+        # Sort with the value column descending and the index columns as
+        # tie-breakers, then read the index label of the leading row.
+        value_ordering = OrderingColumnReference(
+            self._value_column, direction=OrderingDirection.DESC
+        )
+        index_orderings = [
+            OrderingColumnReference(col_id) for col_id in self._block.index_columns
+        ]
+        sorted_block = self._block.order_by([value_ordering, *index_orderings])
+        first_row = sorted_block.slice(0, 1)
+        return indexes.Index._from_block(first_row).to_pandas()[0]
+
+    def idxmin(self) -> blocks.Label:
+        # Sort with the value column first (default direction) and the index
+        # columns as tie-breakers, then read the index label of the leading row.
+        value_ordering = OrderingColumnReference(self._value_column)
+        index_orderings = [
+            OrderingColumnReference(col_id) for col_id in self._block.index_columns
+        ]
+        sorted_block = self._block.order_by([value_ordering, *index_orderings])
+        first_row = sorted_block.slice(0, 1)
+        return indexes.Index._from_block(first_row).to_pandas()[0]
+
     @property
     def is_monotonic_increasing(self) -> bool:
         return typing.cast(
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -1292,6 +1292,34 @@ def test_df_update(overwrite, filter_func):
     pd.testing.assert_frame_equal(bf_df1.to_pandas(), pd_df1)
 
 
+def test_df_idxmin():
+    # Column "b" holds a null and column "c" is all ties.
+    data = {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}
+    pd_df = pd.DataFrame(data, index=["x", "y", "z"])
+    bf_df = dataframe.DataFrame(pd_df)
+
+    actual = bf_df.idxmin().to_pandas()
+    expected = pd_df.idxmin()
+
+    pd.testing.assert_series_equal(
+        actual, expected, check_index_type=False, check_dtype=False
+    )
+
+
+def test_df_idxmax():
+    # Column "b" holds a null and column "c" is all ties.
+    data = {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}
+    pd_df = pd.DataFrame(data, index=["x", "y", "z"])
+    bf_df = dataframe.DataFrame(pd_df)
+
+    actual = bf_df.idxmax().to_pandas()
+    expected = pd_df.idxmax()
+
+    pd.testing.assert_series_equal(
+        actual, expected, check_index_type=False, check_dtype=False
+    )
+
+
 @pytest.mark.parametrize(
     ("join", "axis"),
     [
diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py
@@ -41,6 +41,17 @@ def test_reset_multi_index(scalars_df_index, scalars_pandas_df_index):
     pandas.testing.assert_frame_equal(bf_result, pd_result)
 
 
+def test_series_multi_index_idxmin(scalars_df_index, scalars_pandas_df_index):
+    # Series.idxmin is exercised on a two-level index.
+    index_cols = ("bool_col", "int64_too")
+
+    bf_series = scalars_df_index.set_index(list(index_cols))["float64_col"]
+    bf_result = bf_series.idxmin()
+
+    pd_series = scalars_pandas_df_index.set_index(list(index_cols))["float64_col"]
+    pd_result = pd_series.idxmin()
+
+    assert bf_result == pd_result
+
+
 def test_binop_series_series_matching_multi_indices(
     scalars_df_index, scalars_pandas_df_index
 ):
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
@@ -2468,6 +2468,18 @@ def test_argmax(scalars_df_index, scalars_pandas_df_index):
     assert bf_result == pd_result
 
 
+def test_series_idxmin(scalars_df_index, scalars_pandas_df_index):
+    # Compare the bigframes label against the pandas label on a string column.
+    actual = scalars_df_index.string_col.idxmin()
+    expected = scalars_pandas_df_index.string_col.idxmin()
+    assert actual == expected
+
+
+def test_series_idxmax(scalars_df_index, scalars_pandas_df_index):
+    # Compare the bigframes label against the pandas label on an integer column.
+    actual = scalars_df_index.int64_too.idxmax()
+    expected = scalars_pandas_df_index.int64_too.idxmax()
+    assert actual == expected
+
+
 def test_getattr_attribute_error_when_pandas_has(scalars_df_index):
     # asof is implemented in pandas but not in bigframes
     with pytest.raises(AttributeError):
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -1805,6 +1805,28 @@ def nsmallest(self, n: int, columns, keep: str = "first"):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def idxmin(self):
+        """
+        Return index of first occurrence of minimum for each column.
+
+        NA/null values are excluded.
+
+        This signature takes no ``axis`` argument.
+
+        Returns:
+            Series: Indexes of minima, one entry per column.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    def idxmax(self):
+        """
+        Return index of first occurrence of maximum for each column.
+
+        NA/null values are excluded.
+
+        This signature takes no ``axis`` argument.
+
+        Returns:
+            Series: Indexes of maxima, one entry per column.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def nunique(self):
         """
         Count number of distinct elements in specified axis.
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
@@ -8,7 +8,6 @@
 import numpy as np
 from pandas._libs import lib
 from pandas._typing import Axis, FilePath, NaPosition, WriteBuffer
-import pandas.io.formats.format as fmt
 
 from bigframes import constants
 from third_party.bigframes_vendored.pandas.core.generic import NDFrame
@@ -151,21 +150,6 @@ def to_string(
             str or None: String representation of Series if ``buf=None``,
                 otherwise None.
         """
-        formatter = fmt.SeriesFormatter(
-            self,
-            name=name,
-            length=length,
-            header=header,
-            index=index,
-            dtype=dtype,
-            na_rep=na_rep,
-            float_format=float_format,
-            min_rows=min_rows,
-            max_rows=max_rows,
-        )
-        result = formatter.to_string()
-
-        # catch contract violations
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
     def to_markdown(
@@ -475,6 +459,30 @@ def duplicated(self, keep="first") -> Series:
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def idxmin(self) -> Hashable:
+        """
+        Return the row label of the minimum value.
+
+        If multiple values equal the minimum, the first row label with that
+        value is returned.
+
+        Returns:
+            Hashable: Label of the minimum value.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    def idxmax(self) -> Hashable:
+        """
+        Return the row label of the maximum value.
+
+        If multiple values equal the maximum, the first row label with that
+        value is returned.
+
+        Returns:
+            Hashable: Label of the maximum value.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def round(self, decimals: int = 0) -> Series:
         """
         Round each value in a Series to the given number of decimals.