Skip to content

Commit 381a1e9

Browse files
feat(bigframes): Support unstable sort_values, sort_index
1 parent e22a1f1 commit 381a1e9

File tree

11 files changed

+96
-32
lines changed

11 files changed

+96
-32
lines changed

packages/bigframes/bigframes/core/array_value.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,11 +212,17 @@ def filter(self, predicate: ex.Expression):
212212
return arr.drop_columns(filter_ids)
213213

214214
def order_by(
215-
self, by: Sequence[OrderingExpression], is_total_order: bool = False
215+
self,
216+
by: Sequence[OrderingExpression],
217+
is_total_order: bool = False,
218+
stable: bool = True,
216219
) -> ArrayValue:
217220
return ArrayValue(
218221
nodes.OrderByNode(
219-
child=self.node, by=tuple(by), is_total_order=is_total_order
222+
child=self.node,
223+
by=tuple(by),
224+
is_total_order=is_total_order,
225+
stable=stable,
220226
)
221227
)
222228

packages/bigframes/bigframes/core/blocks.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -395,9 +395,10 @@ def cols_matching_label(self, partial_label: Label) -> typing.Sequence[str]:
395395
def order_by(
396396
self,
397397
by: typing.Sequence[ordering.OrderingExpression],
398+
stable: bool = True,
398399
) -> Block:
399400
return Block(
400-
self._expr.order_by(by),
401+
self._expr.order_by(by, stable=stable),
401402
index_columns=self.index_columns,
402403
column_labels=self.column_labels,
403404
index_labels=self.index.names,

packages/bigframes/bigframes/core/indexes/base.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -255,12 +255,6 @@ def query_job(self) -> bigquery.QueryJob:
255255
self._query_job = query_job
256256
return self._query_job
257257

258-
@property
259-
def str(self) -> bigframes.operations.strings.StringMethods:
260-
import bigframes.operations.strings
261-
262-
return bigframes.operations.strings.StringMethods(self)
263-
264258
def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]:
265259
"""Get integer location, slice or boolean mask for requested label.
266260
@@ -436,7 +430,8 @@ def sort_values(
436430
*,
437431
inplace: bool = False,
438432
ascending: bool = True,
439-
na_position: __builtins__.str = "last",
433+
kind: str | None = None,
434+
na_position: str = "last",
440435
) -> Index:
441436
if na_position not in ["first", "last"]:
442437
raise ValueError("Param na_position must be one of 'first' or 'last'")
@@ -448,7 +443,8 @@ def sort_values(
448443
else order.descending_over(column, na_last)
449444
for column in index_columns
450445
]
451-
return Index(self._block.order_by(ordering))
446+
is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"]
447+
return Index(self._block.order_by(ordering, stable=is_stable))
452448

453449
def astype(
454450
self,
@@ -840,6 +836,13 @@ def _apply_binary_op(
840836
else:
841837
return NotImplemented
842838

839+
# Defined last in the class body so that this property does not shadow the builtin `str` used in annotations above.
840+
@property
841+
def str(self) -> bigframes.operations.strings.StringMethods:
842+
import bigframes.operations.strings
843+
844+
return bigframes.operations.strings.StringMethods(self)
845+
843846

844847
def _should_create_datetime_index(block: blocks.Block) -> bool:
845848
if len(block.index.dtypes) != 1:

packages/bigframes/bigframes/core/nodes.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -991,7 +991,8 @@ def remap_refs(
991991
@dataclasses.dataclass(frozen=True, eq=False)
992992
class OrderByNode(UnaryNode):
993993
by: Tuple[OrderingExpression, ...]
994-
# This is an optimization, if true, can discard previous orderings.
994+
stable: bool = True
995+
# Optimization hint: if true, previous orderings can be discarded, even when doing a stable sort.
995996
# The ordering might still be a total ordering even if this is false.
996997
is_total_order: bool = False
997998

packages/bigframes/bigframes/core/rewrite/order.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,8 @@ def pull_up_order_inner(
7171
child_result, child_order = pull_up_order_inner(node.child)
7272
return child_result, child_order.with_reverse()
7373
elif isinstance(node, bigframes.core.nodes.OrderByNode):
74-
if node.is_total_order:
74+
# Unstable sorts ignore any previous order; total orders override the previous order.
75+
if (not node.stable) or node.is_total_order:
7576
new_node = remove_order(node.child)
7677
else:
7778
new_node, child_order = pull_up_order_inner(node.child)
@@ -106,6 +107,10 @@ def pull_up_order_inner(
106107
),
107108
)
108109
)
110+
elif not node.stable:
111+
new_order = bigframes.core.ordering.RowOrdering(
112+
ordering_value_columns=tuple(new_by),
113+
)
109114
else:
110115
assert child_order
111116
new_order = child_order.with_ordering_columns(new_by)

packages/bigframes/bigframes/dataframe.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2418,6 +2418,7 @@ def sort_index(
24182418
*,
24192419
ascending: bool = ...,
24202420
inplace: Literal[False] = ...,
2421+
kind: str = ...,
24212422
na_position: Literal["first", "last"] = ...,
24222423
) -> DataFrame: ...
24232424

@@ -2427,6 +2428,7 @@ def sort_index(
24272428
*,
24282429
ascending: bool = ...,
24292430
inplace: Literal[True] = ...,
2431+
kind: str = ...,
24302432
na_position: Literal["first", "last"] = ...,
24312433
) -> None: ...
24322434

@@ -2436,6 +2438,7 @@ def sort_index(
24362438
axis: Union[int, str] = 0,
24372439
ascending: bool = True,
24382440
inplace: bool = False,
2441+
kind: str | None = None,
24392442
na_position: Literal["first", "last"] = "last",
24402443
) -> Optional[DataFrame]:
24412444
if utils.get_axis_number(axis) == 0:
@@ -2449,7 +2452,8 @@ def sort_index(
24492452
else order.descending_over(column, na_last)
24502453
for column in index_columns
24512454
]
2452-
block = self._block.order_by(ordering)
2455+
is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"]
2456+
block = self._block.order_by(ordering, stable=is_stable)
24532457
else: # axis=1
24542458
_, indexer = self.columns.sort_values(
24552459
return_indexer=True,
@@ -2472,7 +2476,7 @@ def sort_values(
24722476
*,
24732477
inplace: Literal[False] = ...,
24742478
ascending: bool | typing.Sequence[bool] = ...,
2475-
kind: str = ...,
2479+
kind: str | None = None,
24762480
na_position: typing.Literal["first", "last"] = ...,
24772481
) -> DataFrame: ...
24782482

@@ -2483,7 +2487,7 @@ def sort_values(
24832487
*,
24842488
inplace: Literal[True] = ...,
24852489
ascending: bool | typing.Sequence[bool] = ...,
2486-
kind: str = ...,
2490+
kind: str | None = None,
24872491
na_position: typing.Literal["first", "last"] = ...,
24882492
) -> None: ...
24892493

@@ -2493,7 +2497,7 @@ def sort_values(
24932497
*,
24942498
inplace: bool = False,
24952499
ascending: bool | typing.Sequence[bool] = True,
2496-
kind: str = "quicksort",
2500+
kind: str | None = None,
24972501
na_position: typing.Literal["first", "last"] = "last",
24982502
) -> Optional[DataFrame]:
24992503
if isinstance(by, (bigframes.series.Series, indexes.Index, DataFrame)):
@@ -2525,7 +2529,8 @@ def sort_values(
25252529
if is_ascending
25262530
else order.descending_over(column_id, na_last)
25272531
)
2528-
block = self._block.order_by(ordering)
2532+
is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"]
2533+
block = self._block.order_by(ordering, stable=is_stable)
25292534
if inplace:
25302535
self._set_block(block)
25312536
return None

packages/bigframes/bigframes/series.py

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1769,7 +1769,7 @@ def sort_values(
17691769
axis=...,
17701770
inplace: Literal[True] = ...,
17711771
ascending: bool | typing.Sequence[bool] = ...,
1772-
kind: str = ...,
1772+
kind: str | None = ...,
17731773
na_position: typing.Literal["first", "last"] = ...,
17741774
) -> None: ...
17751775

@@ -1780,7 +1780,7 @@ def sort_values(
17801780
axis=...,
17811781
inplace: Literal[False] = ...,
17821782
ascending: bool | typing.Sequence[bool] = ...,
1783-
kind: str = ...,
1783+
kind: str | None = ...,
17841784
na_position: typing.Literal["first", "last"] = ...,
17851785
) -> Series: ...
17861786

@@ -1790,19 +1790,21 @@ def sort_values(
17901790
axis=0,
17911791
inplace: bool = False,
17921792
ascending=True,
1793-
kind: str = "quicksort",
1793+
kind: str | None = None,
17941794
na_position: typing.Literal["first", "last"] = "last",
17951795
) -> Optional[Series]:
17961796
if axis != 0 and axis != "index":
17971797
raise ValueError(f"No axis named {axis} for object type Series")
17981798
if na_position not in ["first", "last"]:
17991799
raise ValueError("Param na_position must be one of 'first' or 'last'")
1800+
is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"]
18001801
block = self._block.order_by(
18011802
[
18021803
order.ascending_over(self._value_column, (na_position == "last"))
18031804
if ascending
18041805
else order.descending_over(self._value_column, (na_position == "last"))
18051806
],
1807+
stable=is_stable,
18061808
)
18071809
if inplace:
18081810
self._set_block(block)
@@ -1812,17 +1814,37 @@ def sort_values(
18121814

18131815
@typing.overload # type: ignore[override]
18141816
def sort_index(
1815-
self, *, axis=..., inplace: Literal[False] = ..., ascending=..., na_position=...
1816-
) -> Series: ...
1817+
self,
1818+
*,
1819+
axis=...,
1820+
inplace: Literal[False] = ...,
1821+
ascending=...,
1822+
kind: str | None = ...,
1823+
na_position=...,
1824+
) -> Series:
1825+
...
18171826

18181827
@typing.overload
18191828
def sort_index(
1820-
self, *, axis=0, inplace: Literal[True] = ..., ascending=..., na_position=...
1821-
) -> None: ...
1829+
self,
1830+
*,
1831+
axis=0,
1832+
inplace: Literal[True] = ...,
1833+
ascending=...,
1834+
kind: str | None = ...,
1835+
na_position=...,
1836+
) -> None:
1837+
...
18221838

18231839
@validations.requires_index
18241840
def sort_index(
1825-
self, *, axis=0, inplace: bool = False, ascending=True, na_position="last"
1841+
self,
1842+
*,
1843+
axis=0,
1844+
inplace: bool = False,
1845+
ascending=True,
1846+
kind: str | None = None,
1847+
na_position="last",
18261848
) -> Optional[Series]:
18271849
# TODO(tbergeron): Support level parameter once multi-index introduced.
18281850
if axis != 0 and axis != "index":
@@ -1837,7 +1859,8 @@ def sort_index(
18371859
else order.descending_over(column, na_last)
18381860
for column in block.index_columns
18391861
]
1840-
block = block.order_by(ordering)
1862+
is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"]
1863+
block = block.order_by(ordering, stable=is_stable)
18411864
if inplace:
18421865
self._set_block(block)
18431866
return None

packages/bigframes/third_party/bigframes_vendored/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,3 +55,5 @@
5555
"_deferred",
5656
]
5757
VALID_WRITE_ENGINES = typing.get_args(WriteEngineType)
58+
59+
DEFAULT_SORT_KIND = "stable"

packages/bigframes/third_party/bigframes_vendored/pandas/core/frame.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2253,7 +2253,7 @@ def sort_values(
22532253
*,
22542254
inplace: bool = False,
22552255
ascending: bool | Sequence[bool] = True,
2256-
kind: str = "quicksort",
2256+
kind: str | None = None,
22572257
na_position: Literal["first", "last"] = "last",
22582258
):
22592259
"""Sort by the values along row axis.
@@ -2339,7 +2339,7 @@ def sort_values(
23392339
the by.
23402340
inplace (bool, default False):
23412341
If True, perform operation in-place.
2342-
kind (str, default 'quicksort'):
2342+
kind (str, default None):
23432343
Choice of sorting algorithm. Accepts 'quicksort', 'mergesort',
23442344
'heapsort', 'stable'. Ignored except when determining whether to
23452345
sort stably. 'mergesort' or 'stable' will result in a stable reorder. If None, a stable sort is used.
@@ -2363,6 +2363,7 @@ def sort_index(
23632363
axis: str | int = 0,
23642364
ascending: bool = True,
23652365
inplace: bool = False,
2366+
kind: str | None = None,
23662367
na_position: Literal["first", "last"] = "last",
23672368
):
23682369
"""Sort object by labels (along an axis).
@@ -2375,6 +2376,10 @@ def sort_index(
23752376
Sort ascending vs. descending.
23762377
inplace (bool, default False):
23772378
Whether to modify the DataFrame rather than creating a new one.
2379+
kind (str, default None):
2380+
Choice of sorting algorithm. Accepts 'quicksort', 'mergesort',
2381+
'heapsort', 'stable'. Ignored except when determining whether to
2382+
sort stably. 'mergesort' or 'stable' will result in a stable reorder. If None, a stable sort is used.
23782383
na_position ({'first', 'last'}, default 'last'):
23792384
Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
23802385
Not implemented for MultiIndex.

packages/bigframes/third_party/bigframes_vendored/pandas/core/indexes/base.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -828,7 +828,11 @@ def nunique(self) -> int:
828828
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
829829

830830
def sort_values(
831-
self, *, ascending: bool = True, na_position: str = "last"
831+
self,
832+
*,
833+
ascending: bool = True,
834+
kind: str | None = None,
835+
na_position: str = "last",
832836
) -> Index:
833837
"""
834838
Return a sorted copy of the index.
@@ -851,6 +855,10 @@ def sort_values(
851855
Args:
852856
ascending (bool, default True):
853857
Should the index values be sorted in an ascending order.
858+
kind (str, default None):
859+
Choice of sorting algorithm. Accepts 'quicksort', 'mergesort',
860+
'heapsort', 'stable'. Ignored except when determining whether to
861+
sort stably. 'mergesort' or 'stable' will result in a stable reorder. If None, a stable sort is used.
854862
na_position ({'first' or 'last'}, default 'last'):
855863
Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at
856864
the end.

0 commit comments

Comments
 (0)