Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions packages/bigframes/bigframes/core/array_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,11 +212,17 @@ def filter(self, predicate: ex.Expression):
return arr.drop_columns(filter_ids)

def order_by(
self, by: Sequence[OrderingExpression], is_total_order: bool = False
self,
by: Sequence[OrderingExpression],
is_total_order: bool = False,
stable: bool = True,
) -> ArrayValue:
return ArrayValue(
nodes.OrderByNode(
child=self.node, by=tuple(by), is_total_order=is_total_order
child=self.node,
by=tuple(by),
is_total_order=is_total_order,
stable=stable,
)
)

Expand Down
31 changes: 14 additions & 17 deletions packages/bigframes/bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,9 +395,10 @@ def cols_matching_label(self, partial_label: Label) -> typing.Sequence[str]:
def order_by(
self,
by: typing.Sequence[ordering.OrderingExpression],
stable: bool = True,
) -> Block:
return Block(
self._expr.order_by(by),
self._expr.order_by(by, stable=stable),
index_columns=self.index_columns,
column_labels=self.column_labels,
index_labels=self.index.names,
Expand Down Expand Up @@ -2412,13 +2413,13 @@ def _align_both_axes(
rcol_indexer if (rcol_indexer is not None) else range(len(columns))
)

left_input_lookup = (
lambda index: ex.deref(get_column_left[self.value_columns[index]])
left_input_lookup = lambda index: (
ex.deref(get_column_left[self.value_columns[index]])
if index != -1
else ex.const(None)
)
righ_input_lookup = (
lambda index: ex.deref(get_column_right[other.value_columns[index]])
righ_input_lookup = lambda index: (
ex.deref(get_column_right[other.value_columns[index]])
if index != -1
else ex.const(None)
)
Expand Down Expand Up @@ -2471,15 +2472,13 @@ def _align_series_block_axis_1(
rcol_indexer if (rcol_indexer is not None) else range(len(columns))
)

left_input_lookup = (
lambda index: ex.deref(get_column_left[self.value_columns[index]])
left_input_lookup = lambda index: (
ex.deref(get_column_left[self.value_columns[index]])
if index != -1
else ex.const(None)
)
righ_input_lookup = (
lambda index: ex.deref(
get_column_right[other.transpose().value_columns[index]]
)
righ_input_lookup = lambda index: (
ex.deref(get_column_right[other.transpose().value_columns[index]])
if index != -1
else ex.const(None)
)
Expand All @@ -2506,13 +2505,11 @@ def _align_pd_series_axis_1(
rcol_indexer if (rcol_indexer is not None) else range(len(columns))
)

left_input_lookup = (
lambda index: ex.deref(self.value_columns[index])
if index != -1
else ex.const(None)
left_input_lookup = lambda index: (
ex.deref(self.value_columns[index]) if index != -1 else ex.const(None)
)
righ_input_lookup = (
lambda index: ex.const(other.iloc[index]) if index != -1 else ex.const(None)
righ_input_lookup = lambda index: (
ex.const(other.iloc[index]) if index != -1 else ex.const(None)
)

left_inputs = [left_input_lookup(i) for i in lcol_indexer]
Expand Down
19 changes: 11 additions & 8 deletions packages/bigframes/bigframes/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,12 +255,6 @@ def query_job(self) -> bigquery.QueryJob:
self._query_job = query_job
return self._query_job

@property
def str(self) -> bigframes.operations.strings.StringMethods:
import bigframes.operations.strings

return bigframes.operations.strings.StringMethods(self)

def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]:
"""Get integer location, slice or boolean mask for requested label.

Expand Down Expand Up @@ -436,7 +430,8 @@ def sort_values(
*,
inplace: bool = False,
ascending: bool = True,
na_position: __builtins__.str = "last",
kind: str | None = None,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Neat use of an existing pandas parameter! (https://pandas.pydata.org/docs/reference/api/pandas.Series.sort_values.html#pandas.Series.sort_values)

Interesting opportunity here too with regards to the default in pandas not being a stable sort.

na_position: str = "last",
) -> Index:
if na_position not in ["first", "last"]:
raise ValueError("Param na_position must be one of 'first' or 'last'")
Expand All @@ -448,7 +443,8 @@ def sort_values(
else order.descending_over(column, na_last)
for column in index_columns
]
return Index(self._block.order_by(ordering))
is_stable = (kind or constants.DEFAULT_SORT_KIND) in constants.STABLE_SORT_KINDS
return Index(self._block.order_by(ordering, stable=is_stable))

def astype(
self,
Expand Down Expand Up @@ -840,6 +836,13 @@ def _apply_binary_op(
else:
return NotImplemented

# Defined last so that this `str` property does not shadow the builtin `str` in annotations earlier in the class body.
@property
def str(self) -> bigframes.operations.strings.StringMethods:
import bigframes.operations.strings

return bigframes.operations.strings.StringMethods(self)


def _should_create_datetime_index(block: blocks.Block) -> bool:
if len(block.index.dtypes) != 1:
Expand Down
3 changes: 2 additions & 1 deletion packages/bigframes/bigframes/core/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -991,7 +991,8 @@ def remap_refs(
@dataclasses.dataclass(frozen=True, eq=False)
class OrderByNode(UnaryNode):
by: Tuple[OrderingExpression, ...]
# This is an optimization, if true, can discard previous orderings.
stable: bool = True
# This is an optimization: if true, previous orderings can be discarded, even when doing a stable sort.
# Note that the ordering might still be total even if this is false.
is_total_order: bool = False

Expand Down
7 changes: 6 additions & 1 deletion packages/bigframes/bigframes/core/rewrite/order.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ def pull_up_order_inner(
child_result, child_order = pull_up_order_inner(node.child)
return child_result, child_order.with_reverse()
elif isinstance(node, bigframes.core.nodes.OrderByNode):
if node.is_total_order:
# Unstable sorts don't care about the previous order; total orders override the previous order.
if (not node.stable) or node.is_total_order:
new_node = remove_order(node.child)
else:
new_node, child_order = pull_up_order_inner(node.child)
Expand Down Expand Up @@ -106,6 +107,10 @@ def pull_up_order_inner(
),
)
)
elif not node.stable:
new_order = bigframes.core.ordering.RowOrdering(
ordering_value_columns=tuple(new_by),
)
else:
assert child_order
new_order = child_order.with_ordering_columns(new_by)
Expand Down
25 changes: 16 additions & 9 deletions packages/bigframes/bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2418,6 +2418,7 @@ def sort_index(
*,
ascending: bool = ...,
inplace: Literal[False] = ...,
kind: str | None = ...,
na_position: Literal["first", "last"] = ...,
) -> DataFrame: ...

Expand All @@ -2427,6 +2428,7 @@ def sort_index(
*,
ascending: bool = ...,
inplace: Literal[True] = ...,
kind: str | None = ...,
na_position: Literal["first", "last"] = ...,
) -> None: ...

Expand All @@ -2436,6 +2438,7 @@ def sort_index(
axis: Union[int, str] = 0,
ascending: bool = True,
inplace: bool = False,
kind: str | None = None,
na_position: Literal["first", "last"] = "last",
) -> Optional[DataFrame]:
if utils.get_axis_number(axis) == 0:
Expand All @@ -2449,7 +2452,10 @@ def sort_index(
else order.descending_over(column, na_last)
for column in index_columns
]
block = self._block.order_by(ordering)
is_stable = (
kind or constants.DEFAULT_SORT_KIND
) in constants.STABLE_SORT_KINDS
block = self._block.order_by(ordering, stable=is_stable)
else: # axis=1
_, indexer = self.columns.sort_values(
return_indexer=True,
Expand All @@ -2472,7 +2478,7 @@ def sort_values(
*,
inplace: Literal[False] = ...,
ascending: bool | typing.Sequence[bool] = ...,
kind: str = ...,
kind: str | None = ...,
na_position: typing.Literal["first", "last"] = ...,
) -> DataFrame: ...

Expand All @@ -2483,7 +2489,7 @@ def sort_values(
*,
inplace: Literal[True] = ...,
ascending: bool | typing.Sequence[bool] = ...,
kind: str = ...,
kind: str | None = ...,
na_position: typing.Literal["first", "last"] = ...,
) -> None: ...

Expand All @@ -2493,7 +2499,7 @@ def sort_values(
*,
inplace: bool = False,
ascending: bool | typing.Sequence[bool] = True,
kind: str = "quicksort",
kind: str | None = None,
na_position: typing.Literal["first", "last"] = "last",
) -> Optional[DataFrame]:
if isinstance(by, (bigframes.series.Series, indexes.Index, DataFrame)):
Expand Down Expand Up @@ -2525,7 +2531,8 @@ def sort_values(
if is_ascending
else order.descending_over(column_id, na_last)
)
block = self._block.order_by(ordering)
is_stable = (kind or constants.DEFAULT_SORT_KIND) in constants.STABLE_SORT_KINDS
block = self._block.order_by(ordering, stable=is_stable)
if inplace:
self._set_block(block)
return None
Expand Down Expand Up @@ -2768,11 +2775,11 @@ def replace(
):
if utils.is_dict_like(value):
return self.apply(
lambda x: x.replace(
to_replace=to_replace, value=value[x.name], regex=regex
lambda x: (
x.replace(to_replace=to_replace, value=value[x.name], regex=regex)
if (x.name in value)
else x
)
if (x.name in value)
else x
)
return self.apply(
lambda x: x.replace(to_replace=to_replace, value=value, regex=regex)
Expand Down
35 changes: 28 additions & 7 deletions packages/bigframes/bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1769,7 +1769,7 @@ def sort_values(
axis=...,
inplace: Literal[True] = ...,
ascending: bool | typing.Sequence[bool] = ...,
kind: str = ...,
kind: str | None = ...,
na_position: typing.Literal["first", "last"] = ...,
) -> None: ...

Expand All @@ -1780,7 +1780,7 @@ def sort_values(
axis=...,
inplace: Literal[False] = ...,
ascending: bool | typing.Sequence[bool] = ...,
kind: str = ...,
kind: str | None = ...,
na_position: typing.Literal["first", "last"] = ...,
) -> Series: ...

Expand All @@ -1790,19 +1790,21 @@ def sort_values(
axis=0,
inplace: bool = False,
ascending=True,
kind: str = "quicksort",
kind: str | None = None,
na_position: typing.Literal["first", "last"] = "last",
) -> Optional[Series]:
if axis != 0 and axis != "index":
raise ValueError(f"No axis named {axis} for object type Series")
if na_position not in ["first", "last"]:
raise ValueError("Param na_position must be one of 'first' or 'last'")
is_stable = (kind or constants.DEFAULT_SORT_KIND) in constants.STABLE_SORT_KINDS
block = self._block.order_by(
[
order.ascending_over(self._value_column, (na_position == "last"))
if ascending
else order.descending_over(self._value_column, (na_position == "last"))
],
stable=is_stable,
)
if inplace:
self._set_block(block)
Expand All @@ -1812,17 +1814,35 @@ def sort_values(

@typing.overload # type: ignore[override]
def sort_index(
self, *, axis=..., inplace: Literal[False] = ..., ascending=..., na_position=...
self,
*,
axis=...,
inplace: Literal[False] = ...,
ascending=...,
kind: str | None = ...,
na_position=...,
) -> Series: ...

@typing.overload
def sort_index(
self, *, axis=0, inplace: Literal[True] = ..., ascending=..., na_position=...
self,
*,
axis=0,
inplace: Literal[True] = ...,
ascending=...,
kind: str | None = ...,
na_position=...,
) -> None: ...

@validations.requires_index
def sort_index(
self, *, axis=0, inplace: bool = False, ascending=True, na_position="last"
self,
*,
axis=0,
inplace: bool = False,
ascending=True,
kind: str | None = None,
na_position="last",
) -> Optional[Series]:
# TODO(tbergeron): Support level parameter once multi-index introduced.
if axis != 0 and axis != "index":
Expand All @@ -1837,7 +1857,8 @@ def sort_index(
else order.descending_over(column, na_last)
for column in block.index_columns
]
block = block.order_by(ordering)
is_stable = (kind or constants.DEFAULT_SORT_KIND) in constants.STABLE_SORT_KINDS
block = block.order_by(ordering, stable=is_stable)
if inplace:
self._set_block(block)
return None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,6 @@
"_deferred",
]
VALID_WRITE_ENGINES = typing.get_args(WriteEngineType)

DEFAULT_SORT_KIND = "stable"
Comment thread
TrevorBergeron marked this conversation as resolved.
STABLE_SORT_KINDS = ("stable", "mergesort")
Original file line number Diff line number Diff line change
Expand Up @@ -2253,7 +2253,7 @@ def sort_values(
*,
inplace: bool = False,
ascending: bool | Sequence[bool] = True,
kind: str = "quicksort",
kind: str | None = None,
na_position: Literal["first", "last"] = "last",
):
"""Sort by the values along row axis.
Expand Down Expand Up @@ -2339,7 +2339,7 @@ def sort_values(
the by.
inplace (bool, default False):
If True, perform operation in-place.
kind (str, default 'quicksort'):
kind (str, default None):
Choice of sorting algorithm. Accepts 'quicksort', 'mergesort',
'heapsort', 'stable'. Ignored except when determining whether to
sort stably. 'mergesort' or 'stable' will result in a stable reorder.
If None, a stable sort is used.
Expand All @@ -2363,6 +2363,7 @@ def sort_index(
axis: str | int = 0,
ascending: bool = True,
inplace: bool = False,
kind: str | None = None,
na_position: Literal["first", "last"] = "last",
):
"""Sort object by labels (along an axis).
Expand All @@ -2375,6 +2376,10 @@ def sort_index(
Sort ascending vs. descending.
inplace (bool, default False):
Whether to modify the DataFrame rather than creating a new one.
kind (str, default None):
Choice of sorting algorithm. Accepts 'quicksort', 'mergesort',
'heapsort', 'stable'. Ignored except when determining whether to
sort stably. 'mergesort' or 'stable' will result in a stable reorder.
If None, a stable sort is used.
na_position ({'first', 'last'}, default 'last'):
Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
Not implemented for MultiIndex.
Expand Down
Loading
Loading