Skip to content

feat: Add inplace arg support to sort methods #1710

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion bigframes/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,13 @@ def _memory_usage(self) -> int:
def transpose(self) -> Index:
return self

def sort_values(self, *, ascending: bool = True, na_position: str = "last"):
def sort_values(
self,
*,
inplace: bool = False,
ascending: bool = True,
na_position: str = "last",
) -> Index:
if na_position not in ["first", "last"]:
raise ValueError("Param na_position must be one of 'first' or 'last'")
na_last = na_position == "last"
Expand Down
69 changes: 64 additions & 5 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2234,10 +2234,34 @@ def set_index(
col_ids_strs: List[str] = [col_id for col_id in col_ids if col_id is not None]
return DataFrame(self._block.set_index(col_ids_strs, append=append, drop=drop))

@validations.requires_index
@overload # type: ignore[override]
def sort_index(
self, ascending: bool = True, na_position: Literal["first", "last"] = "last"
self,
*,
ascending: bool = ...,
inplace: Literal[False] = ...,
na_position: Literal["first", "last"] = ...,
) -> DataFrame:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can we not just return Optional[DataFrame] here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is the typing overload, see https://docs.python.org/3/library/typing.html#overload .

...

@overload
def sort_index(
self,
*,
ascending: bool = ...,
inplace: Literal[True] = ...,
na_position: Literal["first", "last"] = ...,
) -> None:
...

@validations.requires_index
def sort_index(
self,
*,
ascending: bool = True,
inplace: bool = False,
na_position: Literal["first", "last"] = "last",
) -> Optional[DataFrame]:
if na_position not in ["first", "last"]:
raise ValueError("Param na_position must be one of 'first' or 'last'")
na_last = na_position == "last"
Expand All @@ -2248,16 +2272,46 @@ def sort_index(
else order.descending_over(column, na_last)
for column in index_columns
]
return DataFrame(self._block.order_by(ordering))
block = self._block.order_by(ordering)
if inplace:
self._set_block(block)
return None
else:
return DataFrame(block)

@overload # type: ignore[override]
def sort_values(
self,
by: str | typing.Sequence[str],
*,
inplace: Literal[False] = ...,
ascending: bool | typing.Sequence[bool] = ...,
kind: str = ...,
na_position: typing.Literal["first", "last"] = ...,
) -> DataFrame:
...

@overload
def sort_values(
self,
by: str | typing.Sequence[str],
*,
inplace: Literal[True] = ...,
ascending: bool | typing.Sequence[bool] = ...,
kind: str = ...,
na_position: typing.Literal["first", "last"] = ...,
) -> None:
...

def sort_values(
self,
by: str | typing.Sequence[str],
*,
inplace: bool = False,
ascending: bool | typing.Sequence[bool] = True,
kind: str = "quicksort",
na_position: typing.Literal["first", "last"] = "last",
) -> DataFrame:
) -> Optional[DataFrame]:
if isinstance(by, (bigframes.series.Series, indexes.Index, DataFrame)):
raise KeyError(
f"Invalid key type: {type(by).__name__}. Please provide valid column name(s)."
Expand Down Expand Up @@ -2287,7 +2341,12 @@ def sort_values(
if is_ascending
else order.descending_over(column_id, na_last)
)
return DataFrame(self._block.order_by(ordering))
block = self._block.order_by(ordering)
if inplace:
self._set_block(block)
return None
else:
return DataFrame(block)

def eval(self, expr: str) -> DataFrame:
import bigframes.core.eval as bf_eval
Expand Down
4 changes: 2 additions & 2 deletions bigframes/ml/metrics/_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ def recall_score(
unique_labels = (
bpd.concat([y_true_series, y_pred_series], join="outer")
.drop_duplicates()
.sort_values()
.sort_values(inplace=False)
)
index = unique_labels.to_list()

Expand Down Expand Up @@ -277,7 +277,7 @@ def precision_score(
unique_labels = (
bpd.concat([y_true_series, y_pred_series], join="outer")
.drop_duplicates()
.sort_values()
.sort_values(inplace=False)
)
index = unique_labels.to_list()

Expand Down
60 changes: 56 additions & 4 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1538,9 +1538,39 @@ def value_counts(
)
return Series(block)

@typing.overload # type: ignore[override]
def sort_values(
self, *, axis=0, ascending=True, kind: str = "quicksort", na_position="last"
self,
*,
axis=...,
inplace: Literal[True] = ...,
ascending: bool | typing.Sequence[bool] = ...,
kind: str = ...,
na_position: typing.Literal["first", "last"] = ...,
) -> None:
...

@typing.overload
def sort_values(
self,
*,
axis=...,
inplace: Literal[False] = ...,
ascending: bool | typing.Sequence[bool] = ...,
kind: str = ...,
na_position: typing.Literal["first", "last"] = ...,
) -> Series:
...

def sort_values(
self,
*,
axis=0,
inplace: bool = False,
ascending=True,
kind: str = "quicksort",
na_position: typing.Literal["first", "last"] = "last",
) -> Optional[Series]:
if axis != 0 and axis != "index":
raise ValueError(f"No axis named {axis} for object type Series")
if na_position not in ["first", "last"]:
Expand All @@ -1552,10 +1582,28 @@ def sort_values(
else order.descending_over(self._value_column, (na_position == "last"))
],
)
return Series(block)
if inplace:
self._set_block(block)
return None
else:
return Series(block)

@typing.overload # type: ignore[override]
def sort_index(
self, *, axis=..., inplace: Literal[False] = ..., ascending=..., na_position=...
) -> Series:
...

@typing.overload
def sort_index(
self, *, axis=0, inplace: Literal[True] = ..., ascending=..., na_position=...
) -> None:
...

@validations.requires_index
def sort_index(self, *, axis=0, ascending=True, na_position="last") -> Series:
def sort_index(
self, *, axis=0, inplace: bool = False, ascending=True, na_position="last"
) -> Optional[Series]:
# TODO(tbergeron): Support level parameter once multi-index introduced.
if axis != 0 and axis != "index":
raise ValueError(f"No axis named {axis} for object type Series")
Expand All @@ -1570,7 +1618,11 @@ def sort_index(self, *, axis=0, ascending=True, na_position="last") -> Series:
for column in block.index_columns
]
block = block.order_by(ordering)
return Series(block)
if inplace:
self._set_block(block)
return None
else:
return Series(block)

@validations.requires_ordering()
def rolling(
Expand Down
37 changes: 37 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2035,6 +2035,17 @@ def test_sort_index(scalars_dfs, ascending, na_position):
pandas.testing.assert_frame_equal(bf_result, pd_result)


def test_dataframe_sort_index_inplace(scalars_dfs):
index_column = "int64_col"
scalars_df, scalars_pandas_df = scalars_dfs
df = scalars_df.copy().set_index(index_column)
df.sort_index(ascending=False, inplace=True)
bf_result = df.to_pandas()

pd_result = scalars_pandas_df.set_index(index_column).sort_index(ascending=False)
pandas.testing.assert_frame_equal(bf_result, pd_result)


def test_df_abs(scalars_dfs_maybe_ordered):
scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
columns = ["int64_col", "int64_too", "float64_col"]
Expand Down Expand Up @@ -2817,6 +2828,32 @@ def test_dataframe_sort_values(
)


@pytest.mark.parametrize(
("by", "ascending", "na_position"),
[
("int64_col", True, "first"),
(["bool_col", "int64_col"], True, "last"),
],
)
def test_dataframe_sort_values_inplace(
scalars_df_index, scalars_pandas_df_index, by, ascending, na_position
):
# Test needs values to be unique
bf_sorted = scalars_df_index.copy()
bf_sorted.sort_values(
by, ascending=ascending, na_position=na_position, inplace=True
)
bf_result = bf_sorted.to_pandas()
pd_result = scalars_pandas_df_index.sort_values(
by, ascending=ascending, na_position=na_position
)

pandas.testing.assert_frame_equal(
bf_result,
pd_result,
)


def test_dataframe_sort_values_invalid_input(scalars_df_index):
with pytest.raises(KeyError):
scalars_df_index.sort_values(by=scalars_df_index["int64_col"])
Expand Down
25 changes: 25 additions & 0 deletions tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3339,6 +3339,19 @@ def test_sort_values(scalars_df_index, scalars_pandas_df_index, ascending, na_po
)


def test_series_sort_values_inplace(scalars_df_index, scalars_pandas_df_index):
# Test needs values to be unique
bf_series = scalars_df_index["int64_col"].copy()
bf_series.sort_values(ascending=False, inplace=True)
bf_result = bf_series.to_pandas()
pd_result = scalars_pandas_df_index["int64_col"].sort_values(ascending=False)

pd.testing.assert_series_equal(
bf_result,
pd_result,
)


@pytest.mark.parametrize(
("ascending"),
[
Expand All @@ -3358,6 +3371,18 @@ def test_sort_index(scalars_df_index, scalars_pandas_df_index, ascending):
)


def test_series_sort_index_inplace(scalars_df_index, scalars_pandas_df_index):
bf_series = scalars_df_index["int64_too"].copy()
bf_series.sort_index(ascending=False, inplace=True)
bf_result = bf_series.to_pandas()
pd_result = scalars_pandas_df_index["int64_too"].sort_index(ascending=False)

pd.testing.assert_series_equal(
bf_result,
pd_result,
)


def test_mask_default_value(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs

Expand Down
9 changes: 5 additions & 4 deletions tests/system/small/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,16 +96,17 @@ def test_read_gbq_tokyo(
tokyo_location: str,
):
df = session_tokyo.read_gbq(scalars_table_tokyo, index_col=["rowindex"])
result = df.sort_index().to_pandas()
df.sort_index(inplace=True)
expected = scalars_pandas_df_index

# use_explicit_destination=True, otherwise might use path with no query_job
result = session_tokyo._executor.execute(
exec_result = session_tokyo._executor.execute(
df._block.expr, use_explicit_destination=True
)
assert result.query_job.location == tokyo_location
assert exec_result.query_job is not None
assert exec_result.query_job.location == tokyo_location

assert len(expected) == result.total_rows
assert len(expected) == exec_result.total_rows


@pytest.mark.parametrize(
Expand Down
28 changes: 22 additions & 6 deletions third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2213,10 +2213,11 @@ def sort_values(
self,
by: str | Sequence[str],
*,
inplace: bool = False,
ascending: bool | Sequence[bool] = True,
kind: str = "quicksort",
na_position="last",
) -> DataFrame:
na_position: Literal["first", "last"] = "last",
):
"""Sort by the values along row axis.

**Examples:**
Expand Down Expand Up @@ -2300,6 +2301,8 @@ def sort_values(
Sort ascending vs. descending. Specify list for multiple sort
orders. If this is a list of bools, must match the length of
the by.
inplace (bool, default False):
If True, perform operation in-place.
kind (str, default 'quicksort'):
Choice of sorting algorithm. Accepts 'quicksort', 'mergesort',
'heapsort', 'stable'. Ignored except when determining whether to
Expand All @@ -2309,8 +2312,8 @@ def sort_values(
if `first`; `last` puts NaNs at the end.

Returns:
bigframes.pandas.DataFrame:
DataFrame with sorted values.
bigframes.pandas.DataFram or None:
DataFrame with sorted values or None if inplace=True.

Raises:
ValueError:
Expand All @@ -2320,12 +2323,25 @@ def sort_values(

def sort_index(
self,
) -> DataFrame:
*,
ascending: bool = True,
inplace: bool = False,
na_position: Literal["first", "last"] = "last",
):
"""Sort object by labels (along an axis).

Args:
ascending (bool, default True)
Sort ascending vs. descending.
inplace (bool, default False):
Whether to modify the DataFrame rather than creating a new one.
na_position ({'first', 'last'}, default 'last'):
Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
Not implemented for MultiIndex.

Returns:
bigframes.pandas.DataFrame:
The original DataFrame sorted by the labels.
DataFrame with sorted values or None if inplace=True.

Raises:
ValueError:
Expand Down
Loading