From 8f57ee79b859a0852b6a301b8440f4a1c060b024 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 8 May 2025 17:11:51 +0000 Subject: [PATCH 1/2] feat: Add inplace arg support to sort methods --- bigframes/core/indexes/base.py | 8 ++- bigframes/dataframe.py | 69 +++++++++++++++++-- bigframes/series.py | 60 ++++++++++++++-- tests/system/small/test_dataframe.py | 37 ++++++++++ tests/system/small/test_series.py | 25 +++++++ .../bigframes_vendored/pandas/core/frame.py | 26 +++++-- .../bigframes_vendored/pandas/core/series.py | 6 ++ 7 files changed, 216 insertions(+), 15 deletions(-) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 71dc914ed4..eac1f58eae 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -298,7 +298,13 @@ def _memory_usage(self) -> int: def transpose(self) -> Index: return self - def sort_values(self, *, ascending: bool = True, na_position: str = "last"): + def sort_values( + self, + *, + inplace: bool = False, + ascending: bool = True, + na_position: str = "last", + ) -> Index: if na_position not in ["first", "last"]: raise ValueError("Param na_position must be one of 'first' or 'last'") na_last = na_position == "last" diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 8e9794a5e3..d520a4a36f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2234,10 +2234,34 @@ def set_index( col_ids_strs: List[str] = [col_id for col_id in col_ids if col_id is not None] return DataFrame(self._block.set_index(col_ids_strs, append=append, drop=drop)) - @validations.requires_index + @overload def sort_index( - self, ascending: bool = True, na_position: Literal["first", "last"] = "last" + self, + *, + ascending: bool = ..., + inplace: Literal[False] = ..., + na_position: Literal["first", "last"] = ..., ) -> DataFrame: + ... + + @overload + def sort_index( + self, + *, + ascending: bool = ..., + inplace: Literal[True] = ..., + na_position: Literal["first", "last"] = ..., + ) -> None: + ... + + @validations.requires_index + def sort_index( + self, + *, + ascending: bool = True, + inplace: bool = False, + na_position: Literal["first", "last"] = "last", + ) -> Optional[DataFrame]: if na_position not in ["first", "last"]: raise ValueError("Param na_position must be one of 'first' or 'last'") na_last = na_position == "last" @@ -2248,16 +2272,46 @@ def sort_index( else order.descending_over(column, na_last) for column in index_columns ] - return DataFrame(self._block.order_by(ordering)) + block = self._block.order_by(ordering) + if inplace: + self._set_block(block) + return None + else: + return DataFrame(block) + @overload def sort_values( self, by: str | typing.Sequence[str], *, + inplace: Literal[False] = ..., + ascending: bool | typing.Sequence[bool] = ..., + kind: str = ..., + na_position: typing.Literal["first", "last"] = ..., + ) -> DataFrame: + ... + + @overload + def sort_values( + self, + by: str | typing.Sequence[str], + *, + inplace: Literal[True] = ..., + ascending: bool | typing.Sequence[bool] = ..., + kind: str = ..., + na_position: typing.Literal["first", "last"] = ..., + ) -> None: + ... + + def sort_values( + self, + by: str | typing.Sequence[str], + *, + inplace: bool = False, ascending: bool | typing.Sequence[bool] = True, kind: str = "quicksort", na_position: typing.Literal["first", "last"] = "last", - ) -> DataFrame: + ) -> Optional[DataFrame]: if isinstance(by, (bigframes.series.Series, indexes.Index, DataFrame)): raise KeyError( f"Invalid key type: {type(by).__name__}. Please provide valid column name(s)." @@ -2287,7 +2341,12 @@ def sort_values( if is_ascending else order.descending_over(column_id, na_last) ) - return DataFrame(self._block.order_by(ordering)) + block = self._block.order_by(ordering) + if inplace: + self._set_block(block) + return None + else: + return DataFrame(block) def eval(self, expr: str) -> DataFrame: import bigframes.core.eval as bf_eval diff --git a/bigframes/series.py b/bigframes/series.py index 37a3723a0a..7b69d0176a 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1538,9 +1538,39 @@ def value_counts( ) return Series(block) + @typing.overload def sort_values( - self, *, axis=0, ascending=True, kind: str = "quicksort", na_position="last" + self, + *, + axis=..., + inplace: Literal[True] = ..., + ascending: bool | typing.Sequence[bool] = ..., + kind: str = ..., + na_position: typing.Literal["first", "last"] = ..., + ) -> None: + ... + + @typing.overload + def sort_values( + self, + *, + axis=..., + inplace: Literal[False] = ..., + ascending: bool | typing.Sequence[bool] = ..., + kind: str = ..., + na_position: typing.Literal["first", "last"] = ..., ) -> Series: + ... + + def sort_values( + self, + *, + axis=0, + inplace: bool = False, + ascending=True, + kind: str = "quicksort", + na_position: typing.Literal["first", "last"] = "last", + ) -> Optional[Series]: if axis != 0 and axis != "index": raise ValueError(f"No axis named {axis} for object type Series") if na_position not in ["first", "last"]: @@ -1552,10 +1582,28 @@ def sort_values( else order.descending_over(self._value_column, (na_position == "last")) ], ) - return Series(block) + if inplace: + self._set_block(block) + return None + else: + return Series(block) + + @typing.overload + def sort_index( + self, *, axis=..., inplace: Literal[False] = ..., ascending=..., na_position=... + ) -> Series: + ... + + @typing.overload + def sort_index( + self, *, axis=0, inplace: Literal[True] = ..., ascending=..., na_position=... + ) -> None: + ... @validations.requires_index - def sort_index(self, *, axis=0, ascending=True, na_position="last") -> Series: + def sort_index( + self, *, axis=0, inplace: bool = False, ascending=True, na_position="last" + ) -> Optional[Series]: # TODO(tbergeron): Support level parameter once multi-index introduced. if axis != 0 and axis != "index": raise ValueError(f"No axis named {axis} for object type Series") @@ -1570,7 +1618,11 @@ def sort_index(self, *, axis=0, ascending=True, na_position="last") -> Series: for column in block.index_columns ] block = block.order_by(ordering) - return Series(block) + if inplace: + self._set_block(block) + return None + else: + return Series(block) @validations.requires_ordering() def rolling( diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 362d736aeb..ce291d4999 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2035,6 +2035,17 @@ def test_sort_index(scalars_dfs, ascending, na_position): pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_dataframe_sort_index_inplace(scalars_dfs): + index_column = "int64_col" + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df.copy().set_index(index_column) + df.sort_index(ascending=False, inplace=True) + bf_result = df.to_pandas() + + pd_result = scalars_pandas_df.set_index(index_column).sort_index(ascending=False) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + def test_df_abs(scalars_dfs_maybe_ordered): scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered columns = ["int64_col", "int64_too", "float64_col"] @@ -2817,6 +2828,32 @@ def test_dataframe_sort_values( ) +@pytest.mark.parametrize( + ("by", "ascending", "na_position"), + [ + ("int64_col", True, "first"), + (["bool_col", "int64_col"], True, "last"), + ], +) +def test_dataframe_sort_values_inplace( + scalars_df_index, scalars_pandas_df_index, by, ascending, na_position +): + # Test needs values to be unique + bf_sorted = scalars_df_index.copy() + bf_sorted.sort_values( + by, ascending=ascending, na_position=na_position, inplace=True + ) + bf_result = bf_sorted.to_pandas() + pd_result = scalars_pandas_df_index.sort_values( + by, ascending=ascending, na_position=na_position + ) + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + def test_dataframe_sort_values_invalid_input(scalars_df_index): with pytest.raises(KeyError): scalars_df_index.sort_values(by=scalars_df_index["int64_col"]) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index c63bf8e12b..3852c417fa 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3339,6 +3339,19 @@ def test_sort_values(scalars_df_index, scalars_pandas_df_index, ascending, na_po ) +def test_series_sort_values_inplace(scalars_df_index, scalars_pandas_df_index): + # Test needs values to be unique + bf_series = scalars_df_index["int64_col"].copy() + bf_series.sort_values(ascending=False, inplace=True) + bf_result = bf_series.to_pandas() + pd_result = scalars_pandas_df_index["int64_col"].sort_values(ascending=False) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + @pytest.mark.parametrize( ("ascending"), [ @@ -3358,6 +3371,18 @@ def test_sort_index(scalars_df_index, scalars_pandas_df_index, ascending): ) +def test_series_sort_index_inplace(scalars_df_index, scalars_pandas_df_index): + bf_series = scalars_df_index["int64_too"].copy() + bf_series.sort_index(ascending=False, inplace=True) + bf_result = bf_series.to_pandas() + pd_result = scalars_pandas_df_index["int64_too"].sort_index(ascending=False) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + def test_mask_default_value(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 8f3e150606..15b318fbda 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2213,10 +2213,11 @@ def sort_values( self, by: str | Sequence[str], *, + inplace: bool = False, ascending: bool | Sequence[bool] = True, kind: str = "quicksort", na_position="last", - ) -> DataFrame: + ) -> Optional[DataFrame]: """Sort by the values along row axis. **Examples:** @@ -2300,6 +2301,8 @@ def sort_values( Sort ascending vs. descending. Specify list for multiple sort orders. If this is a list of bools, must match the length of the by. + inplace (bool, default False): + If True, perform operation in-place. kind (str, default 'quicksort'): Choice of sorting algorithm. Accepts 'quicksort', 'mergesort', 'heapsort', 'stable'. Ignored except when determining whether to @@ -2309,8 +2312,8 @@ def sort_values( if `first`; `last` puts NaNs at the end. Returns: - bigframes.pandas.DataFrame: - DataFrame with sorted values. + bigframes.pandas.DataFram or None: + DataFrame with sorted values or None if inplace=True. Raises: ValueError: @@ -2320,12 +2323,25 @@ def sort_values( def sort_index( self, - ) -> DataFrame: + *, + ascending: bool = True, + inplace: bool = False, + na_position: Literal["first", "last"] = "last", + ) -> Optional[DataFrame]: """Sort object by labels (along an axis). + Args: + ascending (bool, default True) + Sort ascending vs. descending. + inplace (bool, default False): + Whether to modify the DataFrame rather than creating a new one. + na_position ({'first', 'last'}, default 'last'): + Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. + Not implemented for MultiIndex. + Returns: bigframes.pandas.DataFrame: - The original DataFrame sorted by the labels. + DataFrame with sorted values or None if inplace=True. Raises: ValueError: diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index a2d0983652..1d42a8fca2 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1504,6 +1504,7 @@ def sort_values( self, *, axis: Axis = 0, + inplace: bool = False, ascending: bool | int | Sequence[bool] | Sequence[int] = True, kind: str = "quicksort", na_position: str = "last", @@ -1581,6 +1582,8 @@ def sort_values( Args: axis (0 or 'index'): Unused. Parameter needed for compatibility with DataFrame. + inplace (bool, default False): + Whether to modify the Series rather than creating a new one. ascending (bool or list of bools, default True): If True, sort values in ascending order, otherwise descending. kind (str, default to 'quicksort'): @@ -1601,6 +1604,7 @@ def sort_index( self, *, axis: Axis = 0, + inplace: bool = False, ascending: bool | Sequence[bool] = True, na_position: NaPosition = "last", ) -> Series | None: @@ -1647,6 +1651,8 @@ def sort_index( Args: axis ({0 or 'index'}): Unused. Parameter needed for compatibility with DataFrame. + inplace (bool, default False): + Whether to modify the Series rather than creating a new one. ascending (bool or list-like of bools, default True): Sort ascending vs. descending. When the index is a MultiIndex the sort direction can be controlled for each level individually. From 4f08fc7f5fee9b6d603b51d5bc057a6507e65b97 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 8 May 2025 17:51:40 +0000 Subject: [PATCH 2/2] mypy overloads --- bigframes/dataframe.py | 4 ++-- bigframes/ml/metrics/_metrics.py | 4 ++-- bigframes/series.py | 4 ++-- tests/system/small/test_session.py | 9 +++++---- third_party/bigframes_vendored/pandas/core/frame.py | 6 +++--- third_party/bigframes_vendored/pandas/core/series.py | 4 ++-- 6 files changed, 16 insertions(+), 15 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d520a4a36f..8ed749138c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2234,7 +2234,7 @@ def set_index( col_ids_strs: List[str] = [col_id for col_id in col_ids if col_id is not None] return DataFrame(self._block.set_index(col_ids_strs, append=append, drop=drop)) - @overload + @overload # type: ignore[override] def sort_index( self, *, @@ -2279,7 +2279,7 @@ def sort_index( else: return DataFrame(block) - @overload + @overload # type: ignore[override] def sort_values( self, by: str | typing.Sequence[str], diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index 658818b261..d7591ef011 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -240,7 +240,7 @@ def recall_score( unique_labels = ( bpd.concat([y_true_series, y_pred_series], join="outer") .drop_duplicates() - .sort_values() + .sort_values(inplace=False) ) index = unique_labels.to_list() @@ -277,7 +277,7 @@ def precision_score( unique_labels = ( bpd.concat([y_true_series, y_pred_series], join="outer") .drop_duplicates() - .sort_values() + .sort_values(inplace=False) ) index = unique_labels.to_list() diff --git a/bigframes/series.py b/bigframes/series.py index 7b69d0176a..1e29671310 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1538,7 +1538,7 @@ def value_counts( ) return Series(block) - @typing.overload + @typing.overload # type: ignore[override] def sort_values( self, *, @@ -1588,7 +1588,7 @@ def sort_values( else: return Series(block) - @typing.overload + @typing.overload # type: ignore[override] def sort_index( self, *, axis=..., inplace: Literal[False] = ..., ascending=..., na_position=... ) -> Series: diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index ab460d5bc9..eeb242e8da 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -96,16 +96,17 @@ def test_read_gbq_tokyo( tokyo_location: str, ): df = session_tokyo.read_gbq(scalars_table_tokyo, index_col=["rowindex"]) - result = df.sort_index().to_pandas() + df.sort_index(inplace=True) expected = scalars_pandas_df_index # use_explicit_destination=True, otherwise might use path with no query_job - result = session_tokyo._executor.execute( + exec_result = session_tokyo._executor.execute( df._block.expr, use_explicit_destination=True ) - assert result.query_job.location == tokyo_location + assert exec_result.query_job is not None + assert exec_result.query_job.location == tokyo_location - assert len(expected) == result.total_rows + assert len(expected) == exec_result.total_rows @pytest.mark.parametrize( diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 15b318fbda..5bbf72b421 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2216,8 +2216,8 @@ def sort_values( inplace: bool = False, ascending: bool | Sequence[bool] = True, kind: str = "quicksort", - na_position="last", - ) -> Optional[DataFrame]: + na_position: Literal["first", "last"] = "last", + ): """Sort by the values along row axis. **Examples:** @@ -2327,7 +2327,7 @@ def sort_index( ascending: bool = True, inplace: bool = False, na_position: Literal["first", "last"] = "last", - ) -> Optional[DataFrame]: + ): """Sort object by labels (along an axis). Args: diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 1d42a8fca2..8164fa7415 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1508,7 +1508,7 @@ def sort_values( ascending: bool | int | Sequence[bool] | Sequence[int] = True, kind: str = "quicksort", na_position: str = "last", - ) -> Series | None: + ): """ Sort by the values. @@ -1607,7 +1607,7 @@ def sort_index( inplace: bool = False, ascending: bool | Sequence[bool] = True, na_position: NaPosition = "last", - ) -> Series | None: + ): """ Sort Series by index labels.