diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 090fea57872c5..acd7cec480c39 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1026,6 +1026,7 @@ Reshaping Sparse ^^^^^^ - Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``SparseDtype`` failing to retain the array's ``fill_value`` (:issue:`45691`) +- Bug in :meth:`SparseArray.unique` failing to keep the original order of elements (:issue:`47809`) - ExtensionArray diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 5653d87a4570b..b547446603853 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -821,7 +821,7 @@ def shift(self: SparseArrayT, periods: int = 1, fill_value=None) -> SparseArrayT def _first_fill_value_loc(self): """ - Get the location of the first missing value. + Get the location of the first fill value. Returns ------- @@ -834,14 +834,24 @@ def _first_fill_value_loc(self): if not len(indices) or indices[0] > 0: return 0 - diff = indices[1:] - indices[:-1] - return np.searchsorted(diff, 2) + 1 + # a number larger than 1 is appended at the + # end to cover the case where the fill value + # only appears in the tail of the array + diff = np.r_[np.diff(indices), 2] + return indices[(diff > 1).argmax()] + 1 def unique(self: SparseArrayT) -> SparseArrayT: uniques = algos.unique(self.sp_values) - fill_loc = self._first_fill_value_loc() - if fill_loc >= 0: - uniques = np.insert(uniques, fill_loc, self.fill_value) + if len(self.sp_values) != len(self): + fill_loc = self._first_fill_value_loc() + # In order to align with the behavior of pd.unique or + # pd.Series.unique, we should keep the original + # order, so here we use unique again to find the + # insertion place. Since the length of sp_values + # is not large, a minor performance hit + # is a worthwhile price for correctness. 
+ insert_loc = len(algos.unique(self.sp_values[:fill_loc])) + uniques = np.insert(uniques, insert_loc, self.fill_value) return type(self)._from_sequence(uniques, dtype=self.dtype) def _values_for_factorize(self): diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 492427b2be213..9b78eb345e188 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -391,23 +391,36 @@ def test_setting_fill_value_updates(): @pytest.mark.parametrize( - "arr, loc", + "arr,fill_value,loc", [ - ([None, 1, 2], 0), - ([0, None, 2], 1), - ([0, 1, None], 2), - ([0, 1, 1, None, None], 3), - ([1, 1, 1, 2], -1), - ([], -1), + ([None, 1, 2], None, 0), + ([0, None, 2], None, 1), + ([0, 1, None], None, 2), + ([0, 1, 1, None, None], None, 3), + ([1, 1, 1, 2], None, -1), + ([], None, -1), + ([None, 1, 0, 0, None, 2], None, 0), + ([None, 1, 0, 0, None, 2], 1, 1), + ([None, 1, 0, 0, None, 2], 2, 5), + ([None, 1, 0, 0, None, 2], 3, -1), + ([None, 0, 0, 1, 2, 1], 0, 1), + ([None, 0, 0, 1, 2, 1], 1, 3), ], ) -def test_first_fill_value_loc(arr, loc): - result = SparseArray(arr)._first_fill_value_loc() +def test_first_fill_value_loc(arr, fill_value, loc): + result = SparseArray(arr, fill_value=fill_value)._first_fill_value_loc() assert result == loc @pytest.mark.parametrize( - "arr", [[1, 2, np.nan, np.nan], [1, np.nan, 2, np.nan], [1, 2, np.nan]] + "arr", + [ + [1, 2, np.nan, np.nan], + [1, np.nan, 2, np.nan], + [1, 2, np.nan], + [np.nan, 1, 0, 0, np.nan, 2], + [np.nan, 0, 0, 1, 2, 1], + ], ) @pytest.mark.parametrize("fill_value", [np.nan, 0, 1]) def test_unique_na_fill(arr, fill_value):