BUG: fix SparseArray.unique IndexError and _first_fill_value_loc algo (#47810)

GYHHAHA · web-flow · commit 3d94f7a07975 · 2022-07-22T10:32:20.000-07:00
* Update array.py

* Update test_array.py

* Update array.py

* fix format

* Update v1.5.0.rst

* fix number
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -1027,6 +1027,7 @@ Reshaping
 Sparse
 ^^^^^^
 - Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``SparseDtype`` failing to retain the array's ``fill_value`` (:issue:`45691`)
+- Bug in :meth:`SparseArray.unique` failing to keep the original order of elements (:issue:`47809`)
 -
 
 ExtensionArray
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -821,7 +821,7 @@ def shift(self: SparseArrayT, periods: int = 1, fill_value=None) -> SparseArrayT
 
     def _first_fill_value_loc(self):
         """
-        Get the location of the first missing value.
+        Get the location of the first fill value.
 
         Returns
         -------
@@ -834,14 +834,24 @@ def _first_fill_value_loc(self):
         if not len(indices) or indices[0] > 0:
             return 0
 
-        diff = indices[1:] - indices[:-1]
-        return np.searchsorted(diff, 2) + 1
+        # a number larger than 1 is appended to the end
+        # of diff to handle the case where the fill value
+        # appears only in the tail of the array
+        diff = np.r_[np.diff(indices), 2]
+        return indices[(diff > 1).argmax()] + 1
 
     def unique(self: SparseArrayT) -> SparseArrayT:
         uniques = algos.unique(self.sp_values)
-        fill_loc = self._first_fill_value_loc()
-        if fill_loc >= 0:
-            uniques = np.insert(uniques, fill_loc, self.fill_value)
+        if len(self.sp_values) != len(self):
+            fill_loc = self._first_fill_value_loc()
+            # In order to match the behavior of pd.unique and
+            # pd.Series.unique, we must keep the original
+            # order, so we call unique again to find the
+            # insertion point. Since sp_values is typically
+            # not large, the minor performance cost is
+            # worthwhile for the sake of correctness.
+            insert_loc = len(algos.unique(self.sp_values[:fill_loc]))
+            uniques = np.insert(uniques, insert_loc, self.fill_value)
         return type(self)._from_sequence(uniques, dtype=self.dtype)
 
     def _values_for_factorize(self):
diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
@@ -391,23 +391,36 @@ def test_setting_fill_value_updates():
 
 
 @pytest.mark.parametrize(
-    "arr, loc",
+    "arr,fill_value,loc",
     [
-        ([None, 1, 2], 0),
-        ([0, None, 2], 1),
-        ([0, 1, None], 2),
-        ([0, 1, 1, None, None], 3),
-        ([1, 1, 1, 2], -1),
-        ([], -1),
+        ([None, 1, 2], None, 0),
+        ([0, None, 2], None, 1),
+        ([0, 1, None], None, 2),
+        ([0, 1, 1, None, None], None, 3),
+        ([1, 1, 1, 2], None, -1),
+        ([], None, -1),
+        ([None, 1, 0, 0, None, 2], None, 0),
+        ([None, 1, 0, 0, None, 2], 1, 1),
+        ([None, 1, 0, 0, None, 2], 2, 5),
+        ([None, 1, 0, 0, None, 2], 3, -1),
+        ([None, 0, 0, 1, 2, 1], 0, 1),
+        ([None, 0, 0, 1, 2, 1], 1, 3),
     ],
 )
-def test_first_fill_value_loc(arr, loc):
-    result = SparseArray(arr)._first_fill_value_loc()
+def test_first_fill_value_loc(arr, fill_value, loc):
+    result = SparseArray(arr, fill_value=fill_value)._first_fill_value_loc()
     assert result == loc
 
 
 @pytest.mark.parametrize(
-    "arr", [[1, 2, np.nan, np.nan], [1, np.nan, 2, np.nan], [1, 2, np.nan]]
+    "arr",
+    [
+        [1, 2, np.nan, np.nan],
+        [1, np.nan, 2, np.nan],
+        [1, 2, np.nan],
+        [np.nan, 1, 0, 0, np.nan, 2],
+        [np.nan, 0, 0, 1, 2, 1],
+    ],
 )
 @pytest.mark.parametrize("fill_value", [np.nan, 0, 1])
 def test_unique_na_fill(arr, fill_value):

Original file line number	Diff line number	Diff line change
`@@ -1027,6 +1027,7 @@ Reshaping`
`1027`	`1027`	`Sparse`
`1028`	`1028`	`^^^^^^`
`1029`	`1029`	- Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``SparseDtype`` failing to retain the array's ``fill_value`` (:issue:`45691`)
	`1030`	+- Bug in :meth:`SparseArray.unique` failing to keep the original order of elements (:issue:`47809`)
`1030`	`1031`	`-`
`1031`	`1032`
`1032`	`1033`	`ExtensionArray`