Skip to content

Commit 3d94f7a

Browse files
authored
BUG: fix SparseArray.unique IndexError and _first_fill_value_loc algo (#47810)
* Update array.py * Update test_array.py * Update array.py * fix format * Update v1.5.0.rst * fix number
1 parent bedd8f0 commit 3d94f7a

File tree

3 files changed

+40
-16
lines changed

3 files changed

+40
-16
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1027,6 +1027,7 @@ Reshaping
10271027
Sparse
10281028
^^^^^^
10291029
- Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``SparseDtype`` failing to retain the array's ``fill_value`` (:issue:`45691`)
1030+
- Bug in :meth:`SparseArray.unique` failing to keep the original order of elements (:issue:`47809`)
10301031
-
10311032

10321033
ExtensionArray

pandas/core/arrays/sparse/array.py

+16-6
Original file line numberDiff line numberDiff line change
@@ -821,7 +821,7 @@ def shift(self: SparseArrayT, periods: int = 1, fill_value=None) -> SparseArrayT
821821

822822
def _first_fill_value_loc(self):
823823
"""
824-
Get the location of the first missing value.
824+
Get the location of the first fill value.
825825
826826
Returns
827827
-------
@@ -834,14 +834,24 @@ def _first_fill_value_loc(self):
834834
if not len(indices) or indices[0] > 0:
835835
return 0
836836

837-
diff = indices[1:] - indices[:-1]
838-
return np.searchsorted(diff, 2) + 1
837+
# a number larger than 1 is appended to the end
838+
# so that the case where the fill value only appears
839+
# in the tail of the array is also handled
840+
diff = np.r_[np.diff(indices), 2]
841+
return indices[(diff > 1).argmax()] + 1
839842

840843
def unique(self: SparseArrayT) -> SparseArrayT:
841844
uniques = algos.unique(self.sp_values)
842-
fill_loc = self._first_fill_value_loc()
843-
if fill_loc >= 0:
844-
uniques = np.insert(uniques, fill_loc, self.fill_value)
845+
if len(self.sp_values) != len(self):
846+
fill_loc = self._first_fill_value_loc()
847+
# In order to align with the behavior of pd.unique or
848+
# pd.Series.unique, we should keep the original
849+
# order, here we use unique again to find the
850+
# insertion place. Since the length of sp_values
851+
# is not large, a minor performance cost
852+
# is worthwhile for the sake of correctness.
853+
insert_loc = len(algos.unique(self.sp_values[:fill_loc]))
854+
uniques = np.insert(uniques, insert_loc, self.fill_value)
845855
return type(self)._from_sequence(uniques, dtype=self.dtype)
846856

847857
def _values_for_factorize(self):

pandas/tests/arrays/sparse/test_array.py

+23-10
Original file line numberDiff line numberDiff line change
@@ -391,23 +391,36 @@ def test_setting_fill_value_updates():
391391

392392

393393
@pytest.mark.parametrize(
394-
"arr, loc",
394+
"arr,fill_value,loc",
395395
[
396-
([None, 1, 2], 0),
397-
([0, None, 2], 1),
398-
([0, 1, None], 2),
399-
([0, 1, 1, None, None], 3),
400-
([1, 1, 1, 2], -1),
401-
([], -1),
396+
([None, 1, 2], None, 0),
397+
([0, None, 2], None, 1),
398+
([0, 1, None], None, 2),
399+
([0, 1, 1, None, None], None, 3),
400+
([1, 1, 1, 2], None, -1),
401+
([], None, -1),
402+
([None, 1, 0, 0, None, 2], None, 0),
403+
([None, 1, 0, 0, None, 2], 1, 1),
404+
([None, 1, 0, 0, None, 2], 2, 5),
405+
([None, 1, 0, 0, None, 2], 3, -1),
406+
([None, 0, 0, 1, 2, 1], 0, 1),
407+
([None, 0, 0, 1, 2, 1], 1, 3),
402408
],
403409
)
404-
def test_first_fill_value_loc(arr, loc):
405-
result = SparseArray(arr)._first_fill_value_loc()
410+
def test_first_fill_value_loc(arr, fill_value, loc):
411+
result = SparseArray(arr, fill_value=fill_value)._first_fill_value_loc()
406412
assert result == loc
407413

408414

409415
@pytest.mark.parametrize(
410-
"arr", [[1, 2, np.nan, np.nan], [1, np.nan, 2, np.nan], [1, 2, np.nan]]
416+
"arr",
417+
[
418+
[1, 2, np.nan, np.nan],
419+
[1, np.nan, 2, np.nan],
420+
[1, 2, np.nan],
421+
[np.nan, 1, 0, 0, np.nan, 2],
422+
[np.nan, 0, 0, 1, 2, 1],
423+
],
411424
)
412425
@pytest.mark.parametrize("fill_value", [np.nan, 0, 1])
413426
def test_unique_na_fill(arr, fill_value):

0 commit comments

Comments
 (0)