Skip to content

Commit 819bc9a

Browse files
committed
WIP: add return_inverse to Series/Index as well
1 parent b08dc3d commit 819bc9a

File tree

6 files changed

+116
-54
lines changed

6 files changed

+116
-54
lines changed

pandas/core/algorithms.py

+43-4
Original file line numberDiff line numberDiff line change
@@ -771,7 +771,7 @@ def _value_counts_arraylike(values, dropna):
771771
return keys, counts
772772

773773

774-
def duplicated(values, keep='first'):
774+
def duplicated(values, keep='first', return_inverse=False):
775775
"""
776776
Return boolean ndarray denoting duplicate values.
777777
@@ -786,16 +786,55 @@ def duplicated(values, keep='first'):
786786
occurrence.
787787
- ``last`` : Mark duplicates as ``True`` except for the last
788788
occurrence.
789-
- False : Mark all duplicates as ``True``.
789+
- False : Mark all duplicates as ``True``. This option is not
790+
compatible with ``return_inverse``.
791+
return_inverse : boolean, default False
792+
Determines whether the mapping from unique elements to the original
793+
index should be returned. If True, the output is a tuple.
794+
795+
.. versionadded:: 0.24.0
790796
791797
Returns
792798
-------
793-
duplicated : ndarray
799+
duplicated : ndarray or tuple of ndarray if return_inverse is True
794800
"""
795801

802+
if return_inverse and not keep:
803+
raise ValueError("The parameters return_inverse=True and "
804+
"keep=False cannot be used together (impossible "
805+
"to calculate an inverse when discarding all "
806+
"instances of a duplicate).")
807+
796808
values, dtype, ndtype = _ensure_data(values)
797809
f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
798-
return f(values, keep=keep)
810+
isdup = f(values, keep=keep)
811+
if not return_inverse:
812+
return isdup
813+
elif not isdup.any():
814+
# no need to calculate inverse if no duplicates
815+
inv = np.array(range(len(values)))
816+
return isdup, inv
817+
818+
if keep == 'first':
819+
# o2u: original indices to indices of ARRAY of unique values
820+
# u2o: reduplication from array of unique values to original array
821+
_, o2u, u2o = np.unique(values, return_inverse=True,
822+
return_index=True)
823+
inv = o2u[u2o]
824+
elif keep == 'last':
825+
# np.unique takes first occurrence as unique value,
826+
# so we flip the values so that first becomes last
827+
values = values[::-1]
828+
_, o2u, u2o = np.unique(values, return_inverse=True,
829+
return_index=True)
830+
# the values in the ids-array correspond(ed) to the index of value,
831+
# which is simply np.array(range(len(values))).
832+
# By flipping ids around, we need to do the same for the index,
833+
# ___because o2u and u2o are relative to that order___.
834+
# Finally, to fit with the original order again, we need to flip the
835+
# values around one last time.
836+
inv = np.array(range(len(values)))[::-1][o2u][u2o][::-1]
837+
return isdup, inv
799838

800839

801840
def mode(values, dropna=True):

pandas/core/base.py

+25-4
Original file line numberDiff line numberDiff line change
@@ -1242,16 +1242,37 @@ def drop_duplicates(self, keep='first', inplace=False):
12421242
else:
12431243
return result
12441244

1245-
def duplicated(self, keep='first'):
1245+
def duplicated(self, keep='first', return_inverse=False):
12461246
from pandas.core.algorithms import duplicated
1247+
1248+
if return_inverse and not keep:
1249+
raise ValueError("The parameters return_inverse=True and "
1250+
"keep=False cannot be used together (impossible "
1251+
"to calculate an inverse when discarding all "
1252+
"instances of a duplicate).")
1253+
12471254
if isinstance(self, ABCIndexClass):
12481255
if self.is_unique:
1249-
return np.zeros(len(self), dtype=np.bool)
1250-
return duplicated(self, keep=keep)
1251-
else:
1256+
isdup = np.zeros(len(self), dtype=np.bool)
1257+
if not return_inverse:
1258+
return isdup
1259+
return isdup, np.array(range(len(self)))
1260+
# algorithms.duplicated has the same output signature as
1261+
# Index.duplicated -> no need to distinguish cases here
1262+
return duplicated(self, keep=keep, return_inverse=return_inverse)
1263+
1264+
# Series case
1265+
if not return_inverse:
12521266
return self._constructor(duplicated(self, keep=keep),
12531267
index=self.index).__finalize__(self)
12541268

1269+
isdup_array, inv_array = duplicated(self, keep=keep,
1270+
return_inverse=return_inverse)
1271+
isdup = self._constructor(isdup_array,
1272+
index=self.index).__finalize__(self)
1273+
inv = self._constructor(self.index[inv_array], index=self.index)
1274+
return isdup, inv
1275+
12551276
# ----------------------------------------------------------------------
12561277
# abstracts
12571278

pandas/core/frame.py

+11-28
Original file line numberDiff line numberDiff line change
@@ -4364,7 +4364,7 @@ def duplicated(self, subset=None, keep='first', return_inverse=False):
43644364
compatible with ``return_inverse``.
43654365
return_inverse : boolean, default False
43664366
Determines whether the mapping from unique elements to the original
4367-
index should be returned. If true, the output is a tuple.
4367+
index should be returned. If True, the output is a tuple.
43684368
43694369
.. versionadded:: 0.24.0
43704370
@@ -4373,12 +4373,14 @@ def duplicated(self, subset=None, keep='first', return_inverse=False):
43734373
duplicated : Series or tuple of Series if return_inverse is True
43744374
"""
43754375
from pandas.core.sorting import get_group_index
4376-
from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
4376+
from pandas._libs.hashtable import _SIZE_HINT_LIMIT
4377+
from pandas.core.algorithms import duplicated
43774378

43784379
if return_inverse and not keep:
43794380
raise ValueError("The parameters return_inverse=True and "
43804381
"keep=False cannot be used together (impossible "
4381-
"to calculate an inverse when discarding values)")
4382+
"to calculate an inverse when discarding all "
4383+
"instances of a duplicate).")
43824384

43834385
def f(vals):
43844386
labels, shape = algorithms.factorize(
@@ -4404,32 +4406,13 @@ def f(vals):
44044406
labels, shape = map(list, zip(*map(f, vals)))
44054407

44064408
ids = get_group_index(labels, shape, sort=False, xnull=False)
4407-
isdup = Series(duplicated_int64(ids, keep), index=self.index)
44084409
if not return_inverse:
4409-
return isdup
4410-
elif not isdup.any():
4411-
# no need to calculate inverse if no duplicates
4412-
inv = Series(self.index, index=self.index)
4413-
return isdup, inv
4414-
4415-
if keep == 'first':
4416-
# o2u: original indices to indices of ARRAY of unique values
4417-
# u2o: reduplication from array of unique values to original array
4418-
_, o2u, u2o = np.unique(ids, return_inverse=True,
4419-
return_index=True)
4420-
inv = Series(self.index[o2u][u2o], index=self.index)
4421-
elif keep == 'last':
4422-
# np.unique takes first occurrence as unique value,
4423-
# so we flip ids that first becomes last
4424-
ids = ids[::-1]
4425-
_, o2u, u2o = np.unique(ids, return_inverse=True,
4426-
return_index=True)
4427-
# the values in the ids-array correspond(ed) to self.index -
4428-
# by flipping ids around, we need to do the same for self.index,
4429-
# ___because o2u and u2o are relative to that order___.
4430-
# Finally, to fit with 'index=self.index' in the constructor,
4431-
# we need to flip the values around one last time
4432-
inv = Series(self.index[::-1][o2u][u2o][::-1], index=self.index)
4410+
return Series(duplicated(ids, keep=keep), index=self.index)
4411+
4412+
isdup_array, inv_array = duplicated(ids, keep=keep,
4413+
return_inverse=return_inverse)
4414+
isdup = Series(isdup_array, index=self.index)
4415+
inv = Series(self.index[inv_array], index=self.index)
44334416
return isdup, inv
44344417

44354418
# ----------------------------------------------------------------------

pandas/core/indexes/base.py

+14-7
Original file line numberDiff line numberDiff line change
@@ -4432,7 +4432,7 @@ def drop_duplicates(self, keep='first'):
44324432
"""
44334433
return super(Index, self).drop_duplicates(keep=keep)
44344434

4435-
def duplicated(self, keep='first'):
4435+
def duplicated(self, keep='first', return_inverse=False):
44364436
"""
44374437
Indicate duplicate index values.
44384438
@@ -4449,7 +4449,17 @@ def duplicated(self, keep='first'):
44494449
occurrence.
44504450
- 'last' : Mark duplicates as ``True`` except for the last
44514451
occurrence.
4452-
- ``False`` : Mark all duplicates as ``True``.
4452+
- ``False`` : Mark all duplicates as ``True``. This option is not
4453+
compatible with ``return_inverse``.
4454+
return_inverse : boolean, default False
4455+
Determines whether the mapping from unique elements to the original
4456+
index should be returned. If True, the output is a tuple.
4457+
4458+
.. versionadded:: 0.24.0
4459+
4460+
Returns
4461+
-------
4462+
duplicated : ndarray or tuple of ndarray if return_inverse is True
44534463
44544464
Examples
44554465
--------
@@ -4476,17 +4486,14 @@ def duplicated(self, keep='first'):
44764486
>>> idx.duplicated(keep=False)
44774487
array([ True, False, True, False, True])
44784488
4479-
Returns
4480-
-------
4481-
numpy.ndarray
4482-
44834489
See Also
44844490
--------
44854491
pandas.Series.duplicated : Equivalent method on pandas.Series
44864492
pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame
44874493
pandas.Index.drop_duplicates : Remove duplicate values from Index
44884494
"""
4489-
return super(Index, self).duplicated(keep=keep)
4495+
return super(Index, self).duplicated(keep=keep,
4496+
return_inverse=return_inverse)
44904497

44914498
_index_shared_docs['fillna'] = """
44924499
Fill NA/NaN values with the specified value

pandas/core/indexes/multi.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -930,14 +930,19 @@ def f(k, stringify):
930930
return hash_tuple(key)
931931

932932
@Appender(Index.duplicated.__doc__)
933-
def duplicated(self, keep='first'):
933+
def duplicated(self, keep='first', return_inverse=False):
934934
from pandas.core.sorting import get_group_index
935-
from pandas._libs.hashtable import duplicated_int64
935+
from pandas.core.algorithms import duplicated
936+
937+
if return_inverse and not keep:
938+
raise ValueError("The parameters return_inverse=True and "
939+
"keep=False cannot be used together (impossible "
940+
"to calculate an inverse when discarding all "
941+
"instances of a duplicate).")
936942

937943
shape = map(len, self.levels)
938944
ids = get_group_index(self.labels, shape, sort=False, xnull=False)
939-
940-
return duplicated_int64(ids, keep)
945+
return duplicated(ids, keep=keep, return_inverse=return_inverse)
941946

942947
def fillna(self, value=None, downcast=None):
943948
"""

pandas/core/series.py

+14-7
Original file line numberDiff line numberDiff line change
@@ -1581,7 +1581,7 @@ def drop_duplicates(self, keep='first', inplace=False):
15811581
"""
15821582
return super(Series, self).drop_duplicates(keep=keep, inplace=inplace)
15831583

1584-
def duplicated(self, keep='first'):
1584+
def duplicated(self, keep='first', return_inverse=False):
15851585
"""
15861586
Indicate duplicate Series values.
15871587
@@ -1596,7 +1596,17 @@ def duplicated(self, keep='first'):
15961596
occurrence.
15971597
- 'last' : Mark duplicates as ``True`` except for the last
15981598
occurrence.
1599-
- ``False`` : Mark all duplicates as ``True``.
1599+
- ``False`` : Mark all duplicates as ``True``. This option is not
1600+
compatible with ``return_inverse``.
1601+
return_inverse : boolean, default False
1602+
Determines whether the mapping from unique elements to the original
1603+
index should be returned. If True, the output is a tuple.
1604+
1605+
.. versionadded:: 0.24.0
1606+
1607+
Returns
1608+
-------
1609+
duplicated : Series or tuple of Series if return_inverse is True
16001610
16011611
Examples
16021612
--------
@@ -1643,17 +1653,14 @@ def duplicated(self, keep='first'):
16431653
4 True
16441654
dtype: bool
16451655
1646-
Returns
1647-
-------
1648-
pandas.core.series.Series
1649-
16501656
See Also
16511657
--------
16521658
pandas.Index.duplicated : Equivalent method on pandas.Index
16531659
pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame
16541660
pandas.Series.drop_duplicates : Remove duplicate values from Series
16551661
"""
1656-
return super(Series, self).duplicated(keep=keep)
1662+
return super(Series, self).duplicated(keep=keep,
1663+
return_inverse=return_inverse)
16571664

16581665
def idxmin(self, axis=None, skipna=True, *args, **kwargs):
16591666
"""

0 commit comments

Comments
 (0)