Skip to content

Commit 80f5881

Browse files
committed
Add tests/asv for Series/(Multi)Index; refactor
1 parent 2e2a14d commit 80f5881

File tree

15 files changed

+510
-126
lines changed

15 files changed

+510
-126
lines changed

asv_bench/benchmarks/frame_methods.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -412,10 +412,13 @@ def time_frame_nunique(self):
412412
class Duplicated(object):
413413

414414
goal_time = 0.2
415-
params = (['first', 'last'], [True, False])
415+
params = (['first', 'last', False], [True, False])
416416
param_names = ['keep', 'return_inverse']
417417

418418
def setup(self, keep, return_inverse):
419+
if keep is False and return_inverse:
420+
raise NotImplementedError
421+
419422
n = (1 << 20)
420423
t = date_range('2015-01-01', freq='S', periods=(n // 64))
421424
xs = np.random.randn(n // 64).round(2)

asv_bench/benchmarks/index_object.py

+18
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,24 @@ def time_modulo(self, dtype):
8484
self.index % 2
8585

8686

87+
class Duplicated(object):
88+
89+
goal_time = 0.2
90+
params = (['first', 'last', False], [True, False])
91+
param_names = ['keep', 'return_inverse']
92+
93+
def setup(self, keep, return_inverse):
94+
if keep is False and return_inverse:
95+
raise NotImplementedError
96+
97+
n, k = 200, 1000
98+
base = tm.makeStringIndex(n)
99+
self.idx = Index(base[np.random.choice(n, k * n)])
100+
101+
def time_duplicated(self, keep, return_inverse):
102+
self.idx.duplicated(keep=keep, return_inverse=return_inverse)
103+
104+
87105
class Range(object):
88106

89107
goal_time = 0.2

asv_bench/benchmarks/multiindex_object.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -83,17 +83,22 @@ def time_is_monotonic(self):
8383
class Duplicated(object):
8484

8585
goal_time = 0.2
86+
params = (['first', 'last', False], [True, False])
87+
param_names = ['keep', 'return_inverse']
8688

87-
def setup(self):
88-
n, k = 200, 5000
89+
def setup(self, keep, return_inverse):
90+
if keep is False and return_inverse:
91+
raise NotImplementedError
92+
93+
n, k = 200, 1000
8994
levels = [np.arange(n),
9095
tm.makeStringIndex(n).values,
9196
1000 + np.arange(n)]
9297
labels = [np.random.choice(n, (k * n)) for lev in levels]
9398
self.mi = MultiIndex(levels=levels, labels=labels)
9499

95-
def time_duplicated(self):
96-
self.mi.duplicated()
100+
def time_duplicated(self, keep, return_inverse):
101+
self.mi.duplicated(keep=keep, return_inverse=return_inverse)
97102

98103

99104
class Sortlevel(object):

asv_bench/benchmarks/series_methods.py

+18
Original file line numberDiff line numberDiff line change
@@ -134,3 +134,21 @@ def setup(self):
134134

135135
def time_series_datetimeindex_repr(self):
136136
getattr(self.s, 'a', None)
137+
138+
139+
class Duplicated(object):
140+
141+
goal_time = 0.2
142+
params = (['first', 'last', False], [True, False])
143+
param_names = ['keep', 'return_inverse']
144+
145+
def setup(self, keep, return_inverse):
146+
if keep is False and return_inverse:
147+
raise NotImplementedError
148+
149+
n, k = 200, 1000
150+
base = tm.makeStringIndex(n)
151+
self.s = Series(base[np.random.choice(n, k * n)])
152+
153+
def time_series_duplicated(self, keep, return_inverse):
154+
self.s.duplicated(keep=keep, return_inverse=return_inverse)

doc/source/whatsnew/v0.24.0.txt

+46-25
Original file line numberDiff line numberDiff line change
@@ -8,50 +8,71 @@ v0.24.0
88
New features
99
~~~~~~~~~~~~
1010

11+
.. _whatsnew_0240.enhancements.extension_array_operators:
12+
13+
``ExtensionArray`` operator support
14+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
15+
16+
A ``Series`` based on an ``ExtensionArray`` now supports arithmetic and comparison
17+
operators. (:issue:`19577`). There are two approaches for providing operator support for an ``ExtensionArray``:
18+
19+
1. Define each of the operators on your ``ExtensionArray`` subclass.
20+
2. Use an operator implementation from pandas that depends on operators that are already defined
21+
on the underlying elements (scalars) of the ``ExtensionArray``.
22+
23+
See the :ref:`ExtensionArray Operator Support
24+
<extending.extension.operator>` documentation section for details on both
25+
ways of adding operator support.
26+
1127
.. _whatsnew_0240.enhancements.duplicated_inverse:
1228

13-
``DataFrame.duplicated`` has gained the ``return_inverse`` kwarg
14-
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
29+
The ``duplicated``-method has gained the ``return_inverse`` kwarg
30+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
31+
32+
The ``duplicated``-method for ``Series``, ``DataFrame`` and all flavours of ``Index`` has gained a ``return_inverse`` keyword,
33+
which is False by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple)
34+
that allows reconstructing the original object from the deduplicated, unique subset.
1535

16-
The ``duplicated``-method has gained a ``return_inverse`` keyword. Specifying ``return_inverse=True`` will change the output from a single Series
17-
to a tuple of two Series, where the second Series contains the mapping from the indices of the deduplicated, unique subset back to the original index:
36+
For ``Index`` objects, the inverse is an ``np.ndarray``:
1837

1938
.. ipython:: python
2039

21-
df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
22-
index=[1, 4, 9, 16, 25])
23-
df
24-
isduplicate, inverse = df.duplicated(return_inverse=True) # default: keep='first'
40+
idx = pd.Index(['a', 'b', 'b', 'c', 'a'])
41+
idx.has_duplicates
42+
isduplicate, inverse = idx.duplicated(return_inverse=True) # default: keep='first'
2543
isduplicate
2644
inverse
2745

28-
This allows to reconstruct the original DataFrame as follows:
46+
This allows reconstructing the original ``Index`` as follows:
2947

3048
.. ipython:: python
3149

32-
unique = df.loc[~isduplicate] # same as df.drop_duplicates()
50+
unique = idx[~isduplicate] # same as idx.drop_duplicates()
3351
unique
34-
reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
35-
reconstruct.equals(df)
3652

37-
The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible
38-
to construct an inverse).
53+
reconstruct = unique[inverse]
54+
reconstruct.equals(idx)
3955

40-
.. _whatsnew_0240.enhancements.extension_array_operators
56+
For ``DataFrame`` and ``Series`` the inverse needs to take into account the original index as well, and is therefore a ``Series``,
57+
which contains the mapping from the index of the deduplicated, unique subset back to the original index.
4158

42-
``ExtensionArray`` operator support
43-
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
59+
.. ipython:: python
4460

45-
A ``Series`` based on an ``ExtensionArray`` now supports arithmetic and comparison
46-
operators. (:issue:`19577`). There are two approaches for providing operator support for an ``ExtensionArray``:
61+
df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
62+
index=[1, 4, 9, 16, 25])
63+
df
64+
isduplicate, inverse = df.duplicated(keep='last', return_inverse=True)
65+
isduplicate
66+
inverse
4767

48-
1. Define each of the operators on your ``ExtensionArray`` subclass.
49-
2. Use an operator implementation from pandas that depends on operators that are already defined
50-
on the underlying elements (scalars) of the ``ExtensionArray``.
68+
unique = df.loc[~isduplicate] # same as df.drop_duplicates(keep='last')
69+
unique
70+
reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
71+
reconstruct.equals(df)
72+
73+
The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible
74+
to construct an inverse).
5175

52-
See the :ref:`ExtensionArray Operator Support
53-
<extending.extension.operator>` documentation section for details on both
54-
ways of adding operator support.
5576

5677
.. _whatsnew_0240.enhancements.other:
5778

pandas/core/algorithms.py

+50-20
Original file line numberDiff line numberDiff line change
@@ -771,7 +771,8 @@ def _value_counts_arraylike(values, dropna):
771771
return keys, counts
772772

773773

774-
def duplicated(values, keep='first', return_inverse=False):
774+
def duplicated(values, keep='first', return_index=False, return_inverse=False,
775+
stabilize=True):
775776
"""
776777
Return boolean ndarray denoting duplicate values.
777778
@@ -788,15 +789,31 @@ def duplicated(values, keep='first', return_inverse=False):
788789
occurrence.
789790
- False : Mark all duplicates as ``True``. This option is not
790791
compatible with ``return_inverse``.
792+
return_index : boolean, default False
793+
If True, also return the (array of) integer indices for the unique
794+
elements within values.
795+
796+
.. versionadded:: 0.24.0
791797
return_inverse : boolean, default False
792-
Determines whether the mapping from unique elements to the original
793-
index should be returned. If True, the output is a tuple.
798+
If True, also return the indices of the unique array that can be used
799+
to reconstruct values.
800+
801+
.. versionadded:: 0.24.0
802+
stabilize : boolean, default True
803+
This keyword is only relevant if index and/or inverse are returned. If
804+
True (the default), it will be ensured that index and inverse fit to
805+
the order of `values`. In case that index and inverse are not needed
806+
separately, but combined right away, this sorting process is
807+
unnecessary and can be disabled for improved performance by setting
808+
`stabilize=False`.
794809
795810
.. versionadded:: 0.24.0
796811
797812
Returns
798813
-------
799-
duplicated : ndarray or or tuple of ndarray if return_inverse is True
814+
duplicated : ndarray or tuple of ndarray
815+
np.ndarray if both `return_index` and `return_inverse` are False.
816+
Otherwise, tuple of ndarray.
800817
"""
801818

802819
if return_inverse and not keep:
@@ -808,33 +825,46 @@ def duplicated(values, keep='first', return_inverse=False):
808825
values, dtype, ndtype = _ensure_data(values)
809826
f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
810827
isdup = f(values, keep=keep)
811-
if not return_inverse:
828+
if not (return_index or return_inverse):
812829
return isdup
813830
elif not isdup.any():
814831
# no need to calculate inverse if no duplicates
815832
inv = np.array(range(len(values)))
816-
return isdup, inv
833+
return (isdup,) + (inv,) * return_index + (inv,) * return_inverse
817834

818835
if keep == 'first':
819-
# o2u: original indices to indices of ARRAY of unique values
820-
# u2o: reduplication from array of unique values to original array
821-
_, o2u, u2o = np.unique(values, return_inverse=True,
822-
return_index=True)
823-
inv = o2u[u2o]
836+
# ind: original indices to indices of ARRAY of unique values
837+
# inv: reduplication from array of unique values to original array
838+
# this fits together in the way that values[ind] are the unique values
839+
# and values[ind][inv] == values
840+
_, ind, inv = np.unique(values, return_index=True,
841+
return_inverse=True)
824842
elif keep == 'last':
825843
# np.unique takes first occurrence as unique value,
826-
# so we flip ids that first becomes last
844+
# so we flip values that first becomes last
827845
values = values[::-1]
828-
_, o2u, u2o = np.unique(values, return_inverse=True,
829-
return_index=True)
830-
# the values in the ids-array correspond(ed) to the index of value,
846+
_, ind, inv = np.unique(values, return_index=True,
847+
return_inverse=True)
848+
# the values in "values" correspond(ed) to the index of "values",
831849
# which is simply np.array(range(len(values))).
832-
# By flipping ids around, we need to do the same for the index,
833-
# ___because o2u and u2o are relative to that order___.
850+
# By flipping "values" around, we need to do the same for the index,
851+
# ___because ind and inv are relative to that order___.
834852
# Finally, to fit with the original order again, we need to flip the
835-
# values around one last time.
836-
inv = np.array(range(len(values)))[::-1][o2u][u2o][::-1]
837-
return isdup, inv
853+
# result around one last time.
854+
ind, inv = np.array(range(len(values)))[::-1][ind], inv[::-1]
855+
856+
if stabilize:
857+
# np.unique yields a __sorted__ list of uniques, and the index/inverse
858+
# are relative to this order. To restore the original order, we argsort
859+
# the returned index (corresponding to the mapping from values to
860+
# sorted, which is the wrong way around for us), and invert this
861+
# mapping once more (corresponding to the mapping from sorted back to
862+
# values), which is again done by argsorting.
863+
undo_sort = np.argsort(np.argsort(ind))
864+
ind, inv = ind[undo_sort], undo_sort[inv]
865+
866+
res = (isdup,) + (ind,) * return_index + (inv,) * return_inverse
867+
return res
838868

839869

840870
def mode(values, dropna=True):

pandas/core/base.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -1266,11 +1266,15 @@ def duplicated(self, keep='first', return_inverse=False):
12661266
return self._constructor(duplicated(self, keep=keep),
12671267
index=self.index).__finalize__(self)
12681268

1269-
isdup_array, inv_array = duplicated(self, keep=keep,
1270-
return_inverse=return_inverse)
1269+
# return_inverse = True
1270+
isdup_array, ind_array, inv_array = duplicated(self, keep=keep,
1271+
return_index=True,
1272+
return_inverse=True,
1273+
stabilize=False)
12711274
isdup = self._constructor(isdup_array,
12721275
index=self.index).__finalize__(self)
1273-
inv = self._constructor(self.index[inv_array], index=self.index)
1276+
inv = self._constructor(self.index[ind_array[inv_array]],
1277+
index=self.index)
12741278
return isdup, inv
12751279

12761280
# ----------------------------------------------------------------------

pandas/core/frame.py

+9-5
Original file line numberDiff line numberDiff line change
@@ -4363,8 +4363,9 @@ def duplicated(self, subset=None, keep='first', return_inverse=False):
43634363
- False : Mark all duplicates as ``True``. This option is not
43644364
compatible with ``return_inverse``.
43654365
return_inverse : boolean, default False
4366-
Determines whether the mapping from unique elements to the original
4367-
index should be returned. If True, the output is a tuple.
4366+
Determines whether the mapping from (the index of) unique elements
4367+
to the original index should be returned. If True, the output is
4368+
a tuple.
43684369
43694370
.. versionadded:: 0.24.0
43704371
@@ -4409,10 +4410,13 @@ def f(vals):
44094410
if not return_inverse:
44104411
return Series(duplicated(ids, keep=keep), index=self.index)
44114412

4412-
isdup_array, inv_array = duplicated(ids, keep=keep,
4413-
return_inverse=return_inverse)
4413+
# return_inverse = True
4414+
isdup_array, ind_array, inv_array = duplicated(ids, keep=keep,
4415+
return_index=True,
4416+
return_inverse=True,
4417+
stabilize=False)
44144418
isdup = Series(isdup_array, index=self.index)
4415-
inv = Series(self.index[inv_array], index=self.index)
4419+
inv = Series(self.index[ind_array[inv_array]], index=self.index)
44164420
return isdup, inv
44174421

44184422
# ----------------------------------------------------------------------

pandas/core/indexes/base.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -4456,8 +4456,9 @@ def duplicated(self, keep='first', return_inverse=False):
44564456
- ``False`` : Mark all duplicates as ``True``. This option is not
44574457
compatible with ``return_inverse``.
44584458
return_inverse : boolean, default False
4459-
Determines whether the mapping from unique elements to the original
4460-
index should be returned. If True, the output is a tuple.
4459+
Determines whether the mapping from (the index of) unique elements
4460+
to the original index should be returned. If True, the output is
4461+
a tuple.
44614462
44624463
.. versionadded:: 0.24.0
44634464

pandas/core/indexes/category.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -392,10 +392,10 @@ def unique(self, level=None):
392392
ordered=result.ordered)
393393

394394
@Appender(Index.duplicated.__doc__)
395-
def duplicated(self, keep='first'):
396-
from pandas._libs.hashtable import duplicated_int64
395+
def duplicated(self, keep='first', return_inverse=False):
396+
from pandas.core.algorithms import duplicated
397397
codes = self.codes.astype('i8')
398-
return duplicated_int64(codes, keep)
398+
return duplicated(codes, keep=keep, return_inverse=return_inverse)
399399

400400
def _to_safe_for_reshape(self):
401401
""" convert to object if we are a categorical """

pandas/core/series.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -1601,8 +1601,9 @@ def duplicated(self, keep='first', return_inverse=False):
16011601
- ``False`` : Mark all duplicates as ``True``. This option is not
16021602
compatible with ``return_inverse``.
16031603
return_inverse : boolean, default False
1604-
Determines whether the mapping from unique elements to the original
1605-
index should be returned. If True, the output is a tuple.
1604+
Determines whether the mapping from (the index of) unique elements
1605+
to the original index should be returned. If True, the output is
1606+
a tuple.
16061607
16071608
.. versionadded:: 0.24.0
16081609

0 commit comments

Comments
 (0)