
Commit 2a05487

ENH: duplicated and drop_duplicates now accept keep kw
1 parent 4fde946 commit 2a05487

14 files changed: +406 −66 lines
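
For orientation, here is a minimal sketch of the behaviour this commit adds, mirroring the whatsnew example (output comments assume a build that includes this change):

    import pandas as pd

    s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D'])

    s.duplicated()                  # default keep='first': the 2nd 'A' and 2nd 'B' are True
    s.duplicated(keep='last')       # the 1st 'A' and 1st 'B' are True instead
    s.duplicated(keep=False)        # every 'A' and 'B' is True

    s.drop_duplicates()             # keeps the first 'A' and 'B'
    s.drop_duplicates(keep='last')  # keeps the last 'A' and 'B'
    s.drop_duplicates(keep=False)   # keeps only 'C' and 'D'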

doc/source/indexing.rst (+6 −4)
@@ -1209,17 +1209,19 @@ takes as an argument the columns to use to identify duplicated rows.
 - ``drop_duplicates`` removes duplicate rows.
 
 By default, the first observed row of a duplicate set is considered unique, but
-each method has a ``take_last`` parameter that indicates the last observed row
-should be taken instead.
+each method has a ``keep`` parameter to specify targets.
 
 .. ipython:: python
 
    df2 = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
                     'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
                     'c' : np.random.randn(7)})
    df2.duplicated(['a','b'])
+   df2.duplicated(['a','b'], keep='last')
+   df2.duplicated(['a','b'], keep=False)
    df2.drop_duplicates(['a','b'])
-   df2.drop_duplicates(['a','b'], take_last=True)
+   df2.drop_duplicates(['a','b'], keep='last')
+   df2.drop_duplicates(['a','b'], keep=False)
 
 An alternative way to drop duplicates on the index is ``.groupby(level=0)`` combined with ``first()`` or ``last()``.

@@ -1230,7 +1232,7 @@ An alternative way to drop duplicates on the index is ``.groupby(level=0)`` comb
    df3.groupby(level=0).first()
 
    # a bit more verbose
-   df3.reset_index().drop_duplicates(subset='b', take_last=False).set_index('b')
+   df3.reset_index().drop_duplicates(subset='b', keep='first').set_index('b')
 
 .. _indexing.dictionarylike:
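
Working through the documentation example with the new ``keep=False`` variant: the ('a', 'b') pairs ('one', 'x') and ('two', 'y') each occur twice, so all four of those rows are flagged and dropped. A deterministic sketch (the random 'c' column from the docs is omitted):

    import pandas as pd

    df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
                        'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x']})

    df2.duplicated(['a', 'b'], keep=False)
    # rows 0, 2, 4 and 5 are True: ('one', 'x') and ('two', 'y') each appear twice

    df2.drop_duplicates(['a', 'b'], keep=False)
    # only rows 1 ('one', 'y'), 3 ('three', 'x') and 6 ('six', 'x') remain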

doc/source/whatsnew/v0.17.0.txt (+12)
@@ -26,6 +26,16 @@ New features
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
 
+- ``drop_duplicates`` and ``duplicated`` now accept a ``keep`` keyword to target first, last, and all duplicates. (:issue:`6511`, :issue:`8505`)
+
+  .. ipython:: python
+
+     s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D'])
+     s.drop_duplicates()
+     s.drop_duplicates(keep='last')
+     s.drop_duplicates(keep=False)
+
+
 .. _whatsnew_0170.api:
 
 Backwards incompatible API changes

@@ -45,6 +55,8 @@ Other API Changes
 Deprecations
 ^^^^^^^^^^^^
 
+- ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`6511`, :issue:`8505`)
+
 .. _whatsnew_0170.prior_deprecations:
 
 Removal of prior version deprecations/changes

pandas/core/base.py (+18 −9)
@@ -6,7 +6,7 @@
 from pandas.core import common as com
 import pandas.core.nanops as nanops
 import pandas.lib as lib
-from pandas.util.decorators import Appender, cache_readonly
+from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg
 from pandas.core.strings import StringMethods
 from pandas.core.common import AbstractMethodError
 

@@ -543,18 +543,23 @@ def _dir_deletions(self):
 
         Parameters
         ----------
-        take_last : boolean, default False
-            Take the last observed index in a group. Default first
+
+        keep : {'first', 'last', False}, default 'first'
+            - ``first`` : Drop duplicates except for the first occurrence.
+            - ``last`` : Drop duplicates except for the last occurrence.
+            - False : Drop all duplicates.
+        take_last : deprecated
         %(inplace)s
 
         Returns
         -------
         deduplicated : %(klass)s
         """)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs)
-    def drop_duplicates(self, take_last=False, inplace=False):
-        duplicated = self.duplicated(take_last=take_last)
+    def drop_duplicates(self, keep='first', inplace=False):
+        duplicated = self.duplicated(keep=keep)
         result = self[np.logical_not(duplicated)]
         if inplace:
             return self._update_inplace(result)

@@ -566,18 +571,22 @@ def drop_duplicates(self, take_last=False, inplace=False):
 
         Parameters
         ----------
-        take_last : boolean, default False
-            Take the last observed index in a group. Default first
+        keep : {'first', 'last', False}, default 'first'
+            - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
+            - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
+            - False : Mark all duplicates as ``True``.
+        take_last : deprecated
 
         Returns
         -------
         duplicated : %(duplicated)s
         """)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs)
-    def duplicated(self, take_last=False):
+    def duplicated(self, keep='first'):
         keys = com._ensure_object(self.values)
-        duplicated = lib.duplicated(keys, take_last=take_last)
+        duplicated = lib.duplicated(keys, keep=keep)
         try:
             return self._constructor(duplicated,
                                      index=self.index).__finalize__(self)
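
The ``mapping={True: 'last', False: 'first'}`` argument is what keeps old ``take_last=True/False`` call sites working during the deprecation period. Roughly, a decorator of this kind pops the old keyword, translates its value, warns, and forwards the new keyword; a simplified stand-in (not pandas' actual ``pandas.util.decorators.deprecate_kwarg`` implementation) might look like:

    import warnings
    from functools import wraps

    def deprecate_kwarg_sketch(old_name, new_name, mapping=None):
        # Hypothetical stand-in for pandas.util.decorators.deprecate_kwarg.
        def decorate(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                if old_name in kwargs:
                    old_value = kwargs.pop(old_name)
                    new_value = mapping.get(old_value, old_value) if mapping else old_value
                    warnings.warn('%s is deprecated, use %s instead' % (old_name, new_name),
                                  FutureWarning)
                    kwargs[new_name] = new_value
                return func(*args, **kwargs)
            return wrapper
        return decorate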

pandas/core/frame.py (+18 −9)
@@ -2801,8 +2801,9 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None,
         else:
             return result
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @deprecate_kwarg(old_arg_name='cols', new_arg_name='subset')
-    def drop_duplicates(self, subset=None, take_last=False, inplace=False):
+    def drop_duplicates(self, subset=None, keep='first', inplace=False):
         """
         Return DataFrame with duplicate rows removed, optionally only
         considering certain columns

@@ -2812,8 +2813,11 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False):
         subset : column label or sequence of labels, optional
             Only consider certain columns for identifying duplicates, by
             default use all of the columns
-        take_last : boolean, default False
-            Take the last observed row in a row. Defaults to the first row
+        keep : {'first', 'last', False}, default 'first'
+            - ``first`` : Drop duplicates except for the first occurrence.
+            - ``last`` : Drop duplicates except for the last occurrence.
+            - False : Drop all duplicates.
+        take_last : deprecated
         inplace : boolean, default False
             Whether to drop duplicates in place or to return a copy
         cols : kwargs only argument of subset [deprecated]

@@ -2822,7 +2826,7 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False):
         -------
         deduplicated : DataFrame
         """
-        duplicated = self.duplicated(subset, take_last=take_last)
+        duplicated = self.duplicated(subset, keep=keep)
 
         if inplace:
             inds, = (-duplicated).nonzero()

@@ -2831,8 +2835,9 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False):
         else:
             return self[-duplicated]
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @deprecate_kwarg(old_arg_name='cols', new_arg_name='subset')
-    def duplicated(self, subset=None, take_last=False):
+    def duplicated(self, subset=None, keep='first'):
         """
         Return boolean Series denoting duplicate rows, optionally only
         considering certain columns

@@ -2842,9 +2847,13 @@ def duplicated(self, subset=None, take_last=False):
         subset : column label or sequence of labels, optional
             Only consider certain columns for identifying duplicates, by
             default use all of the columns
-        take_last : boolean, default False
-            For a set of distinct duplicate rows, flag all but the last row as
-            duplicated. Default is for all but the first row to be flagged
+        keep : {'first', 'last', False}, default 'first'
+            - ``first`` : Mark duplicates as ``True`` except for the
+              first occurrence.
+            - ``last`` : Mark duplicates as ``True`` except for the
+              last occurrence.
+            - False : Mark all duplicates as ``True``.
+        take_last : deprecated
         cols : kwargs only argument of subset [deprecated]
 
         Returns

@@ -2870,7 +2879,7 @@ def f(vals):
         labels, shape = map(list, zip( * map(f, vals)))
 
         ids = get_group_index(labels, shape, sort=False, xnull=False)
-        return Series(duplicated_int64(ids, take_last), index=self.index)
+        return Series(duplicated_int64(ids, keep), index=self.index)
 
     #----------------------------------------------------------------------
     # Sorting
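
As the last hunk shows, ``DataFrame.duplicated`` does not hash whole rows directly: it factorizes each considered column, combines the per-column codes into one int64 id per row via ``get_group_index``, and passes those ids plus ``keep`` to ``duplicated_int64``. A hedged pure-pandas approximation of the same idea (the helper name is illustrative, not an internal API):

    import pandas as pd

    def duplicated_rows_sketch(df, subset=None, keep='first'):
        # Reduce each row (over the `subset` list of column labels) to one
        # hashable key, then reuse Series.duplicated with the same `keep`
        # semantics on those keys.
        cols = subset if subset is not None else list(df.columns)
        keys = pd.Series(list(zip(*(df[c] for c in cols))), index=df.index)
        return keys.duplicated(keep=keep)

    df = pd.DataFrame({'a': ['x', 'x', 'y'], 'b': [1, 1, 2]})
    duplicated_rows_sketch(df, keep=False)   # True, True, False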

pandas/core/index.py (+13 −9)
@@ -16,7 +16,7 @@
 from pandas.lib import Timestamp, Timedelta, is_datetime_array
 from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate
 from pandas.util.decorators import (Appender, Substitution, cache_readonly,
-                                    deprecate)
+                                    deprecate, deprecate_kwarg)
 import pandas.core.common as com
 from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype,
                                 _values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype,

@@ -2571,14 +2571,16 @@ def drop(self, labels, errors='raise'):
         indexer = indexer[~mask]
         return self.delete(indexer)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs)
-    def drop_duplicates(self, take_last=False):
-        result = super(Index, self).drop_duplicates(take_last=take_last)
+    def drop_duplicates(self, keep='first'):
+        result = super(Index, self).drop_duplicates(keep=keep)
         return self._constructor(result)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
-    def duplicated(self, take_last=False):
-        return super(Index, self).duplicated(take_last=take_last)
+    def duplicated(self, keep='first'):
+        return super(Index, self).duplicated(keep=keep)
 
 
     def _evaluate_with_timedelta_like(self, other, op, opstr):

@@ -2997,10 +2999,11 @@ def _engine(self):
     def is_unique(self):
         return not self.duplicated().any()
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
-    def duplicated(self, take_last=False):
+    def duplicated(self, keep='first'):
         from pandas.hashtable import duplicated_int64
-        return duplicated_int64(self.codes.astype('i8'), take_last)
+        return duplicated_int64(self.codes.astype('i8'), keep)
 
     def get_loc(self, key, method=None):
         """

@@ -4147,15 +4150,16 @@ def _has_complex_internals(self):
     def is_unique(self):
         return not self.duplicated().any()
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
-    def duplicated(self, take_last=False):
+    def duplicated(self, keep='first'):
         from pandas.core.groupby import get_group_index
         from pandas.hashtable import duplicated_int64
 
         shape = map(len, self.levels)
         ids = get_group_index(self.labels, shape, sort=False, xnull=False)
 
-        return duplicated_int64(ids, take_last)
+        return duplicated_int64(ids, keep)
 
     def get_value(self, series, key):
         # somewhat broken encapsulation
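
``Index``, ``CategoricalIndex`` and ``MultiIndex`` gain the same keyword here, so duplicate detection on axis labels follows the same rules as on values. A small usage sketch, assuming this change is applied:

    import pandas as pd

    idx = pd.Index(['a', 'b', 'a', 'c', 'b'])

    idx.duplicated()                 # marks the 2nd 'a' and the 2nd 'b'
    idx.duplicated(keep='last')      # marks the 1st 'a' and the 1st 'b'
    idx.duplicated(keep=False)       # marks every 'a' and 'b'
    idx.drop_duplicates(keep=False)  # only 'c' survives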

pandas/core/series.py (+7 −6)
@@ -44,7 +44,7 @@
 import pandas.core.datetools as datetools
 import pandas.core.format as fmt
 import pandas.core.nanops as nanops
-from pandas.util.decorators import Appender, cache_readonly
+from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg
 
 import pandas.lib as lib
 import pandas.tslib as tslib

@@ -1137,14 +1137,15 @@ def mode(self):
         from pandas.core.algorithms import mode
         return mode(self)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)
-    def drop_duplicates(self, take_last=False, inplace=False):
-        return super(Series, self).drop_duplicates(take_last=take_last,
-                                                   inplace=inplace)
+    def drop_duplicates(self, keep='first', inplace=False):
+        return super(Series, self).drop_duplicates(keep=keep, inplace=inplace)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(base._shared_docs['duplicated'] % _shared_doc_kwargs)
-    def duplicated(self, take_last=False):
-        return super(Series, self).duplicated(take_last=take_last)
+    def duplicated(self, keep='first'):
+        return super(Series, self).duplicated(keep=keep)
 
     def idxmin(self, axis=None, out=None, skipna=True):
         """

pandas/hashtable.pyx (+17 −3)
@@ -1051,23 +1051,37 @@ def mode_int64(ndarray[int64_t] values):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
+def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'):
     cdef:
         int ret = 0
         Py_ssize_t i, n = len(values)
         kh_int64_t * table = kh_init_int64()
         ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
 
+        # for keep=False, remember where each value was first seen
+        dict seen = dict()
+
     kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT))
 
-    if take_last:
+    if keep == 'last':
         for i from n > i >= 0:
             kh_put_int64(table, values[i], &ret)
             out[i] = ret == 0
-    else:
+    elif keep == 'first':
         for i from 0 <= i < n:
             kh_put_int64(table, values[i], &ret)
             out[i] = ret == 0
+    elif keep is False:
+        for i from 0 <= i < n:
+            row = values[i]
+            if row in seen:
+                out[i] = 1
+                out[seen[row]] = 1
+            else:
+                seen[row] = i
+                out[i] = 0
+    else:
+        raise ValueError('keep must be either "first", "last" or False')
 
     kh_destroy_int64(table)
     return out
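
The new ``keep=False`` branch is the reason a ``dict`` of first-seen positions appears: when a value repeats, both the current position and the position where it was first seen must be flagged. A plain-Python sketch of that algorithm:

    def duplicated_keep_false_sketch(values):
        # Flag every member of a duplicate group, including its first occurrence.
        seen = {}                      # value -> index of first occurrence
        out = [False] * len(values)
        for i, row in enumerate(values):
            if row in seen:
                out[i] = True
                out[seen[row]] = True  # retroactively flag the first occurrence
            else:
                seen[row] = i
        return out

    duplicated_keep_false_sketch(['A', 'B', 'C', 'A', 'B', 'D'])
    # [True, True, False, True, True, False]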

pandas/lib.pyx (+19 −7)
@@ -1292,35 +1292,47 @@ def fast_zip_fillna(list ndarrays, fill_value=pandas_null):
 
     return result
 
-def duplicated(ndarray[object] values, take_last=False):
+
+def duplicated(ndarray[object] values, object keep='first'):
     cdef:
         Py_ssize_t i, n
-        set seen = set()
+        dict seen = dict()
         object row
 
     n = len(values)
     cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)
 
-    if take_last:
+    if keep == 'last':
         for i from n > i >= 0:
             row = values[i]
-
             if row in seen:
                 result[i] = 1
             else:
-                seen.add(row)
+                seen[row] = i
                 result[i] = 0
-    else:
+    elif keep == 'first':
         for i from 0 <= i < n:
             row = values[i]
             if row in seen:
                 result[i] = 1
             else:
-                seen.add(row)
+                seen[row] = i
                 result[i] = 0
+    elif keep is False:
+        for i from 0 <= i < n:
+            row = values[i]
+            if row in seen:
+                result[i] = 1
+                result[seen[row]] = 1
+            else:
+                seen[row] = i
+                result[i] = 0
+    else:
+        raise ValueError('keep must be either "first", "last" or False')
 
     return result.view(np.bool_)
 
+
 def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
     cdef:
         Py_ssize_t i, group_size, n, start
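
Both the object-dtype path here and the int64 path in hashtable.pyx end in the same guard, so an unrecognized ``keep`` value fails loudly instead of silently being treated as a boolean. A quick check of the expected behaviour, assuming this change is applied:

    import pandas as pd

    s = pd.Series(['A', 'A', 'B'])
    try:
        s.duplicated(keep='all')   # not a supported value
    except ValueError as err:
        print(err)                 # keep must be either "first", "last" or False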
