Skip to content

ENH: duplicated and drop_duplicates now accept keep kw #10236

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 8, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions doc/source/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1178,17 +1178,19 @@ takes as an argument the columns to use to identify duplicated rows.
- ``drop_duplicates`` removes duplicate rows.

By default, the first observed row of a duplicate set is considered unique, but
each method has a ``take_last`` parameter that indicates the last observed row
should be taken instead.
each method has a ``keep`` parameter to specify which of the duplicates (if any) to keep.

.. ipython:: python
df2 = pd.DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
'c' : np.random.randn(7)})
df2.duplicated(['a','b'])
df2.duplicated(['a','b'], keep='last')
df2.duplicated(['a','b'], keep=False)
df2.drop_duplicates(['a','b'])
df2.drop_duplicates(['a','b'], take_last=True)
df2.drop_duplicates(['a','b'], keep='last')
df2.drop_duplicates(['a','b'], keep=False)
An alternative way to drop duplicates on the index is ``.groupby(level=0)`` combined with ``first()`` or ``last()``.

Expand All @@ -1199,7 +1201,7 @@ An alternative way to drop duplicates on the index is ``.groupby(level=0)`` comb
df3.groupby(level=0).first()
# a bit more verbose
df3.reset_index().drop_duplicates(subset='b', take_last=False).set_index('b')
df3.reset_index().drop_duplicates(subset='b', keep='first').set_index('b')
.. _indexing.dictionarylike:

Expand Down
10 changes: 10 additions & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,15 @@ Other enhancements
- ``pd.merge`` will now allow duplicate column names if they are not merged upon (:issue:`10639`).

- ``pd.pivot`` will now allow passing index as ``None`` (:issue:`3962`).
- ``drop_duplicates`` and ``duplicated`` now accept a ``keep`` keyword to target the first, last, or all duplicates. The ``take_last`` keyword is deprecated, see :ref:`deprecations <whatsnew_0170.deprecations>` (:issue:`6511`, :issue:`8505`)

.. ipython :: python

s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D'])
s.drop_duplicates()
s.drop_duplicates(keep='last')
s.drop_duplicates(keep=False)


.. _whatsnew_0170.api:

Expand Down Expand Up @@ -520,6 +529,7 @@ Deprecations
===================== =================================

- ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`).
- ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`6511`, :issue:`8505`)

.. _whatsnew_0170.prior_deprecations:

Expand Down
27 changes: 18 additions & 9 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pandas.core import common as com
import pandas.core.nanops as nanops
import pandas.lib as lib
from pandas.util.decorators import Appender, cache_readonly
from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg
from pandas.core.strings import StringMethods
from pandas.core.common import AbstractMethodError

Expand Down Expand Up @@ -543,18 +543,23 @@ def _dir_deletions(self):
Parameters
----------
take_last : boolean, default False
Take the last observed index in a group. Default first
keep : {'first', 'last', False}, default 'first'
- ``first`` : Drop duplicates except for the first occurrence.
- ``last`` : Drop duplicates except for the last occurrence.
- False : Drop all duplicates.
take_last : deprecated
%(inplace)s
Returns
-------
deduplicated : %(klass)s
""")

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs)
def drop_duplicates(self, take_last=False, inplace=False):
duplicated = self.duplicated(take_last=take_last)
def drop_duplicates(self, keep='first', inplace=False):
duplicated = self.duplicated(keep=keep)
result = self[np.logical_not(duplicated)]
if inplace:
return self._update_inplace(result)
Expand All @@ -566,18 +571,22 @@ def drop_duplicates(self, take_last=False, inplace=False):
Parameters
----------
take_last : boolean, default False
Take the last observed index in a group. Default first
keep : {'first', 'last', False}, default 'first'
- ``first`` : Mark duplicates as ``True`` except for the first occurrence.
- ``last`` : Mark duplicates as ``True`` except for the last occurrence.
- False : Mark all duplicates as ``True``.
take_last : deprecated
Returns
-------
duplicated : %(duplicated)s
""")

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs)
def duplicated(self, take_last=False):
def duplicated(self, keep='first'):
keys = com._ensure_object(self.values)
duplicated = lib.duplicated(keys, take_last=take_last)
duplicated = lib.duplicated(keys, keep=keep)
try:
return self._constructor(duplicated,
index=self.index).__finalize__(self)
Expand Down
27 changes: 18 additions & 9 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2866,8 +2866,9 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None,
else:
return result

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@deprecate_kwarg(old_arg_name='cols', new_arg_name='subset')
def drop_duplicates(self, subset=None, take_last=False, inplace=False):
def drop_duplicates(self, subset=None, keep='first', inplace=False):
"""
Return DataFrame with duplicate rows removed, optionally only
considering certain columns
Expand All @@ -2877,8 +2878,11 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False):
subset : column label or sequence of labels, optional
Only consider certain columns for identifying duplicates, by
default use all of the columns
take_last : boolean, default False
Take the last observed row in a row. Defaults to the first row
keep : {'first', 'last', False}, default 'first'
- ``first`` : Drop duplicates except for the first occurrence.
- ``last`` : Drop duplicates except for the last occurrence.
- False : Drop all duplicates.
take_last : deprecated
inplace : boolean, default False
Whether to drop duplicates in place or to return a copy
cols : kwargs only argument of subset [deprecated]
Expand All @@ -2887,7 +2891,7 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False):
-------
deduplicated : DataFrame
"""
duplicated = self.duplicated(subset, take_last=take_last)
duplicated = self.duplicated(subset, keep=keep)

if inplace:
inds, = (-duplicated).nonzero()
Expand All @@ -2896,8 +2900,9 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False):
else:
return self[-duplicated]

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@deprecate_kwarg(old_arg_name='cols', new_arg_name='subset')
def duplicated(self, subset=None, take_last=False):
def duplicated(self, subset=None, keep='first'):
"""
Return boolean Series denoting duplicate rows, optionally only
considering certain columns
Expand All @@ -2907,9 +2912,13 @@ def duplicated(self, subset=None, take_last=False):
subset : column label or sequence of labels, optional
Only consider certain columns for identifying duplicates, by
default use all of the columns
take_last : boolean, default False
For a set of distinct duplicate rows, flag all but the last row as
duplicated. Default is for all but the first row to be flagged
keep : {'first', 'last', False}, default 'first'
- ``first`` : Mark duplicates as ``True`` except for the
first occurrence.
- ``last`` : Mark duplicates as ``True`` except for the
last occurrence.
- False : Mark all duplicates as ``True``.
take_last : deprecated
cols : kwargs only argument of subset [deprecated]
Returns
Expand All @@ -2935,7 +2944,7 @@ def f(vals):
labels, shape = map(list, zip( * map(f, vals)))

ids = get_group_index(labels, shape, sort=False, xnull=False)
return Series(duplicated_int64(ids, take_last), index=self.index)
return Series(duplicated_int64(ids, keep), index=self.index)

#----------------------------------------------------------------------
# Sorting
Expand Down
22 changes: 13 additions & 9 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from pandas.lib import Timestamp, Timedelta, is_datetime_array
from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate
from pandas.util.decorators import (Appender, Substitution, cache_readonly,
deprecate)
deprecate, deprecate_kwarg)
import pandas.core.common as com
from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype,
_values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype,
Expand Down Expand Up @@ -2623,13 +2623,15 @@ def drop(self, labels, errors='raise'):
indexer = indexer[~mask]
return self.delete(indexer)

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs)
def drop_duplicates(self, take_last=False):
return super(Index, self).drop_duplicates(take_last=take_last)
def drop_duplicates(self, keep='first'):
return super(Index, self).drop_duplicates(keep=keep)

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
def duplicated(self, take_last=False):
return super(Index, self).duplicated(take_last=take_last)
def duplicated(self, keep='first'):
return super(Index, self).duplicated(keep=keep)

def _evaluate_with_timedelta_like(self, other, op, opstr):
raise TypeError("can only perform ops with timedelta like values")
Expand Down Expand Up @@ -3056,10 +3058,11 @@ def _engine(self):
def is_unique(self):
return not self.duplicated().any()

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
def duplicated(self, take_last=False):
def duplicated(self, keep='first'):
from pandas.hashtable import duplicated_int64
return duplicated_int64(self.codes.astype('i8'), take_last)
return duplicated_int64(self.codes.astype('i8'), keep)

def get_loc(self, key, method=None):
"""
Expand Down Expand Up @@ -4219,15 +4222,16 @@ def _has_complex_internals(self):
def is_unique(self):
return not self.duplicated().any()

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
def duplicated(self, take_last=False):
def duplicated(self, keep='first'):
from pandas.core.groupby import get_group_index
from pandas.hashtable import duplicated_int64

shape = map(len, self.levels)
ids = get_group_index(self.labels, shape, sort=False, xnull=False)

return duplicated_int64(ids, take_last)
return duplicated_int64(ids, keep)

def get_value(self, series, key):
# somewhat broken encapsulation
Expand Down
13 changes: 7 additions & 6 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
import pandas.core.datetools as datetools
import pandas.core.format as fmt
import pandas.core.nanops as nanops
from pandas.util.decorators import Appender, cache_readonly
from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg

import pandas.lib as lib
import pandas.tslib as tslib
Expand Down Expand Up @@ -1155,14 +1155,15 @@ def mode(self):
from pandas.core.algorithms import mode
return mode(self)

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)
def drop_duplicates(self, take_last=False, inplace=False):
return super(Series, self).drop_duplicates(take_last=take_last,
inplace=inplace)
def drop_duplicates(self, keep='first', inplace=False):
return super(Series, self).drop_duplicates(keep=keep, inplace=inplace)

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(base._shared_docs['duplicated'] % _shared_doc_kwargs)
def duplicated(self, take_last=False):
return super(Series, self).duplicated(take_last=take_last)
def duplicated(self, keep='first'):
return super(Series, self).duplicated(keep=keep)

def idxmin(self, axis=None, out=None, skipna=True):
"""
Expand Down
28 changes: 22 additions & 6 deletions pandas/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1026,25 +1026,41 @@ def mode_int64(int64_t[:] values):

@cython.wraparound(False)
@cython.boundscheck(False)
def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'):
cdef:
int ret = 0
int ret = 0, value, k
Py_ssize_t i, n = len(values)
kh_int64_t * table = kh_init_int64()
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')

kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT))

with nogil:
if take_last:
if keep not in ('last', 'first', False):
raise ValueError('keep must be either "first", "last" or False')

if keep == 'last':
with nogil:
for i from n > i >=0:
kh_put_int64(table, values[i], &ret)
out[i] = ret == 0
else:
elif keep == 'first':
with nogil:
for i from 0 <= i < n:
kh_put_int64(table, values[i], &ret)
out[i] = ret == 0

else:
with nogil:
for i from 0 <= i < n:
value = values[i]
k = kh_get_int64(table, value)
if k != table.n_buckets:
out[table.vals[k]] = 1
out[i] = 1
else:
k = kh_put_int64(table, value, &ret)
table.keys[k] = value
table.vals[k] = i
out[i] = 0
kh_destroy_int64(table)
return out

Expand Down
26 changes: 19 additions & 7 deletions pandas/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1348,35 +1348,47 @@ def fast_zip_fillna(list ndarrays, fill_value=pandas_null):

return result

def duplicated(ndarray[object] values, take_last=False):

def duplicated(ndarray[object] values, object keep='first'):
cdef:
Py_ssize_t i, n
set seen = set()
dict seen = dict()
object row

n = len(values)
cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)

if take_last:
if keep == 'last':
for i from n > i >= 0:
row = values[i]

if row in seen:
result[i] = 1
else:
seen.add(row)
seen[row] = i
result[i] = 0
else:
elif keep == 'first':
for i from 0 <= i < n:
row = values[i]
if row in seen:
result[i] = 1
else:
seen.add(row)
seen[row] = i
result[i] = 0
elif keep is False:
for i from 0 <= i < n:
row = values[i]
if row in seen:
result[i] = 1
result[seen[row]] = 1
else:
seen[row] = i
result[i] = 0
else:
raise ValueError('keep must be either "first", "last" or False')

return result.view(np.bool_)


def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
cdef:
Py_ssize_t i, group_size, n, start
Expand Down
Loading