WIP: add return_inverse to Series/Index as well

h-vetinari · h-vetinari · commit 2e2a14d219e0 · 2018-07-04T18:42:54.000+02:00
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -771,7 +771,7 @@ def _value_counts_arraylike(values, dropna):
     return keys, counts
 
 
-def duplicated(values, keep='first'):
+def duplicated(values, keep='first', return_inverse=False):
     """
     Return boolean ndarray denoting duplicate values.
 
@@ -786,16 +786,55 @@ def duplicated(values, keep='first'):
           occurrence.
         - ``last`` : Mark duplicates as ``True`` except for the last
           occurrence.
-        - False : Mark all duplicates as ``True``.
+        - False : Mark all duplicates as ``True``. This option is not
+          compatible with ``return_inverse``.
+    return_inverse : boolean, default False
+        Determines whether the mapping from unique elements to the original
+        index should be returned. If True, the output is a tuple.
+
+        .. versionadded:: 0.24.0
 
     Returns
     -------
-    duplicated : ndarray
+    duplicated : ndarray or tuple of ndarray if return_inverse is True
     """
 
+    if return_inverse and not keep:
+        raise ValueError("The parameters return_inverse=True and "
+                         "keep=False cannot be used together (impossible "
+                         "to calculate an inverse when discarding all "
+                         "instances of a duplicate).")
+
     values, dtype, ndtype = _ensure_data(values)
     f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
-    return f(values, keep=keep)
+    isdup = f(values, keep=keep)
+    if not return_inverse:
+        return isdup
+    elif not isdup.any():
+        # no need to calculate inverse if no duplicates
+        inv = np.array(range(len(values)))
+        return isdup, inv
+
+    if keep == 'first':
+        # o2u: original indices to indices of ARRAY of unique values
+        # u2o: reduplication from array of unique values to original array
+        _, o2u, u2o = np.unique(values, return_inverse=True,
+                                return_index=True)
+        inv = o2u[u2o]
+    elif keep == 'last':
+        # np.unique takes the first occurrence as the unique value,
+        # so we reverse values such that the last occurrence comes first
+        values = values[::-1]
+        _, o2u, u2o = np.unique(values, return_inverse=True,
+                                return_index=True)
+        # Each value originally sat at its own position in
+        # np.array(range(len(values))). Since values was reversed above,
+        # that positional index must be reversed as well,
+        # ___because o2u and u2o are relative to the reversed order___.
+        # Finally, to fit with the original order again, we need to flip the
+        # result around one last time.
+        inv = np.array(range(len(values)))[::-1][o2u][u2o][::-1]
+    return isdup, inv
 
 
 def mode(values, dropna=True):
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -1242,16 +1242,37 @@ def drop_duplicates(self, keep='first', inplace=False):
         else:
             return result
 
-    def duplicated(self, keep='first'):
+    def duplicated(self, keep='first', return_inverse=False):
         from pandas.core.algorithms import duplicated
+
+        if return_inverse and not keep:
+            raise ValueError("The parameters return_inverse=True and "
+                             "keep=False cannot be used together (impossible "
+                             "to calculate an inverse when discarding all "
+                             "instances of a duplicate).")
+
         if isinstance(self, ABCIndexClass):
             if self.is_unique:
-                return np.zeros(len(self), dtype=np.bool)
-            return duplicated(self, keep=keep)
-        else:
+                isdup = np.zeros(len(self), dtype=np.bool)
+                if not return_inverse:
+                    return isdup
+                return isdup, np.array(range(len(self)))
+            # algorithms.duplicated has the same output signature as
+            # Index.duplicated -> no need to distinguish cases here
+            return duplicated(self, keep=keep, return_inverse=return_inverse)
+
+        # Series case
+        if not return_inverse:
             return self._constructor(duplicated(self, keep=keep),
                                      index=self.index).__finalize__(self)
 
+        isdup_array, inv_array = duplicated(self, keep=keep,
+                                            return_inverse=return_inverse)
+        isdup = self._constructor(isdup_array,
+                                  index=self.index).__finalize__(self)
+        inv = self._constructor(self.index[inv_array], index=self.index)
+        return isdup, inv
+
     # ----------------------------------------------------------------------
     # abstracts
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4364,7 +4364,7 @@ def duplicated(self, subset=None, keep='first', return_inverse=False):
               compatible with ``return_inverse``.
         return_inverse : boolean, default False
             Determines whether the mapping from unique elements to the original
-            index should be returned. If true, the output is a tuple.
+            index should be returned. If True, the output is a tuple.
 
             .. versionadded:: 0.24.0
 
@@ -4373,12 +4373,14 @@ def duplicated(self, subset=None, keep='first', return_inverse=False):
         duplicated : Series or tuple of Series if return_inverse is True
         """
         from pandas.core.sorting import get_group_index
-        from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
+        from pandas._libs.hashtable import _SIZE_HINT_LIMIT
+        from pandas.core.algorithms import duplicated
 
         if return_inverse and not keep:
             raise ValueError("The parameters return_inverse=True and "
                              "keep=False cannot be used together (impossible "
-                             "to calculate an inverse when discarding values)")
+                             "to calculate an inverse when discarding all "
+                             "instances of a duplicate).")
 
         def f(vals):
             labels, shape = algorithms.factorize(
@@ -4404,32 +4406,13 @@ def f(vals):
         labels, shape = map(list, zip(*map(f, vals)))
 
         ids = get_group_index(labels, shape, sort=False, xnull=False)
-        isdup = Series(duplicated_int64(ids, keep), index=self.index)
         if not return_inverse:
-            return isdup
-        elif not isdup.any():
-            # no need to calculate inverse if no duplicates
-            inv = Series(self.index, index=self.index)
-            return isdup, inv
-
-        if keep == 'first':
-            # o2u: original indices to indices of ARRAY of unique values
-            # u2o: reduplication from array of unique values to original array
-            _, o2u, u2o = np.unique(ids, return_inverse=True,
-                                    return_index=True)
-            inv = Series(self.index[o2u][u2o], index=self.index)
-        elif keep == 'last':
-            # np.unique takes first occurrence as unique value,
-            # so we flip ids that first becomes last
-            ids = ids[::-1]
-            _, o2u, u2o = np.unique(ids, return_inverse=True,
-                                    return_index=True)
-            # the values in the ids-array correspond(ed) to self.index -
-            # by flipping ids around, we need to do the same for self.index,
-            # ___because o2u and u2o are relative to that order___.
-            # Finally, to fit with 'index=self.index' in the constructor,
-            # we need to flip the values around one last time
-            inv = Series(self.index[::-1][o2u][u2o][::-1], index=self.index)
+            return Series(duplicated(ids, keep=keep), index=self.index)
+
+        isdup_array, inv_array = duplicated(ids, keep=keep,
+                                            return_inverse=return_inverse)
+        isdup = Series(isdup_array, index=self.index)
+        inv = Series(self.index[inv_array], index=self.index)
         return isdup, inv
 
     # ----------------------------------------------------------------------
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -4436,7 +4436,7 @@ def drop_duplicates(self, keep='first'):
         """
         return super(Index, self).drop_duplicates(keep=keep)
 
-    def duplicated(self, keep='first'):
+    def duplicated(self, keep='first', return_inverse=False):
         """
         Indicate duplicate index values.
 
@@ -4453,7 +4453,17 @@ def duplicated(self, keep='first'):
               occurrence.
             - 'last' : Mark duplicates as ``True`` except for the last
               occurrence.
-            - ``False`` : Mark all duplicates as ``True``.
+            - ``False`` : Mark all duplicates as ``True``. This option is not
+              compatible with ``return_inverse``.
+        return_inverse : boolean, default False
+            Determines whether the mapping from unique elements to the original
+            index should be returned. If True, the output is a tuple.
+
+            .. versionadded:: 0.24.0
+
+        Returns
+        -------
+        duplicated : ndarray or tuple of ndarray if return_inverse is True
 
         Examples
         --------
@@ -4480,17 +4490,14 @@ def duplicated(self, keep='first'):
         >>> idx.duplicated(keep=False)
         array([ True, False,  True, False,  True])
 
-        Returns
-        -------
-        numpy.ndarray
-
         See Also
         --------
         pandas.Series.duplicated : Equivalent method on pandas.Series
         pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame
         pandas.Index.drop_duplicates : Remove duplicate values from Index
         """
-        return super(Index, self).duplicated(keep=keep)
+        return super(Index, self).duplicated(keep=keep,
+                                             return_inverse=return_inverse)
 
     _index_shared_docs['fillna'] = """
         Fill NA/NaN values with the specified value
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -916,14 +916,19 @@ def f(k, stringify):
         return hash_tuple(key)
 
     @Appender(Index.duplicated.__doc__)
-    def duplicated(self, keep='first'):
+    def duplicated(self, keep='first', return_inverse=False):
         from pandas.core.sorting import get_group_index
-        from pandas._libs.hashtable import duplicated_int64
+        from pandas.core.algorithms import duplicated
+
+        if return_inverse and not keep:
+            raise ValueError("The parameters return_inverse=True and "
+                             "keep=False cannot be used together (impossible "
+                             "to calculate an inverse when discarding all "
+                             "instances of a duplicate).")
 
         shape = map(len, self.levels)
         ids = get_group_index(self.labels, shape, sort=False, xnull=False)
-
-        return duplicated_int64(ids, keep)
+        return duplicated(ids, keep=keep, return_inverse=return_inverse)
 
     def fillna(self, value=None, downcast=None):
         """
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1583,7 +1583,7 @@ def drop_duplicates(self, keep='first', inplace=False):
         """
         return super(Series, self).drop_duplicates(keep=keep, inplace=inplace)
 
-    def duplicated(self, keep='first'):
+    def duplicated(self, keep='first', return_inverse=False):
         """
         Indicate duplicate Series values.
 
@@ -1598,7 +1598,17 @@ def duplicated(self, keep='first'):
               occurrence.
             - 'last' : Mark duplicates as ``True`` except for the last
               occurrence.
-            - ``False`` : Mark all duplicates as ``True``.
+            - ``False`` : Mark all duplicates as ``True``. This option is not
+              compatible with ``return_inverse``.
+        return_inverse : boolean, default False
+            Determines whether the mapping from unique elements to the original
+            index should be returned. If True, the output is a tuple.
+
+            .. versionadded:: 0.24.0
+
+        Returns
+        -------
+        duplicated : Series or tuple of Series if return_inverse is True
 
         Examples
         --------
@@ -1645,17 +1655,14 @@ def duplicated(self, keep='first'):
         4     True
         dtype: bool
 
-        Returns
-        -------
-        pandas.core.series.Series
-
         See Also
         --------
         pandas.Index.duplicated : Equivalent method on pandas.Index
         pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame
         pandas.Series.drop_duplicates : Remove duplicate values from Series
         """
-        return super(Series, self).duplicated(keep=keep)
+        return super(Series, self).duplicated(keep=keep,
+                                              return_inverse=return_inverse)
 
     def idxmin(self, axis=None, skipna=True, *args, **kwargs):
         """