@@ -771,7 +771,8 @@ def _value_counts_arraylike(values, dropna):
771
771
return keys , counts
772
772
773
773
774
- def duplicated (values , keep = 'first' , return_inverse = False ):
774
+ def duplicated (values , keep = 'first' , return_index = False , return_inverse = False ,
775
+ stabilize = True ):
775
776
"""
776
777
Return boolean ndarray denoting duplicate values.
777
778
@@ -788,15 +789,31 @@ def duplicated(values, keep='first', return_inverse=False):
788
789
occurrence.
789
790
- False : Mark all duplicates as ``True``. This option is not
790
791
compatible with ``return_inverse``.
792
+ return_index : boolean, default False
793
+ If True, also return the (array of) integer indices for the unique
794
+ elements within values.
795
+
796
+ .. versionadded:: 0.24.0
791
797
return_inverse : boolean, default False
792
- Determines whether the mapping from unique elements to the original
793
- index should be returned. If True, the output is a tuple.
798
+ If True, also return the indices of the unique array that can be used
799
+ to reconstruct values.
800
+
801
+ .. versionadded:: 0.24.0
802
+ stabilize : boolean, default True
803
+ This keyword is only relevant if index and/or inverse are returned. If
804
+ True (the default), it will be ensured that index and inverse fit to
805
+ the order of `values`. In case that index and inverse are not needed
806
+ separately, but combined right away, this sorting process is
807
+ unnecessary and can be disabled for improved performance by setting
808
+ `stabilize=False`.
794
809
795
810
.. versionadded:: 0.24.0
796
811
797
812
Returns
798
813
-------
799
- duplicated : ndarray or or tuple of ndarray if return_inverse is True
814
+ duplicated : ndarray or tuple of ndarray
815
+ np.ndarray if both `return_index` and `return_inverse` are False.
816
+ Otherwise, tuple of ndarray.
800
817
"""
801
818
802
819
if return_inverse and not keep :
@@ -808,33 +825,46 @@ def duplicated(values, keep='first', return_inverse=False):
808
825
values , dtype , ndtype = _ensure_data (values )
809
826
f = getattr (htable , "duplicated_{dtype}" .format (dtype = ndtype ))
810
827
isdup = f (values , keep = keep )
811
- if not return_inverse :
828
+ if not ( return_index or return_inverse ) :
812
829
return isdup
813
830
elif not isdup .any ():
814
831
# no need to calculate inverse if no duplicates
815
832
inv = np .array (range (len (values )))
816
- return isdup , inv
833
+ return ( isdup ,) + ( inv ,) * return_index + ( inv ,) * return_inverse
817
834
818
835
if keep == 'first' :
819
- # o2u: original indices to indices of ARRAY of unique values
820
- # u2o: reduplication from array of unique values to original array
821
- _ , o2u , u2o = np .unique (values , return_inverse = True ,
822
- return_index = True )
823
- inv = o2u [u2o ]
836
+ # ind: original indices to indices of ARRAY of unique values
837
+ # inv: reduplication from array of unique values to original array
838
+ # this fits together in the way that values[ind] are the unique values
839
+ # and values[ind][inv] == values
840
+ _ , ind , inv = np .unique (values , return_index = True ,
841
+ return_inverse = True )
824
842
elif keep == 'last' :
825
843
# np.unique takes first occurrence as unique value,
826
- # so we flip ids that first becomes last
844
+ # so we flip "values" such that the last occurrence becomes the first
827
845
values = values [::- 1 ]
828
- _ , o2u , u2o = np .unique (values , return_inverse = True ,
829
- return_index = True )
830
- # the values in the ids-array correspond(ed) to the index of value ,
846
+ _ , ind , inv = np .unique (values , return_index = True ,
847
+ return_inverse = True )
848
+ # the positions of the entries in "values" correspond(ed) to the index of "values",
831
849
# which is simply np.array(range(len(values))).
832
- # By flipping ids around, we need to do the same for the index,
833
- # ___because o2u and u2o are relative to that order___.
850
+ # By flipping "values" around, we need to do the same for the index,
851
+ # ___because ind and inv are relative to that order___.
834
852
# Finally, to fit with the original order again, we need to flip the
835
- # values around one last time.
836
- inv = np .array (range (len (values )))[::- 1 ][o2u ][u2o ][::- 1 ]
837
- return isdup , inv
853
+ # result around one last time.
854
+ ind , inv = np .array (range (len (values )))[::- 1 ][ind ], inv [::- 1 ]
855
+
856
+ if stabilize :
857
+ # np.unique yields a __sorted__ list of uniques, and the index/inverse
858
+ # are relative to this order. To restore the original order, we argsort
859
+ # the returned index (corresponding to the mapping from values to
860
+ # sorted, which is the wrong way around for us), and invert this
861
+ # mapping once more (corresponding to the mapping from sorted back to
862
+ # values), which is again done by argsorting.
863
+ undo_sort = np .argsort (np .argsort (ind ))
864
+ ind , inv = ind [undo_sort ], undo_sort [inv ]
865
+
866
+ res = (isdup ,) + (ind ,) * return_index + (inv ,) * return_inverse
867
+ return res
838
868
839
869
840
870
def mode (values , dropna = True ):
0 commit comments