From 9dd8cad6c867095dfe7451f066fc71f78d3f068b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 5 Mar 2020 10:22:24 -0800 Subject: [PATCH 1/2] CLN: use _values_for_argsort for join_non_unique, join_monotonic --- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/period.py | 6 ------ pandas/core/indexes/base.py | 30 +++++++++++++++++++++++------- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 40a169d03f39c..71a3ccfb64024 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1494,7 +1494,7 @@ def check_for_ordered(self, op): ) def _values_for_argsort(self): - return self._codes.copy() + return self._codes def argsort(self, ascending=True, kind="quicksort", **kwargs): """ diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 8141e2c78a7e2..061708239d581 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -453,12 +453,6 @@ def to_timestamp(self, freq=None, how="start"): new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base) return DatetimeArray._from_sequence(new_data, freq="infer") - # -------------------------------------------------------------------- - # Array-like / EA-Interface Methods - - def _values_for_argsort(self): - return self._data - # -------------------------------------------------------------------- def _time_shift(self, periods, freq=None): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6f44b5abf5b04..1cd772a72058d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3385,6 +3385,7 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) ------- join_index, (left_indexer, right_indexer) """ + other = ensure_index(other) self_is_mi = isinstance(self, ABCMultiIndex) other_is_mi = isinstance(other, ABCMultiIndex) @@ -3404,8 +3405,6 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) other, level, how=how, return_indexers=return_indexers ) - other = ensure_index(other) - if len(other) == 0 and how in ("left", "outer"): join_index = self._shallow_copy() if return_indexers: @@ -3567,16 +3566,26 @@ def _join_multi(self, other, how, return_indexers=True): def _join_non_unique(self, other, how="left", return_indexers=False): from pandas.core.reshape.merge import _get_join_indexers + # We only get here if dtypes match + assert self.dtype == other.dtype + + if is_extension_array_dtype(self.dtype): + lvalues = self._data._values_for_argsort() + rvalues = other._data._values_for_argsort() + else: + lvalues = self._values + rvalues = other._values + left_idx, right_idx = _get_join_indexers( - [self._ndarray_values], [other._ndarray_values], how=how, sort=True + [lvalues], [rvalues], how=how, sort=True ) left_idx = ensure_platform_int(left_idx) right_idx = ensure_platform_int(right_idx) - join_index = np.asarray(self._ndarray_values.take(left_idx)) + join_index = np.asarray(lvalues.take(left_idx)) mask = left_idx == -1 - np.putmask(join_index, mask, other._ndarray_values.take(right_idx)) + np.putmask(join_index, mask, rvalues.take(right_idx)) join_index = self._wrap_joined_index(join_index, other) @@ -3727,6 +3736,9 @@ def _get_leaf_sorter(labels): return join_index def _join_monotonic(self, other, how="left", return_indexers=False): + # We only get here with matching dtypes + assert other.dtype == self.dtype + if self.equals(other): ret_index = other if how == "right" else self if return_indexers: @@ -3734,8 +3746,12 @@ def _join_monotonic(self, other, how="left", return_indexers=False): else: return ret_index - sv = self._ndarray_values - ov = other._ndarray_values + if is_extension_array_dtype(self.dtype): + sv = self._data._values_for_argsort() + ov = other._data._values_for_argsort() + else: + sv = self._values + ov = other._values if self.is_unique and other.is_unique: # We can perform much better than the general case From d5485e3c6bb9b9b0e2066394a35451b2d4df85b3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 5 Mar 2020 10:28:00 -0800 Subject: [PATCH 2/2] revert unnecessary --- pandas/core/arrays/period.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 061708239d581..8141e2c78a7e2 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -453,6 +453,12 @@ def to_timestamp(self, freq=None, how="start"): new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base) return DatetimeArray._from_sequence(new_data, freq="infer") + # -------------------------------------------------------------------- + # Array-like / EA-Interface Methods + + def _values_for_argsort(self): + return self._data + # -------------------------------------------------------------------- def _time_shift(self, periods, freq=None):