From 0d7a41ba7820ebbe130f5753963376e844cbf54b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 5 Nov 2020 07:16:05 -0800 Subject: [PATCH 1/2] REF: implement Categorical.encode_with_my_categories --- pandas/core/arrays/categorical.py | 30 +++++++++++++++++++++++------- pandas/core/dtypes/concat.py | 2 +- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 626fb495dec03..e5a5718d96cbe 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1694,9 +1694,8 @@ def _validate_listlike(self, target: ArrayLike) -> np.ndarray: # Indexing on codes is more efficient if categories are the same, # so we can apply some optimizations based on the degree of # dtype-matching. - codes = recode_for_categories( - target.codes, target.categories, self.categories, copy=False - ) + cat = self.encode_with_my_categories(target) + codes = cat._codes else: codes = self.categories.get_indexer(target) @@ -1868,8 +1867,8 @@ def _validate_setitem_value(self, value): "without identical categories" ) # is_dtype_equal implies categories_match_up_to_permutation - new_codes = self._validate_listlike(value) - value = Categorical.from_codes(new_codes, dtype=self.dtype) + value = self.encode_with_my_categories(value) + return value._codes # wrap scalars and hashable-listlikes in list rvalue = value if not is_hashable(value) else [value] @@ -2101,8 +2100,8 @@ def equals(self, other: object) -> bool: if not isinstance(other, Categorical): return False elif self._categories_match_up_to_permutation(other): - other_codes = self._validate_listlike(other) - return np.array_equal(self._codes, other_codes) + other = self.encode_with_my_categories(other) + return np.array_equal(self._codes, other._codes) return False @classmethod @@ -2113,6 +2112,23 @@ def _concat_same_type(self, to_concat): # ------------------------------------------------------------------ + def encode_with_my_categories(self, other: "Categorical") -> "Categorical": + """ + Re-encode another categorical using this Categorical's categories. + + Notes + ----- + This assumes we have already checked + self._categories_match_up_to_permutation(other). + """ + # Indexing on codes is more efficient if categories are the same, + # so we can apply some optimizations based on the degree of + # dtype-matching. + codes = recode_for_categories( + other.codes, other.categories, self.categories, copy=False + ) + return self._from_backing_data(codes) + def _categories_match_up_to_permutation(self, other: "Categorical") -> bool: """ Returns True if categoricals are the same dtype diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 99dc01ef421d1..11f8ed342fe2c 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -301,7 +301,7 @@ def _maybe_unwrap(x): categories = first.categories ordered = first.ordered - all_codes = [first._validate_listlike(x) for x in to_union] + all_codes = [first.encode_with_my_categories(x)._codes for x in to_union] new_codes = np.concatenate(all_codes) if sort_categories and not ignore_order and ordered: From 46635294b5daf864ce4e80e2fdffeb2344481b29 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 7 Nov 2020 18:38:07 -0800 Subject: [PATCH 2/2] privatize --- pandas/core/arrays/categorical.py | 8 ++++---- pandas/core/dtypes/concat.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 61d6ceade25b7..87a049c77dc32 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1693,7 +1693,7 @@ def _validate_listlike(self, target: ArrayLike) -> np.ndarray: # Indexing on codes is more efficient if categories are the same, # so we can apply some optimizations based on the degree of # dtype-matching. - cat = self.encode_with_my_categories(target) + cat = self._encode_with_my_categories(target) codes = cat._codes else: codes = self.categories.get_indexer(target) @@ -1866,7 +1866,7 @@ def _validate_setitem_value(self, value): "without identical categories" ) # is_dtype_equal implies categories_match_up_to_permutation - value = self.encode_with_my_categories(value) + value = self._encode_with_my_categories(value) return value._codes # wrap scalars and hashable-listlikes in list @@ -2099,7 +2099,7 @@ def equals(self, other: object) -> bool: if not isinstance(other, Categorical): return False elif self._categories_match_up_to_permutation(other): - other = self.encode_with_my_categories(other) + other = self._encode_with_my_categories(other) return np.array_equal(self._codes, other._codes) return False @@ -2111,7 +2111,7 @@ def _concat_same_type(self, to_concat): # ------------------------------------------------------------------ - def encode_with_my_categories(self, other: "Categorical") -> "Categorical": + def _encode_with_my_categories(self, other: "Categorical") -> "Categorical": """ Re-encode another categorical using this Categorical's categories. diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 11f8ed342fe2c..a38d9cbad0d64 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -301,7 +301,7 @@ def _maybe_unwrap(x): categories = first.categories ordered = first.ordered - all_codes = [first.encode_with_my_categories(x)._codes for x in to_union] + all_codes = [first._encode_with_my_categories(x)._codes for x in to_union] new_codes = np.concatenate(all_codes) if sort_categories and not ignore_order and ordered: