Skip to content

Commit 2d8871f

Browse files
authored
REF: implement Categorical.encode_with_my_categories (#37650)
* REF: implement Categorical.encode_with_my_categories
* privatize
1 parent 19d6a61 commit 2d8871f

File tree

2 files changed

+24
-8
lines changed

2 files changed

+24
-8
lines changed

pandas/core/arrays/categorical.py

+23-7
Original file line number | Diff line number | Diff line change
@@ -1693,9 +1693,8 @@ def _validate_listlike(self, target: ArrayLike) -> np.ndarray:
16931693
# Indexing on codes is more efficient if categories are the same,
16941694
# so we can apply some optimizations based on the degree of
16951695
# dtype-matching.
1696-
codes = recode_for_categories(
1697-
target.codes, target.categories, self.categories, copy=False
1698-
)
1696+
cat = self._encode_with_my_categories(target)
1697+
codes = cat._codes
16991698
else:
17001699
codes = self.categories.get_indexer(target)
17011700

@@ -1867,8 +1866,8 @@ def _validate_setitem_value(self, value):
18671866
"without identical categories"
18681867
)
18691868
# is_dtype_equal implies categories_match_up_to_permutation
1870-
new_codes = self._validate_listlike(value)
1871-
value = Categorical.from_codes(new_codes, dtype=self.dtype)
1869+
value = self._encode_with_my_categories(value)
1870+
return value._codes
18721871

18731872
# wrap scalars and hashable-listlikes in list
18741873
rvalue = value if not is_hashable(value) else [value]
@@ -2100,8 +2099,8 @@ def equals(self, other: object) -> bool:
21002099
if not isinstance(other, Categorical):
21012100
return False
21022101
elif self._categories_match_up_to_permutation(other):
2103-
other_codes = self._validate_listlike(other)
2104-
return np.array_equal(self._codes, other_codes)
2102+
other = self._encode_with_my_categories(other)
2103+
return np.array_equal(self._codes, other._codes)
21052104
return False
21062105

21072106
@classmethod
@@ -2112,6 +2111,23 @@ def _concat_same_type(self, to_concat):
21122111

21132112
# ------------------------------------------------------------------
21142113

2114+
def _encode_with_my_categories(self, other: "Categorical") -> "Categorical":
2115+
"""
2116+
Re-encode another categorical using this Categorical's categories.
2117+
2118+
Notes
2119+
-----
2120+
This assumes we have already checked
2121+
self._categories_match_up_to_permutation(other).
2122+
"""
2123+
# Indexing on codes is more efficient if categories are the same,
2124+
# so we can apply some optimizations based on the degree of
2125+
# dtype-matching.
2126+
codes = recode_for_categories(
2127+
other.codes, other.categories, self.categories, copy=False
2128+
)
2129+
return self._from_backing_data(codes)
2130+
21152131
def _categories_match_up_to_permutation(self, other: "Categorical") -> bool:
21162132
"""
21172133
Returns True if categoricals are the same dtype

pandas/core/dtypes/concat.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -301,7 +301,7 @@ def _maybe_unwrap(x):
301301
categories = first.categories
302302
ordered = first.ordered
303303

304-
all_codes = [first._validate_listlike(x) for x in to_union]
304+
all_codes = [first._encode_with_my_categories(x)._codes for x in to_union]
305305
new_codes = np.concatenate(all_codes)
306306

307307
if sort_categories and not ignore_order and ordered:

0 commit comments

Comments
 (0)