Skip to content

REF: Categorical.is_dtype_equal -> categories_match_up_to_permutation #37545

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Nov 2, 2020
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,7 @@ Deprecations
- :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`)
- Deprecate use of strings denoting units with 'M', 'Y' or 'y' in :func:`~pandas.to_timedelta` (:issue:`36666`)
- :class:`Index` methods ``&``, ``|``, and ``^`` behaving as the set operations :meth:`Index.intersection`, :meth:`Index.union`, and :meth:`Index.symmetric_difference`, respectively, are deprecated and in the future will behave as pointwise boolean operations matching :class:`Series` behavior. Use the named set methods instead (:issue:`36758`)
- :meth:`Categorical.is_dtype_equal` and :meth:`CategoricalIndex.is_dtype_equal` are deprecated, will be removed in a future version (:issue:`37545`)

.. ---------------------------------------------------------------------------

Expand Down
20 changes: 15 additions & 5 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def func(self, other):
# the same (maybe up to ordering, depending on ordered)

msg = "Categoricals can only be compared if 'categories' are the same."
if not self.is_dtype_equal(other):
if not self._categories_match_up_to_permutation(other):
raise TypeError(msg)

if not self.ordered and not self.categories.equals(other.categories):
Expand Down Expand Up @@ -1874,11 +1874,12 @@ def _validate_setitem_value(self, value):

# require identical categories set
if isinstance(value, Categorical):
if not is_dtype_equal(self, value):
if not is_dtype_equal(self.dtype, value.dtype):
raise ValueError(
"Cannot set a Categorical with another, "
"without identical categories"
)
# is_dtype_equal implies categories_match_up_to_permutation
new_codes = self._validate_listlike(value)
value = Categorical.from_codes(new_codes, dtype=self.dtype)

Expand Down Expand Up @@ -2112,7 +2113,7 @@ def equals(self, other: object) -> bool:
"""
if not isinstance(other, Categorical):
return False
elif self.is_dtype_equal(other):
elif self._categories_match_up_to_permutation(other):
other_codes = self._validate_listlike(other)
return np.array_equal(self._codes, other_codes)
return False
Expand All @@ -2125,7 +2126,7 @@ def _concat_same_type(self, to_concat):

# ------------------------------------------------------------------

def is_dtype_equal(self, other):
def _categories_match_up_to_permutation(self, other: "Categorical") -> bool:
"""
Returns True if categoricals are the same dtype
same categories, and same ordered
Expand All @@ -2138,8 +2139,17 @@ def is_dtype_equal(self, other):
-------
bool
"""
return hash(self.dtype) == hash(other.dtype)

def is_dtype_equal(self, other) -> bool:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should deprecate any is_dtype_equal methods on any other Array/Index (not sure if we have any)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's just a CategoricalDtype.is_dtype_equal left. ill be happy to see that go

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

woops, no, got rid of that a while ago

warn(
"Categorical.is_dtype_equal is deprecated and will be removed "
"in a future version",
FutureWarning,
stacklevel=2,
)
try:
return hash(self.dtype) == hash(other.dtype)
return self._categories_match_up_to_permutation(other)
except (AttributeError, TypeError):
return False

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ def _maybe_unwrap(x):
raise TypeError("dtype of categories must be the same")

ordered = False
if all(first.is_dtype_equal(other) for other in to_union[1:]):
if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]):
# identical categories - fastpath
categories = first.categories
ordered = first.ordered
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ def _is_dtype_compat(self, other) -> Categorical:
"""
if is_categorical_dtype(other):
other = extract_array(other)
if not other.is_dtype_equal(self):
if not other._categories_match_up_to_permutation(self):
raise TypeError(
"categories must match existing categories when appending"
)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1083,7 +1083,7 @@ def _maybe_coerce_merge_keys(self):
# if either left or right is a categorical
# then the must match exactly in categories & ordered
if lk_is_cat and rk_is_cat:
if lk.is_dtype_equal(rk):
if lk._categories_match_up_to_permutation(rk):
continue

elif lk_is_cat or rk_is_cat:
Expand Down
45 changes: 28 additions & 17 deletions pandas/tests/arrays/categorical/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,34 +8,45 @@


class TestCategoricalDtypes:
def test_is_equal_dtype(self):
def test_is_dtype_equal_deprecated(self):
# GH#37545
c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False)

with tm.assert_produces_warning(FutureWarning):
c1.is_dtype_equal(c1)

def test_categories_match_up_to_permutation(self):

# test dtype comparisons between cats

c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False)
c2 = Categorical(list("aabca"), categories=list("cab"), ordered=False)
c3 = Categorical(list("aabca"), categories=list("cab"), ordered=True)
assert c1.is_dtype_equal(c1)
assert c2.is_dtype_equal(c2)
assert c3.is_dtype_equal(c3)
assert c1.is_dtype_equal(c2)
assert not c1.is_dtype_equal(c3)
assert not c1.is_dtype_equal(Index(list("aabca")))
assert not c1.is_dtype_equal(c1.astype(object))
assert c1.is_dtype_equal(CategoricalIndex(c1))
assert c1.is_dtype_equal(CategoricalIndex(c1, categories=list("cab")))
assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True))
assert c1._categories_match_up_to_permutation(c1)
assert c2._categories_match_up_to_permutation(c2)
assert c3._categories_match_up_to_permutation(c3)
assert c1._categories_match_up_to_permutation(c2)
assert not c1._categories_match_up_to_permutation(c3)
assert not c1._categories_match_up_to_permutation(Index(list("aabca")))
assert not c1._categories_match_up_to_permutation(c1.astype(object))
assert c1._categories_match_up_to_permutation(CategoricalIndex(c1))
assert c1._categories_match_up_to_permutation(
CategoricalIndex(c1, categories=list("cab"))
)
assert not c1._categories_match_up_to_permutation(
CategoricalIndex(c1, ordered=True)
)

# GH 16659
s1 = Series(c1)
s2 = Series(c2)
s3 = Series(c3)
assert c1.is_dtype_equal(s1)
assert c2.is_dtype_equal(s2)
assert c3.is_dtype_equal(s3)
assert c1.is_dtype_equal(s2)
assert not c1.is_dtype_equal(s3)
assert not c1.is_dtype_equal(s1.astype(object))
assert c1._categories_match_up_to_permutation(s1)
assert c2._categories_match_up_to_permutation(s2)
assert c3._categories_match_up_to_permutation(s3)
assert c1._categories_match_up_to_permutation(s2)
assert not c1._categories_match_up_to_permutation(s3)
assert not c1._categories_match_up_to_permutation(s1.astype(object))

def test_set_dtype_same(self):
c = Categorical(["a", "b", "c"])
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1707,8 +1707,8 @@ def test_other_columns(self, left, right):
tm.assert_series_equal(result, expected)

# categories are preserved
assert left.X.values.is_dtype_equal(merged.X.values)
assert right.Z.values.is_dtype_equal(merged.Z.values)
assert left.X.values._categories_match_up_to_permutation(merged.X.values)
assert right.Z.values._categories_match_up_to_permutation(merged.Z.values)

@pytest.mark.parametrize(
"change",
Expand All @@ -1725,7 +1725,7 @@ def test_dtype_on_merged_different(self, change, join_type, left, right):
X = change(right.X.astype("object"))
right = right.assign(X=X)
assert is_categorical_dtype(left.X.values.dtype)
# assert not left.X.values.is_dtype_equal(right.X.values)
# assert not left.X.values._categories_match_up_to_permutation(right.X.values)

merged = pd.merge(left, right, on="X", how=join_type)

Expand Down