Skip to content

Commit 6f633c7

Browse files
committed
Allow for na values that are of same type as the data
1 parent f21890e commit 6f633c7

File tree

7 files changed

+61
-4
lines changed

7 files changed

+61
-4
lines changed

doc/source/whatsnew/v1.2.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,7 @@ Deprecations
484484
- Deprecated :meth:`Index.asi8` for :class:`Index` subclasses other than :class:`.DatetimeIndex`, :class:`.TimedeltaIndex`, and :class:`PeriodIndex` (:issue:`37877`)
485485
- The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`)
486486
- The ``null_counts`` parameter of :meth:`DataFrame.info` is deprecated and replaced by ``show_counts``. It will be removed in a future version (:issue:`37999`)
487+
- Deprecated membership checks (``in``) for NA-like values in :class:`Categorical`. In a future version, the check will return ``True`` only if the value is ``nan`` or is of the same dtype as the underlying categories (:issue:`37867`)
487488

488489
.. ---------------------------------------------------------------------------
489490

pandas/core/arrays/base.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -362,10 +362,13 @@ def __contains__(self, item) -> bool:
362362
# GH37867
363363
# comparisons of any item to pd.NA always return pd.NA, so e.g. "a" in [pd.NA]
364364
# would raise a TypeError. The implementation below works around that.
365-
if item is self.dtype.na_value:
366-
return self.isna().any() if self._can_hold_na else False
367-
elif is_scalar(item) and isna(item):
368-
return False
365+
if is_scalar(item) and isna(item):
366+
if not self._can_hold_na:
367+
return False
368+
elif item is self.dtype.na_value or isinstance(item, self.dtype.type):
369+
return self.isna().any()
370+
else:
371+
return False
369372
else:
370373
return (item == self).any()
371374

pandas/core/arrays/categorical.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1745,8 +1745,16 @@ def __contains__(self, key) -> bool:
17451745
"""
17461746
Returns True if `key` is in this Categorical.
17471747
"""
1748+
# in pandas 2.0, remove this method.
1749+
17481750
# if key is a NaN, check if any NaN is in self.
17491751
if is_valid_nat_for_dtype(key, self.categories.dtype):
1752+
if key is not self.dtype.na_value and not isinstance(key, self.dtype.type):
1753+
warn(f"Membership check with {key} will return False in the future. "
1754+
f"Consider using {self.dtype.na_value} instead",
1755+
FutureWarning,
1756+
stacklevel=2,
1757+
)
17501758
return self.isna().any()
17511759

17521760
return contains(self, key, container=self._codes)

pandas/core/arrays/numpy_.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,13 @@ def numpy_dtype(self) -> np.dtype:
5151
"""
5252
return self._dtype
5353

54+
@property
55+
def na_value(self) -> object:
56+
if issubclass(self.type, np.floating):
57+
return self.type("nan")
58+
else:
59+
return super().na_value
60+
5461
@property
5562
def name(self) -> str:
5663
"""

pandas/tests/extension/arrow/test_bool.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ def test_view(self, data):
5050
# __setitem__ does not work, so we only have a smoke-test
5151
data.view()
5252

53+
@pytest.mark.xfail(raises=AssertionError, reason="Not implemented yet")
54+
def test_contains(self, data, data_missing):
55+
super().test_contains(data, data_missing)
56+
5357

5458
class TestConstructors(BaseArrowTests, base.BaseConstructorsTests):
5559
def test_from_dtype(self, data):

pandas/tests/extension/decimal/array.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,14 @@ def __setitem__(self, key, value):
155155
def __len__(self) -> int:
156156
return len(self._data)
157157

158+
def __contains__(self, item) -> bool:
159+
if not isinstance(item, decimal.Decimal):
160+
return False
161+
elif item.is_nan():
162+
return self.isna().any()
163+
else:
164+
return super().__contains__(item)
165+
158166
@property
159167
def nbytes(self) -> int:
160168
n = len(self)

pandas/tests/extension/test_categorical.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,32 @@ def test_memory_usage(self, data):
8787
# Is this deliberate?
8888
super().test_memory_usage(data)
8989

90+
def test_contains(self, data, data_missing):
91+
# GH-37867
92+
# na value handling in Categorical.__contains__ is deprecated.
93+
# See base.BaseInterfaceTests.test_contains for more details.
94+
95+
na_value = data.dtype.na_value
96+
# ensure data without missing values
97+
data = data[~data.isna()]
98+
99+
# first elements are non-missing
100+
assert data[0] in data
101+
assert data_missing[0] in data_missing
102+
103+
# check the presence of na_value
104+
assert na_value in data_missing
105+
assert na_value not in data
106+
107+
# the data can never contain other nan-likes than na_value
108+
for na_value_type in {None, np.nan, pd.NA, pd.NaT}:
109+
if na_value_type is na_value:
110+
continue
111+
112+
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
113+
assert na_value_type not in data
114+
assert na_value_type in data_missing
115+
90116

91117
class TestConstructors(base.BaseConstructorsTests):
92118
pass

0 commit comments

Comments
 (0)