From 4836a29ceb342f14c856d42fa7a18cb29eaa6d51 Mon Sep 17 00:00:00 2001
From: fjetter <fjetter@users.noreply.github.com>
Date: Sun, 13 May 2018 19:44:15 +0200
Subject: [PATCH 1/5] PERF: implement __contains__ for Categorical

---
 asv_bench/benchmarks/categoricals.py | 44 ++++++++++++++++++++++++++++
 doc/source/whatsnew/v0.24.0.txt      |  2 +-
 pandas/core/arrays/categorical.py    |  7 +++++
 3 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index 5464e7cba22c3..f5b08310d9d04 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -193,3 +193,47 @@ def time_categorical_series_is_monotonic_increasing(self):
 
     def time_categorical_series_is_monotonic_decreasing(self):
         self.s.is_monotonic_decreasing
+
+
+class Contains(object):
+
+    params = (["a", "c", "d", "z", np.nan], [True, False])
+    param_names = ["value", "has_nan"]
+
+    def setup(self, value, has_nan):
+        n = 1 * 10 ** 4
+        obj_values = list("a" * n + "b" * n + "c" * n)
+        if has_nan:
+            obj_values = [np.nan] + obj_values[:-2] + [np.nan]
+
+        self.ci = pd.CategoricalIndex(obj_values, categories=list("abcd"))
+        self.cat = pd.Categorical(obj_values, categories=list("abcd"))
+
+    def time_contains_index(self, value, has_nan):
+        value in self.ci
+
+    def time_cat_isin(self, value, has_nan):
+        value in self.cat
+
+
+class Indexing(object):
+
+    params = (["a", "c"], [True, False])
+    param_names = ["value", "has_nan"]
+
+    def setup(self, value, has_nan):
+        n = 1 * 10 ** 4
+        obj_values = list("a" * n + "b" * n + "c" * n)
+        if has_nan:
+            obj_values = [np.nan] + obj_values[:-2] + [np.nan]
+
+        self.ci = pd.CategoricalIndex(obj_values, categories=list("abcd"))
+        self.cat = pd.Categorical(obj_values, categories=list("abcd"))
+        self.df = pd.DataFrame(dict(A=range(n * 3)), index=self.ci)
+        self.ser = pd.Series(range(n * 3), index=self.ci)
+
+    def time_loc_df(self, value, has_nan):
+        self.df.loc[value]
+
+    def time_loc_ser(self, value, has_nan):
+        self.ser.loc[value]
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index c69de149a0f35..40f9ab5b24506 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -64,7 +64,7 @@ Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
 - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`)
--
+- Improved performance of indexing on a Series/DataFrame with a CategoricalIndex
 
 .. _whatsnew_0240.docs:
 
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 30f9c56d24f02..34dbebd758f4e 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1847,6 +1847,13 @@ def __iter__(self):
         """Returns an Iterator over the values of this Categorical."""
         return iter(self.get_values().tolist())
 
+    def __contains__(self, key):
+        """Returns True if `key` is in this Categorical."""
+        if key in self.categories:
+            return self.categories.get_loc(key) in self.codes
+        else:
+            return False
+
     def _tidy_repr(self, max_vals=10, footer=True):
         """ a short repr displaying only max_vals and an optional (but default
         footer)

From 40f1cac79f820f3a3d2c9f6eb2fc9c65c41061c2 Mon Sep 17 00:00:00 2001
From: fjetter <fjetter@users.noreply.github.com>
Date: Wed, 6 Jun 2018 23:34:51 +0200
Subject: [PATCH 2/5] Add case for nan in categorical.__contains__

---
 pandas/core/arrays/categorical.py     | 2 ++
 pandas/tests/indexes/test_category.py | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 34dbebd758f4e..cc27f5e4b9e75 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1851,6 +1851,8 @@ def __contains__(self, key):
         """Returns True if `key` is in this Categorical."""
         if key in self.categories:
             return self.categories.get_loc(key) in self.codes
+        elif isna(key) and self.isna().any():
+            return True
         else:
             return False
 
diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py
index a2a4170256088..68c6d63b32f62 100644
--- a/pandas/tests/indexes/test_category.py
+++ b/pandas/tests/indexes/test_category.py
@@ -244,6 +244,10 @@ def test_contains(self):
             list('aabbca') + [np.nan], categories=list('cabdef'))
         assert np.nan in ci
 
+        ci = CategoricalIndex(
+            list('aaa'), categories=list('cabdef'))
+        assert 'f' not in ci
+
     def test_min_max(self):
 
         ci = self.create_index(ordered=False)

From 5f8b5e075dc48c184eb310db23481c2c81e67acb Mon Sep 17 00:00:00 2001
From: fjetter <fjetter@users.noreply.github.com>
Date: Fri, 8 Jun 2018 15:43:22 +0200
Subject: [PATCH 3/5] Address review comments: clarify benchmark params, add issue reference to whatsnew

---
 asv_bench/benchmarks/categoricals.py | 16 +++++++++++-----
 doc/source/whatsnew/v0.24.0.txt      |  2 +-
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index f5b08310d9d04..41460eaf47699 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -197,7 +197,14 @@ def time_categorical_series_is_monotonic_decreasing(self):
 
 class Contains(object):
 
-    params = (["a", "c", "d", "z", np.nan], [True, False])
+    params = ([
+        "b",  # in array
+        "d",  # in categories but not in codes
+        "z",  # nowhere
+        np.nan,
+    ],
+        [True, False],
+    )
     param_names = ["value", "has_nan"]
 
     def setup(self, value, has_nan):
@@ -227,10 +234,9 @@ def setup(self, value, has_nan):
         if has_nan:
             obj_values = [np.nan] + obj_values[:-2] + [np.nan]
 
-        self.ci = pd.CategoricalIndex(obj_values, categories=list("abcd"))
-        self.cat = pd.Categorical(obj_values, categories=list("abcd"))
-        self.df = pd.DataFrame(dict(A=range(n * 3)), index=self.ci)
-        self.ser = pd.Series(range(n * 3), index=self.ci)
+        ci = pd.CategoricalIndex(obj_values, categories=list("abcd"))
+        self.df = pd.DataFrame(dict(A=range(n * 3)), index=ci)
+        self.ser = pd.Series(range(n * 3), index=ci)
 
     def time_loc_df(self, value, has_nan):
         self.df.loc[value]
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 40f9ab5b24506..8854cfd5d9e4a 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -64,7 +64,7 @@ Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
 - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`)
-- Improved performance of indexing on a Series/DataFrame with a CategoricalIndex
+- Improved performance of indexing on a Series/DataFrame with a ``CategoricalIndex`` (:issue:`21022`)
 
 .. _whatsnew_0240.docs:
 

From c5387a71482bfe7ccc1e730d24765ea812f617c7 Mon Sep 17 00:00:00 2001
From: fjetter <fjetter@users.noreply.github.com>
Date: Sat, 9 Jun 2018 11:51:10 +0200
Subject: [PATCH 4/5] PERF: __contains__ of Categorical improved

---
 pandas/core/arrays/categorical.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index cc27f5e4b9e75..3791f854ea7c2 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1849,10 +1849,14 @@ def __iter__(self):
 
     def __contains__(self, key):
         """Returns True if `key` is in this Categorical."""
-        if key in self.categories:
-            return self.categories.get_loc(key) in self.codes
-        elif isna(key) and self.isna().any():
-            return True
+        hash(key)
+        if isna(key):
+            return self.isna().any()
+        elif self.categories._defer_to_indexing:  # e.g. Interval values
+            loc = self.categories.get_loc(key)
+            return np.isin(self.codes, loc).any()
+        elif key in self.categories:
+            return self.categories.get_loc(key) in self._codes
         else:
             return False
 

From a3550e8e87935d3e215ed9a3a4d82d3aedb3b72b Mon Sep 17 00:00:00 2001
From: fjetter <fjetter@users.noreply.github.com>
Date: Sat, 9 Jun 2018 11:51:40 +0200
Subject: [PATCH 5/5] BUG: Fix membership check for interval categoricals

---
 doc/source/whatsnew/v0.24.0.txt       |  2 +-
 pandas/core/indexes/category.py       | 10 ----------
 pandas/tests/indexes/test_category.py |  7 +++++++
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 8854cfd5d9e4a..4158edf683c89 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -83,7 +83,7 @@ Bug Fixes
 Categorical
 ^^^^^^^^^^^
 
--
+- Fixed an issue where membership checks on a ``CategoricalIndex`` with interval values could return false positives (:issue:`21022`)
 -
 -
 
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 150eca32e229d..8722170ac41d4 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -323,20 +323,10 @@ def _reverse_indexer(self):
 
     @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs)
     def __contains__(self, key):
-        hash(key)
-
-        if self.categories._defer_to_indexing:
-            return key in self.categories
-
         return key in self.values
 
     @Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
     def contains(self, key):
-        hash(key)
-
-        if self.categories._defer_to_indexing:
-            return self.categories.contains(key)
-
         return key in self.values
 
     def __array__(self, dtype=None):
diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py
index 68c6d63b32f62..88d76210da8ba 100644
--- a/pandas/tests/indexes/test_category.py
+++ b/pandas/tests/indexes/test_category.py
@@ -248,6 +248,13 @@ def test_contains(self):
             list('aaa'), categories=list('cabdef'))
         assert 'f' not in ci
 
+    def test_containst_defer_to_indexing(self):
+        intervals = pd.interval_range(1, 4)
+        cat = pd.CategoricalIndex(list(intervals[:-1]), categories=intervals)
+        assert intervals[0] in cat
+        assert intervals[1] in cat
+        assert intervals[2] not in cat
+
     def test_min_max(self):
 
         ci = self.create_index(ordered=False)